git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to 0d63ed13ea3d closures: Fix race in closure_sync()
author    Kent Overstreet <kent.overstreet@linux.dev>
          Wed, 25 Oct 2023 06:09:44 +0000 (02:09 -0400)
committer Kent Overstreet <kent.overstreet@linux.dev>
          Wed, 25 Oct 2023 17:59:16 +0000 (13:59 -0400)
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
52 files changed:
.bcachefs_revision
include/linux/atomic.h
include/linux/closure.h
include/linux/sched.h
libbcachefs/bbpos.h
libbcachefs/bbpos_types.h [new file with mode: 0644]
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bkey_methods.h
libbcachefs/btree_trans_commit.c
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/chardev.c
libbcachefs/compress.c
libbcachefs/compress.h
libbcachefs/data_update.c
libbcachefs/data_update.h
libbcachefs/disk_groups.c
libbcachefs/disk_groups.h
libbcachefs/disk_groups_types.h [new file with mode: 0644]
libbcachefs/errcode.h
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/fs-io-direct.c
libbcachefs/fsck.c
libbcachefs/fsck.h
libbcachefs/inode.c
libbcachefs/inode.h
libbcachefs/io_misc.c
libbcachefs/io_write.c
libbcachefs/journal.c
libbcachefs/journal.h
libbcachefs/move.c
libbcachefs/move.h
libbcachefs/move_types.h
libbcachefs/movinggc.c
libbcachefs/opts.c
libbcachefs/opts.h
libbcachefs/printbuf.c
libbcachefs/rebalance.c
libbcachefs/rebalance.h
libbcachefs/rebalance_types.h
libbcachefs/recovery.c
libbcachefs/recovery_types.h
libbcachefs/reflink.c
libbcachefs/super.c
libbcachefs/super_types.h
libbcachefs/sysfs.c
libbcachefs/trace.c
libbcachefs/trace.h
libbcachefs/xattr.c
linux/closure.c

index da47120b31695ae83b950dabaacae3ee9f26c230..db0b3f7ab23057b1ace93292efe83019a48906c7 100644 (file)
@@ -1 +1 @@
-f70a3402188ea797a38fa9f5b729fb6fbe5f5b83
+0d63ed13ea3d867055ae5752e2e0514a227d1dcb
index f4d047c1505e207049956ba63511b8a7be0681f0..f1464cf3e0c371721745f78de13b2f03496eecc0 100644 (file)
@@ -47,6 +47,7 @@ typedef struct {
 #define smp_rmb()                      cmm_smp_rmb()
 #define smp_mb()                       cmm_smp_mb()
 #define smp_read_barrier_depends()     cmm_smp_read_barrier_depends()
+#define smp_acquire__after_ctrl_dep()  cmm_smp_mb()
 
 #else /* C11_ATOMICS */
 
@@ -205,6 +206,11 @@ static inline i_type a_type##_dec_return(a_type##_t *v)                    \
        return __ATOMIC_DEC_RETURN(&v->counter);                        \
 }                                                                      \
                                                                        \
+static inline i_type a_type##_dec_return_release(a_type##_t *v)                \
+{                                                                      \
+       return __ATOMIC_SUB_RETURN_RELEASE(1, &v->counter);             \
+}                                                                      \
+                                                                       \
 static inline void a_type##_inc(a_type##_t *v)                         \
 {                                                                      \
        __ATOMIC_INC(&v->counter);                                      \
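
These userspace shims back the closure change below: a release-ordered
decrement for the putting side, and an acquire barrier a waiter can issue
after a control dependency. A minimal C11 sketch of the pattern they map
onto (illustrative only; the real call sites live in linux/closure.c):

    #include <stdatomic.h>

    static atomic_int remaining;

    static void put_side(void)
    {
            /* release: prior writes are visible before the count drops */
            atomic_fetch_sub_explicit(&remaining, 1, memory_order_release);
    }

    static void wait_side(void)
    {
            while (atomic_load_explicit(&remaining, memory_order_relaxed) > 1)
                    ;       /* spin */
            /* upgrade the control dependency above to an acquire */
            atomic_thread_fence(memory_order_acquire);
    }
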
index 722a586bb22444418d31eb80d9d25e89de5d72a2..de7bb47d8a46ace38d95a81ed6df231d91ac725b 100644 (file)
@@ -154,6 +154,7 @@ struct closure {
        struct closure          *parent;
 
        atomic_t                remaining;
+       bool                    closure_get_happened;
 
 #ifdef CONFIG_DEBUG_CLOSURES
 #define CLOSURE_MAGIC_DEAD     0xc054dead
@@ -185,7 +186,11 @@ static inline unsigned closure_nr_remaining(struct closure *cl)
  */
 static inline void closure_sync(struct closure *cl)
 {
-       if (closure_nr_remaining(cl) != 1)
+#ifdef CONFIG_DEBUG_CLOSURES
+       BUG_ON(closure_nr_remaining(cl) != 1 && !cl->closure_get_happened);
+#endif
+
+       if (cl->closure_get_happened)
                __closure_sync(cl);
 }
 
@@ -233,8 +238,6 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
        closure_set_ip(cl);
        cl->fn = fn;
        cl->wq = wq;
-       /* between atomic_dec() in closure_put() */
-       smp_mb__before_atomic();
 }
 
 static inline void closure_queue(struct closure *cl)
@@ -259,6 +262,8 @@ static inline void closure_queue(struct closure *cl)
  */
 static inline void closure_get(struct closure *cl)
 {
+       cl->closure_get_happened = true;
+
 #ifdef CONFIG_DEBUG_CLOSURES
        BUG_ON((atomic_inc_return(&cl->remaining) &
                CLOSURE_REMAINING_MASK) <= 1);
@@ -281,6 +286,7 @@ static inline void closure_init(struct closure *cl, struct closure *parent)
                closure_get(parent);
 
        atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
+       cl->closure_get_happened = false;
 
        closure_debug_create(cl);
        closure_set_ip(cl);
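
As the commit title says, this is the closure_sync() race fix: whether to
wait is now recorded explicitly in closure_get_happened rather than
inferred from a read of remaining that could race with a concurrent
closure_put(). A hedged usage sketch (submit_work() is a placeholder for
anything that takes a reference with closure_get() and later drops it):

    struct closure cl;

    closure_init_stack(&cl);
    submit_work(&cl);       /* may closure_get(&cl) / closure_put(&cl) */
    closure_sync(&cl);      /* waits only if closure_get_happened is set */
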
index 825eea5c31c59a7a7c1d1f4b5d1665e1dca449cb..7afb6d54bb34101e3b78a9d02ce1196ffa1fbaf6 100644 (file)
@@ -151,6 +151,14 @@ static inline u64 ktime_get_seconds(void)
        return ts.tv_sec;
 }
 
+static inline u64 ktime_get_real_ns(void)
+{
+       struct timespec ts;
+
+       clock_gettime(CLOCK_REALTIME, &ts);
+       return timespec_to_ns(&ts);
+}
+
 static inline u64 ktime_get_real_seconds(void)
 {
        struct timespec ts;
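
ktime_get_real_ns() follows the existing shims in this header;
timespec_to_ns() is assumed to be the usual conversion, along the lines of:

    #define NSEC_PER_SEC    1000000000ULL

    static inline u64 timespec_to_ns(const struct timespec *ts)
    {
            return (u64) ts->tv_sec * NSEC_PER_SEC + ts->tv_nsec;
    }
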
index 0038bc28ba8c01b45b8962571ed37f107c1447b0..be2edced52133e6592092d5d8e20c643cd8b372a 100644 (file)
@@ -2,22 +2,10 @@
 #ifndef _BCACHEFS_BBPOS_H
 #define _BCACHEFS_BBPOS_H
 
+#include "bbpos_types.h"
 #include "bkey_methods.h"
 #include "btree_cache.h"
 
-struct bbpos {
-       enum btree_id           btree;
-       struct bpos             pos;
-};
-
-static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
-{
-       return (struct bbpos) { btree, pos };
-}
-
-#define BBPOS_MIN      BBPOS(0, POS_MIN)
-#define BBPOS_MAX      BBPOS(BTREE_ID_NR - 1, POS_MAX)
-
 static inline int bbpos_cmp(struct bbpos l, struct bbpos r)
 {
        return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos);
diff --git a/libbcachefs/bbpos_types.h b/libbcachefs/bbpos_types.h
new file mode 100644 (file)
index 0000000..5198e94
--- /dev/null
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BBPOS_TYPES_H
+#define _BCACHEFS_BBPOS_TYPES_H
+
+struct bbpos {
+       enum btree_id           btree;
+       struct bpos             pos;
+};
+
+static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
+{
+       return (struct bbpos) { btree, pos };
+}
+
+#define BBPOS_MIN      BBPOS(0, POS_MIN)
+#define BBPOS_MAX      BBPOS(BTREE_ID_NR - 1, POS_MAX)
+
+#endif /* _BCACHEFS_BBPOS_TYPES_H */
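
A bbpos orders first by btree, then by position within it, which is what
lets a single cursor range across every btree. Illustrative use with
bbpos_cmp() from bbpos.h:

    struct bbpos cur = BBPOS(BTREE_ID_extents, POS_MIN);

    BUG_ON(bbpos_cmp(BBPOS_MIN, cur) > 0);  /* BBPOS_MIN sorts first */
    BUG_ON(bbpos_cmp(cur, BBPOS_MAX) > 0);  /* BBPOS_MAX sorts last  */
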
index 9863571feebf38503f655218c855a06c1efb4a62..68f0ff03c28a603e104d625640703d7a69f33fcd 100644 (file)
@@ -418,6 +418,7 @@ enum bch_time_stats {
 #include "buckets_types.h"
 #include "buckets_waiting_for_journal_types.h"
 #include "clock_types.h"
+#include "disk_groups_types.h"
 #include "ec_types.h"
 #include "journal_types.h"
 #include "keylist_types.h"
@@ -463,6 +464,7 @@ enum gc_phase {
        GC_PHASE_BTREE_snapshot_trees,
        GC_PHASE_BTREE_deleted_inodes,
        GC_PHASE_BTREE_logged_ops,
+       GC_PHASE_BTREE_rebalance_work,
 
        GC_PHASE_PENDING_DELETE,
 };
@@ -938,9 +940,6 @@ struct bch_fs {
        struct list_head        moving_context_list;
        struct mutex            moving_context_lock;
 
-       struct list_head        data_progress_list;
-       struct mutex            data_progress_lock;
-
        /* REBALANCE */
        struct bch_fs_rebalance rebalance;
 
index 99749f3315fec5fafb0b6d051ca01f0b7384834a..e04999c578920f63bec8b5baec4ec4efffbd1c72 100644 (file)
@@ -613,31 +613,17 @@ struct bch_extent_stripe_ptr {
 #endif
 };
 
-struct bch_extent_reservation {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
-       __u64                   type:6,
-                               unused:22,
-                               replicas:4,
-                               generation:32;
-#elif defined (__BIG_ENDIAN_BITFIELD)
-       __u64                   generation:32,
-                               replicas:4,
-                               unused:22,
-                               type:6;
-#endif
-};
-
 struct bch_extent_rebalance {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
-       __u64                   type:7,
-                               unused:33,
-                               compression:8,
+       __u64                   type:6,
+                               unused:34,
+                               compression:8, /* enum bch_compression_opt */
                                target:16;
 #elif defined (__BIG_ENDIAN_BITFIELD)
        __u64                   target:16,
                                compression:8,
-                               unused:33,
-                               type:7;
+                               unused:34,
+                               type:6;
 #endif
 };
 
@@ -1682,7 +1668,9 @@ struct bch_sb_field_journal_seq_blacklist {
        x(snapshot_skiplists,           BCH_VERSION(1,  1),             \
          BIT_ULL(BCH_RECOVERY_PASS_check_snapshots))                   \
        x(deleted_inodes,               BCH_VERSION(1,  2),             \
-         BIT_ULL(BCH_RECOVERY_PASS_check_inodes))
+         BIT_ULL(BCH_RECOVERY_PASS_check_inodes))                      \
+       x(rebalance_work,               BCH_VERSION(1,  3),             \
+         BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))
 
 enum bcachefs_metadata_version {
        bcachefs_metadata_version_min = 9,
@@ -1693,7 +1681,7 @@ enum bcachefs_metadata_version {
 };
 
 static const __maybe_unused
-unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor;
+unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_rebalance_work;
 
 #define bcachefs_metadata_version_current      (bcachefs_metadata_version_max - 1)
 
@@ -2306,7 +2294,9 @@ enum btree_id_flags {
          BIT_ULL(KEY_TYPE_set))                                                \
        x(logged_ops,           17,     0,                                      \
          BIT_ULL(KEY_TYPE_logged_op_truncate)|                                 \
-         BIT_ULL(KEY_TYPE_logged_op_finsert))
+         BIT_ULL(KEY_TYPE_logged_op_finsert))                                  \
+       x(rebalance_work,       18,     BTREE_ID_SNAPSHOTS,                     \
+         BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))
 
 enum btree_id {
 #define x(name, nr, ...) BTREE_ID_##name = nr,
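
The new rebalance_work btree takes id 18 and is snapshots-aware, holding
only set and cookie keys. Expanding its x() entry through the #define
inside enum btree_id gives, in effect:

    enum btree_id {
            /* ... ids 0-17 unchanged ... */
            BTREE_ID_rebalance_work = 18,
    };
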
index 668f595e2fcfeadf29547bd33d46448bad6dc7e3..c829c8e381a7edc550641d04a78149cd848e04c3 100644 (file)
@@ -119,16 +119,6 @@ enum btree_update_flags {
 #define BTREE_TRIGGER_BUCKET_INVALIDATE        (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
 #define BTREE_TRIGGER_NOATOMIC         (1U << __BTREE_TRIGGER_NOATOMIC)
 
-#define BTREE_TRIGGER_WANTS_OLD_AND_NEW                \
-       ((1U << KEY_TYPE_alloc)|                \
-        (1U << KEY_TYPE_alloc_v2)|             \
-        (1U << KEY_TYPE_alloc_v3)|             \
-        (1U << KEY_TYPE_alloc_v4)|             \
-        (1U << KEY_TYPE_stripe)|               \
-        (1U << KEY_TYPE_inode)|                \
-        (1U << KEY_TYPE_inode_v2)|             \
-        (1U << KEY_TYPE_snapshot))
-
 static inline int bch2_trans_mark_key(struct btree_trans *trans,
                                      enum btree_id btree_id, unsigned level,
                                      struct bkey_s_c old, struct bkey_i *new,
index 1000b456d232d14f7d603bc74c7651b4d2810647..53ddcaf042a20b255f6e73bb5a72d42d77c03e7a 100644 (file)
@@ -382,8 +382,7 @@ static int run_one_mem_trigger(struct btree_trans *trans,
        if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)))
                return 0;
 
-       if (old_ops->atomic_trigger == new_ops->atomic_trigger &&
-           ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
+       if (old_ops->atomic_trigger == new_ops->atomic_trigger) {
                ret   = bch2_mark_key(trans, i->btree_id, i->level,
                                old, bkey_i_to_s_c(new),
                                BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
@@ -425,8 +424,7 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
 
        if (!i->insert_trigger_run &&
            !i->overwrite_trigger_run &&
-           old_ops->trans_trigger == new_ops->trans_trigger &&
-           ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) {
+           old_ops->trans_trigger == new_ops->trans_trigger) {
                i->overwrite_trigger_run = true;
                i->insert_trigger_run = true;
                return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k,
index a1a4b5feadaa2cf36fc3b0657241efc464b6faab..a8af803e7289fec94dfcc03929b1fcc1c94d5b98 100644 (file)
@@ -935,14 +935,12 @@ static int bch2_mark_stripe_ptr(struct btree_trans *trans,
        return 0;
 }
 
-int bch2_mark_extent(struct btree_trans *trans,
-                    enum btree_id btree_id, unsigned level,
-                    struct bkey_s_c old, struct bkey_s_c new,
-                    unsigned flags)
+static int __mark_extent(struct btree_trans *trans,
+                        enum btree_id btree_id, unsigned level,
+                        struct bkey_s_c k, unsigned flags)
 {
        u64 journal_seq = trans->journal_res.seq;
        struct bch_fs *c = trans->c;
-       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
@@ -1018,6 +1016,14 @@ int bch2_mark_extent(struct btree_trans *trans,
        return 0;
 }
 
+int bch2_mark_extent(struct btree_trans *trans,
+                    enum btree_id btree_id, unsigned level,
+                    struct bkey_s_c old, struct bkey_s_c new,
+                    unsigned flags)
+{
+       return mem_trigger_run_overwrite_then_insert(__mark_extent, trans, btree_id, level, old, new, flags);
+}
+
 int bch2_mark_stripe(struct btree_trans *trans,
                     enum btree_id btree_id, unsigned level,
                     struct bkey_s_c old, struct bkey_s_c new,
@@ -1124,13 +1130,11 @@ int bch2_mark_stripe(struct btree_trans *trans,
        return 0;
 }
 
-int bch2_mark_reservation(struct btree_trans *trans,
-                         enum btree_id btree_id, unsigned level,
-                         struct bkey_s_c old, struct bkey_s_c new,
-                         unsigned flags)
+static int __mark_reservation(struct btree_trans *trans,
+                             enum btree_id btree_id, unsigned level,
+                             struct bkey_s_c k, unsigned flags)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
        struct bch_fs_usage *fs_usage;
        unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
        s64 sectors = (s64) k.k->size;
@@ -1157,6 +1161,14 @@ int bch2_mark_reservation(struct btree_trans *trans,
        return 0;
 }
 
+int bch2_mark_reservation(struct btree_trans *trans,
+                         enum btree_id btree_id, unsigned level,
+                         struct bkey_s_c old, struct bkey_s_c new,
+                         unsigned flags)
+{
+       return mem_trigger_run_overwrite_then_insert(__mark_reservation, trans, btree_id, level, old, new, flags);
+}
+
 static s64 __bch2_mark_reflink_p(struct btree_trans *trans,
                                 struct bkey_s_c_reflink_p p,
                                 u64 start, u64 end,
@@ -1211,13 +1223,11 @@ fsck_err:
        return ret;
 }
 
-int bch2_mark_reflink_p(struct btree_trans *trans,
-                       enum btree_id btree_id, unsigned level,
-                       struct bkey_s_c old, struct bkey_s_c new,
-                       unsigned flags)
+static int __mark_reflink_p(struct btree_trans *trans,
+                           enum btree_id btree_id, unsigned level,
+                           struct bkey_s_c k, unsigned flags)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new;
        struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
        struct reflink_gc *ref;
        size_t l, r, m;
@@ -1251,6 +1261,14 @@ int bch2_mark_reflink_p(struct btree_trans *trans,
        return ret;
 }
 
+int bch2_mark_reflink_p(struct btree_trans *trans,
+                       enum btree_id btree_id, unsigned level,
+                       struct bkey_s_c old, struct bkey_s_c new,
+                       unsigned flags)
+{
+       return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags);
+}
+
 void bch2_trans_fs_usage_revert(struct btree_trans *trans,
                                struct replicas_delta_list *deltas)
 {
@@ -1452,15 +1470,11 @@ err:
        return ret;
 }
 
-int bch2_trans_mark_extent(struct btree_trans *trans,
-                          enum btree_id btree_id, unsigned level,
-                          struct bkey_s_c old, struct bkey_i *new,
-                          unsigned flags)
+static int __trans_mark_extent(struct btree_trans *trans,
+                              enum btree_id btree_id, unsigned level,
+                              struct bkey_s_c k, unsigned flags)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
-               ? old
-               : bkey_i_to_s_c(new);
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
@@ -1517,6 +1531,24 @@ int bch2_trans_mark_extent(struct btree_trans *trans,
        return ret;
 }
 
+int bch2_trans_mark_extent(struct btree_trans *trans,
+                          enum btree_id btree_id, unsigned level,
+                          struct bkey_s_c old, struct bkey_i *new,
+                          unsigned flags)
+{
+       struct bch_fs *c = trans->c;
+       int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) -
+                 (int) bch2_bkey_needs_rebalance(c, old);
+
+       if (mod) {
+               int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new->k.p, mod > 0);
+               if (ret)
+                       return ret;
+       }
+
+       return trigger_run_overwrite_then_insert(__trans_mark_extent, trans, btree_id, level, old, new, flags);
+}
+
 static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
                                         struct bkey_s_c_stripe s,
                                         unsigned idx, bool deleting)
@@ -1670,15 +1702,10 @@ int bch2_trans_mark_stripe(struct btree_trans *trans,
        return ret;
 }
 
-int bch2_trans_mark_reservation(struct btree_trans *trans,
-                               enum btree_id btree_id, unsigned level,
-                               struct bkey_s_c old,
-                               struct bkey_i *new,
-                               unsigned flags)
+static int __trans_mark_reservation(struct btree_trans *trans,
+                                   enum btree_id btree_id, unsigned level,
+                                   struct bkey_s_c k, unsigned flags)
 {
-       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
-               ? old
-               : bkey_i_to_s_c(new);
        unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
        s64 sectors = (s64) k.k->size;
        struct replicas_delta_list *d;
@@ -1700,7 +1727,16 @@ int bch2_trans_mark_reservation(struct btree_trans *trans,
        return 0;
 }
 
-static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
+int bch2_trans_mark_reservation(struct btree_trans *trans,
+                               enum btree_id btree_id, unsigned level,
+                               struct bkey_s_c old,
+                               struct bkey_i *new,
+                               unsigned flags)
+{
+       return trigger_run_overwrite_then_insert(__trans_mark_reservation, trans, btree_id, level, old, new, flags);
+}
+
+static int trans_mark_reflink_p_segment(struct btree_trans *trans,
                        struct bkey_s_c_reflink_p p,
                        u64 *idx, unsigned flags)
 {
@@ -1767,35 +1803,38 @@ err:
        return ret;
 }
 
-int bch2_trans_mark_reflink_p(struct btree_trans *trans,
-                             enum btree_id btree_id, unsigned level,
-                             struct bkey_s_c old,
-                             struct bkey_i *new,
-                             unsigned flags)
+static int __trans_mark_reflink_p(struct btree_trans *trans,
+                               enum btree_id btree_id, unsigned level,
+                               struct bkey_s_c k, unsigned flags)
 {
-       struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE
-               ? old
-               : bkey_i_to_s_c(new);
        struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
        u64 idx, end_idx;
        int ret = 0;
 
-       if (flags & BTREE_TRIGGER_INSERT) {
-               struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
-
-               v->front_pad = v->back_pad = 0;
-       }
-
        idx     = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
        end_idx = le64_to_cpu(p.v->idx) + p.k->size +
                le32_to_cpu(p.v->back_pad);
 
        while (idx < end_idx && !ret)
-               ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags);
-
+               ret = trans_mark_reflink_p_segment(trans, p, &idx, flags);
        return ret;
 }
 
+int bch2_trans_mark_reflink_p(struct btree_trans *trans,
+                             enum btree_id btree_id, unsigned level,
+                             struct bkey_s_c old,
+                             struct bkey_i *new,
+                             unsigned flags)
+{
+       if (flags & BTREE_TRIGGER_INSERT) {
+               struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v;
+
+               v->front_pad = v->back_pad = 0;
+       }
+
+       return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags);
+}
+
 static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
                                    struct bch_dev *ca, size_t b,
                                    enum bch_data_type type,
@@ -1825,16 +1864,16 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
                        bch2_data_types[type],
                        bch2_data_types[type]);
                ret = -EIO;
-               goto out;
+               goto err;
        }
 
-       a->v.data_type          = type;
-       a->v.dirty_sectors      = sectors;
-
-       ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
-       if (ret)
-               goto out;
-out:
+       if (a->v.data_type      != type ||
+           a->v.dirty_sectors  != sectors) {
+               a->v.data_type          = type;
+               a->v.dirty_sectors      = sectors;
+               ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+       }
+err:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
 }
@@ -1929,6 +1968,22 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
        return ret;
 }
 
+int bch2_trans_mark_dev_sbs(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       unsigned i;
+
+       for_each_online_member(ca, c, i) {
+               int ret = bch2_trans_mark_dev_sb(c, ca);
+               if (ret) {
+                       percpu_ref_put(&ca->ref);
+                       return ret;
+               }
+       }
+
+       return 0;
+}
+
 /* Disk reservations: */
 
 #define SECTORS_CACHE  1024
index bf8d7f407e9cf372c2a5cf4b636e7bddf906e913..21f6cb356921f1e3b1f9df59fbdae7309f3931fa 100644 (file)
@@ -339,12 +339,27 @@ int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct
 int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
 int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
 
+#define mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
+({                                                                                             \
+       int ret = 0;                                                                            \
+                                                                                               \
+       if (_old.k->type)                                                                       \
+               ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT);     \
+       if (!ret && _new.k->type)                                                               \
+               ret = _fn(_trans, _btree_id, _level, _new, _flags & ~BTREE_TRIGGER_OVERWRITE);  \
+       ret;                                                                                    \
+})
+
+#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)  \
+       mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, bkey_i_to_s_c(_new), _flags)
+
 void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
 int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
 
 int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *,
                                    size_t, enum bch_data_type, unsigned);
 int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *);
+int bch2_trans_mark_dev_sbs(struct bch_fs *);
 
 static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
 {
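
These two macros capture the refactoring that runs through this commit:
per-key-type triggers now take a single key, and the old/new pair is
handled generically by invoking the trigger once for the overwritten key
(with BTREE_TRIGGER_INSERT masked off) and once for the inserted key
(with BTREE_TRIGGER_OVERWRITE masked off). Expanded by hand, the new
bch2_mark_extent() wrapper is equivalent to:

    int bch2_mark_extent(struct btree_trans *trans,
                         enum btree_id btree_id, unsigned level,
                         struct bkey_s_c old, struct bkey_s_c new,
                         unsigned flags)
    {
            int ret = 0;

            if (old.k->type)
                    ret = __mark_extent(trans, btree_id, level, old,
                                        flags & ~BTREE_TRIGGER_INSERT);
            if (!ret && new.k->type)
                    ret = __mark_extent(trans, btree_id, level, new,
                                        flags & ~BTREE_TRIGGER_OVERWRITE);
            return ret;
    }
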
index f69e15dc699c9b6b22c07c8a1ce709bf478e57d8..4bb88aefed121f275582df94e3cea9dcdec7c58c 100644 (file)
@@ -332,8 +332,8 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
        struct bch_ioctl_data_event e = {
                .type                   = BCH_DATA_EVENT_PROGRESS,
                .p.data_type            = ctx->stats.data_type,
-               .p.btree_id             = ctx->stats.btree_id,
-               .p.pos                  = ctx->stats.pos,
+               .p.btree_id             = ctx->stats.pos.btree,
+               .p.pos                  = ctx->stats.pos.pos,
                .p.sectors_done         = atomic64_read(&ctx->stats.sectors_seen),
                .p.sectors_total        = bch2_fs_usage_read_short(c).used,
        };
index 1480b64547b0c961d7a62f06071e8bffcf7e35a8..a8b148ec2a2b6b8ed1f33d10ad195b72afa112e0 100644 (file)
@@ -697,14 +697,32 @@ err:
        return ret;
 }
 
+void bch2_compression_opt_to_text(struct printbuf *out, u64 v)
+{
+       struct bch_compression_opt opt = bch2_compression_decode(v);
+
+       if (opt.type < BCH_COMPRESSION_OPT_NR)
+               prt_str(out, bch2_compression_opts[opt.type]);
+       else
+               prt_printf(out, "(unknown compression opt %u)", opt.type);
+       if (opt.level)
+               prt_printf(out, ":%u", opt.level);
+}
+
 void bch2_opt_compression_to_text(struct printbuf *out,
                                  struct bch_fs *c,
                                  struct bch_sb *sb,
                                  u64 v)
 {
-       struct bch_compression_opt opt = bch2_compression_decode(v);
+       return bch2_compression_opt_to_text(out, v);
+}
 
-       prt_str(out, bch2_compression_opts[opt.type]);
-       if (opt.level)
-               prt_printf(out, ":%u", opt.level);
+int bch2_opt_compression_validate(u64 v, struct printbuf *err)
+{
+       if (!bch2_compression_opt_valid(v)) {
+               prt_printf(err, "invalid compression opt %llu", v);
+               return -BCH_ERR_invalid_sb_opt_compression;
+       }
+
+       return 0;
 }
index 052ea303241fc31407edde0bcc2d3037d7691137..607fd5e232c902dbb39f3dac84ea2e214e6b106c 100644 (file)
@@ -4,12 +4,18 @@
 
 #include "extents_types.h"
 
+static const unsigned __bch2_compression_opt_to_type[] = {
+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
+       BCH_COMPRESSION_OPTS()
+#undef x
+};
+
 struct bch_compression_opt {
        u8              type:4,
                        level:4;
 };
 
-static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
+static inline struct bch_compression_opt __bch2_compression_decode(unsigned v)
 {
        return (struct bch_compression_opt) {
                .type   = v & 15,
@@ -17,17 +23,25 @@ static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
        };
 }
 
+static inline bool bch2_compression_opt_valid(unsigned v)
+{
+       struct bch_compression_opt opt = __bch2_compression_decode(v);
+
+       return opt.type < ARRAY_SIZE(__bch2_compression_opt_to_type) && !(!opt.type && opt.level);
+}
+
+static inline struct bch_compression_opt bch2_compression_decode(unsigned v)
+{
+       return bch2_compression_opt_valid(v)
+               ? __bch2_compression_decode(v)
+               : (struct bch_compression_opt) { 0 };
+}
+
 static inline unsigned bch2_compression_encode(struct bch_compression_opt opt)
 {
        return opt.type|(opt.level << 4);
 }
 
-static const unsigned __bch2_compression_opt_to_type[] = {
-#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t,
-       BCH_COMPRESSION_OPTS()
-#undef x
-};
-
 static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
 {
        return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
@@ -44,12 +58,16 @@ int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
 void bch2_fs_compress_exit(struct bch_fs *);
 int bch2_fs_compress_init(struct bch_fs *);
 
+void bch2_compression_opt_to_text(struct printbuf *, u64);
+
 int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
 void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
+int bch2_opt_compression_validate(u64, struct printbuf *);
 
 #define bch2_opt_compression (struct bch_opt_fn) {             \
-       .parse          = bch2_opt_compression_parse,   \
-       .to_text        = bch2_opt_compression_to_text, \
+       .parse          = bch2_opt_compression_parse,           \
+       .to_text        = bch2_opt_compression_to_text,         \
+       .validate       = bch2_opt_compression_validate,        \
 }
 
 #endif /* _BCACHEFS_COMPRESS_H */
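
A compression option packs into eight bits, type in the low nibble and
level in the high nibble, so an option such as zstd at level 7
round-trips like this:

    unsigned v = bch2_compression_encode((struct bch_compression_opt) {
            .type   = BCH_COMPRESSION_OPT_zstd,     /* low 4 bits  */
            .level  = 7,                            /* high 4 bits */
    });

    struct bch_compression_opt opt = bch2_compression_decode(v);
    /* opt.type == BCH_COMPRESSION_OPT_zstd, opt.level == 7 */
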
index 899ff46de8e062aa4213d4815b8b20789d54bae3..d116f2f03db24a8949ac9bc728cfea2b80e9ce30 100644 (file)
@@ -13,6 +13,7 @@
 #include "keylist.h"
 #include "move.h"
 #include "nocow_locking.h"
+#include "rebalance.h"
 #include "subvolume.h"
 #include "trace.h"
 
@@ -251,11 +252,11 @@ restart_drop_extra_replicas:
                ret =   bch2_insert_snapshot_whiteouts(trans, m->btree_id,
                                                k.k->p, bkey_start_pos(&insert->k)) ?:
                        bch2_insert_snapshot_whiteouts(trans, m->btree_id,
-                                               k.k->p, insert->k.p);
-               if (ret)
-                       goto err;
-
-               ret   = bch2_trans_update(trans, &iter, insert,
+                                               k.k->p, insert->k.p) ?:
+                       bch2_bkey_set_needs_rebalance(c, insert,
+                                                     op->opts.background_target,
+                                                     op->opts.background_compression) ?:
+                       bch2_trans_update(trans, &iter, insert,
                                BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
                        bch2_trans_commit(trans, &op->res,
                                NULL,
@@ -281,11 +282,11 @@ next:
                }
                continue;
 nowork:
-               if (m->ctxt && m->ctxt->stats) {
+               if (m->stats) {
                        BUG_ON(k.k->p.offset <= iter.pos.offset);
-                       atomic64_inc(&m->ctxt->stats->keys_raced);
+                       atomic64_inc(&m->stats->keys_raced);
                        atomic64_add(k.k->p.offset - iter.pos.offset,
-                                    &m->ctxt->stats->sectors_raced);
+                                    &m->stats->sectors_raced);
                }
 
                this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]);
@@ -439,6 +440,8 @@ int bch2_data_update_init(struct btree_trans *trans,
        bch2_bkey_buf_reassemble(&m->k, c, k);
        m->btree_id     = btree_id;
        m->data_opts    = data_opts;
+       m->ctxt         = ctxt;
+       m->stats        = ctxt ? ctxt->stats : NULL;
 
        bch2_write_op_init(&m->op, c, io_opts);
        m->op.pos       = bkey_start_pos(k.k);
@@ -487,7 +490,7 @@ int bch2_data_update_init(struct btree_trans *trans,
 
                if (c->opts.nocow_enabled) {
                        if (ctxt) {
-                               move_ctxt_wait_event(ctxt, trans,
+                               move_ctxt_wait_event(ctxt,
                                                (locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
                                                                          PTR_BUCKET_POS(c, &p.ptr), 0)) ||
                                                !atomic_read(&ctxt->read_sectors));
index 7ca1f98d7e9462d3563f3149f96d288300d54388..9dc17b9d83795181798deb5af39401d4d6248581 100644 (file)
@@ -23,6 +23,7 @@ struct data_update {
        struct bkey_buf         k;
        struct data_update_opts data_opts;
        struct moving_context   *ctxt;
+       struct bch_move_stats   *stats;
        struct bch_write_op     op;
 };
 
index e00133b6ea517be6a6544973532e0c299e7371b5..d613695abf9f67c2e9f2ab4ce91d863bdfd743c7 100644 (file)
@@ -175,6 +175,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
 
                dst->deleted    = BCH_GROUP_DELETED(src);
                dst->parent     = BCH_GROUP_PARENT(src);
+               memcpy(dst->label, src->label, sizeof(dst->label));
        }
 
        for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
@@ -382,7 +383,57 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
        return v;
 }
 
-void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v)
+void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
+{
+       struct bch_disk_groups_cpu *groups;
+       struct bch_disk_group_cpu *g;
+       unsigned nr = 0;
+       u16 path[32];
+
+       out->atomic++;
+       rcu_read_lock();
+       groups = rcu_dereference(c->disk_groups);
+       if (!groups)
+               goto invalid;
+
+       while (1) {
+               if (nr == ARRAY_SIZE(path))
+                       goto invalid;
+
+               if (v >= groups->nr)
+                       goto invalid;
+
+               g = groups->entries + v;
+
+               if (g->deleted)
+                       goto invalid;
+
+               path[nr++] = v;
+
+               if (!g->parent)
+                       break;
+
+               v = g->parent - 1;
+       }
+
+       while (nr) {
+               v = path[--nr];
+               g = groups->entries + v;
+
+               prt_printf(out, "%.*s", (int) sizeof(g->label), g->label);
+               if (nr)
+                       prt_printf(out, ".");
+       }
+out:
+       rcu_read_unlock();
+       out->atomic--;
+       return;
+invalid:
+       prt_printf(out, "invalid label %u", v);
+       goto out;
+}
+
+void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
 {
        struct bch_sb_field_disk_groups *groups =
                bch2_sb_field_get(sb, disk_groups);
@@ -493,10 +544,7 @@ int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res,
        return -EINVAL;
 }
 
-void bch2_opt_target_to_text(struct printbuf *out,
-                            struct bch_fs *c,
-                            struct bch_sb *sb,
-                            u64 v)
+void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
 {
        struct target t = target_decode(v);
 
@@ -504,47 +552,69 @@ void bch2_opt_target_to_text(struct printbuf *out,
        case TARGET_NULL:
                prt_printf(out, "none");
                break;
-       case TARGET_DEV:
-               if (c) {
-                       struct bch_dev *ca;
-
-                       rcu_read_lock();
-                       ca = t.dev < c->sb.nr_devices
-                               ? rcu_dereference(c->devs[t.dev])
-                               : NULL;
-
-                       if (ca && percpu_ref_tryget(&ca->io_ref)) {
-                               prt_printf(out, "/dev/%pg", ca->disk_sb.bdev);
-                               percpu_ref_put(&ca->io_ref);
-                       } else if (ca) {
-                               prt_printf(out, "offline device %u", t.dev);
-                       } else {
-                               prt_printf(out, "invalid device %u", t.dev);
-                       }
-
-                       rcu_read_unlock();
+       case TARGET_DEV: {
+               struct bch_dev *ca;
+
+               rcu_read_lock();
+               ca = t.dev < c->sb.nr_devices
+                       ? rcu_dereference(c->devs[t.dev])
+                       : NULL;
+
+               if (ca && percpu_ref_tryget(&ca->io_ref)) {
+                       prt_printf(out, "/dev/%pg", ca->disk_sb.bdev);
+                       percpu_ref_put(&ca->io_ref);
+               } else if (ca) {
+                       prt_printf(out, "offline device %u", t.dev);
                } else {
-                       struct bch_member m = bch2_sb_member_get(sb, t.dev);
-
-                       if (bch2_dev_exists(sb, t.dev)) {
-                               prt_printf(out, "Device ");
-                               pr_uuid(out, m.uuid.b);
-                               prt_printf(out, " (%u)", t.dev);
-                       } else {
-                               prt_printf(out, "Bad device %u", t.dev);
-                       }
+                       prt_printf(out, "invalid device %u", t.dev);
                }
+
+               rcu_read_unlock();
                break;
+       }
        case TARGET_GROUP:
-               if (c) {
-                       mutex_lock(&c->sb_lock);
-                       bch2_disk_path_to_text(out, c->disk_sb.sb, t.group);
-                       mutex_unlock(&c->sb_lock);
+               bch2_disk_path_to_text(out, c, t.group);
+               break;
+       default:
+               BUG();
+       }
+}
+
+void bch2_target_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v)
+{
+       struct target t = target_decode(v);
+
+       switch (t.type) {
+       case TARGET_NULL:
+               prt_printf(out, "none");
+               break;
+       case TARGET_DEV: {
+               struct bch_member m = bch2_sb_member_get(sb, t.dev);
+
+               if (bch2_dev_exists(sb, t.dev)) {
+                       prt_printf(out, "Device ");
+                       pr_uuid(out, m.uuid.b);
+                       prt_printf(out, " (%u)", t.dev);
                } else {
-                       bch2_disk_path_to_text(out, sb, t.group);
+                       prt_printf(out, "Bad device %u", t.dev);
                }
                break;
+       }
+       case TARGET_GROUP:
+               bch2_disk_path_to_text_sb(out, sb, t.group);
+               break;
        default:
                BUG();
        }
 }
+
+void bch2_opt_target_to_text(struct printbuf *out,
+                            struct bch_fs *c,
+                            struct bch_sb *sb,
+                            u64 v)
+{
+       if (c)
+               bch2_target_to_text(out, c, v);
+       else
+               bch2_target_to_text_sb(out, sb, v);
+}
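
The split gives separate runtime and superblock paths:
bch2_disk_path_to_text() walks the RCU-protected c->disk_groups and
prints nested labels parent-first, dot-separated, while the new _sb
variant reads the superblock directly; bch2_opt_target_to_text()
dispatches on whether a bch_fs is available. Illustrative:

    struct printbuf buf = PRINTBUF;

    /* a group "fast" nested under "ssd" prints as "ssd.fast" */
    bch2_disk_path_to_text(&buf, c, v);
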
index bd7711767fd4f95537fb2ed38d615fdf6aeec250..441826fff224369b79698442e6b314cf5331c02c 100644 (file)
@@ -2,6 +2,8 @@
 #ifndef _BCACHEFS_DISK_GROUPS_H
 #define _BCACHEFS_DISK_GROUPS_H
 
+#include "disk_groups_types.h"
+
 extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;
 
 static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
@@ -83,7 +85,10 @@ int bch2_disk_path_find(struct bch_sb_handle *, const char *);
 /* Exported for userspace bcachefs-tools: */
 int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
 
-void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned);
+void bch2_disk_path_to_text(struct printbuf *, struct bch_fs *, unsigned);
+void bch2_disk_path_to_text_sb(struct printbuf *, struct bch_sb *, unsigned);
+
+void bch2_target_to_text(struct printbuf *out, struct bch_fs *, unsigned);
 
 int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *);
 void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
diff --git a/libbcachefs/disk_groups_types.h b/libbcachefs/disk_groups_types.h
new file mode 100644 (file)
index 0000000..a54ef08
--- /dev/null
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DISK_GROUPS_TYPES_H
+#define _BCACHEFS_DISK_GROUPS_TYPES_H
+
+struct bch_disk_group_cpu {
+       bool                            deleted;
+       u16                             parent;
+       u8                              label[BCH_SB_LABEL_SIZE];
+       struct bch_devs_mask            devs;
+};
+
+struct bch_disk_groups_cpu {
+       struct rcu_head                 rcu;
+       unsigned                        nr;
+       struct bch_disk_group_cpu       entries[] __counted_by(nr);
+};
+
+#endif /* _BCACHEFS_DISK_GROUPS_TYPES_H */
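
entries[] is a flexible array annotated with __counted_by(nr), so nr must
be set to match the allocation. A minimal sketch, assuming the usual
kernel struct_size() idiom:

    static struct bch_disk_groups_cpu *alloc_disk_groups(unsigned nr)
    {
            struct bch_disk_groups_cpu *g =
                    kzalloc(struct_size(g, entries, nr), GFP_KERNEL);

            if (g)
                    g->nr = nr;     /* must agree with the allocation */
            return g;
    }
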
index 7cc083776a2e029a6ec2b11169867e4b02e1173d..3e9f09cea6c799477193cc535b33fb32462e13f4 100644 (file)
        x(BCH_ERR_invalid_sb,           invalid_sb_crypt)                       \
        x(BCH_ERR_invalid_sb,           invalid_sb_clean)                       \
        x(BCH_ERR_invalid_sb,           invalid_sb_quota)                       \
+       x(BCH_ERR_invalid_sb,           invalid_sb_opt_compression)             \
        x(BCH_ERR_invalid,              invalid_bkey)                           \
        x(BCH_ERR_operation_blocked,    nocow_lock_blocked)                     \
        x(EIO,                          btree_node_read_err)                    \
index 1b25f84e4b9cb883fe36dd70bfe43a8df10484aa..0c60d49c3599723d0a526fa9c7d06307e11603eb 100644 (file)
@@ -13,6 +13,7 @@
 #include "btree_iter.h"
 #include "buckets.h"
 #include "checksum.h"
+#include "compress.h"
 #include "debug.h"
 #include "disk_groups.h"
 #include "error.h"
@@ -757,18 +758,6 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
        return i;
 }
 
-static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
-{
-       union bch_extent_entry *next = extent_entry_next(entry);
-
-       /* stripes have ptrs, but their layout doesn't work with this code */
-       BUG_ON(k.k->type == KEY_TYPE_stripe);
-
-       memmove_u64s_down(entry, next,
-                         (u64 *) bkey_val_end(k) - (u64 *) next);
-       k.k->u64s -= (u64 *) next - (u64 *) entry;
-}
-
 /*
  * Returns pointer to the next entry after the one being dropped:
  */
@@ -992,10 +981,6 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
-       struct bch_extent_crc_unpacked crc;
-       const struct bch_extent_ptr *ptr;
-       const struct bch_extent_stripe_ptr *ec;
-       struct bch_dev *ca;
        bool first = true;
 
        if (c)
@@ -1006,9 +991,9 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
                        prt_printf(out, " ");
 
                switch (__extent_entry_type(entry)) {
-               case BCH_EXTENT_ENTRY_ptr:
-                       ptr = entry_to_ptr(entry);
-                       ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+               case BCH_EXTENT_ENTRY_ptr: {
+                       const struct bch_extent_ptr *ptr = entry_to_ptr(entry);
+                       struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
                                ? bch_dev_bkey_exists(c, ptr->dev)
                                : NULL;
 
@@ -1030,10 +1015,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
                                        prt_printf(out, " stale");
                        }
                        break;
+               }
                case BCH_EXTENT_ENTRY_crc32:
                case BCH_EXTENT_ENTRY_crc64:
-               case BCH_EXTENT_ENTRY_crc128:
-                       crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
+               case BCH_EXTENT_ENTRY_crc128: {
+                       struct bch_extent_crc_unpacked crc =
+                               bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
 
                        prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
                               crc.compressed_size,
@@ -1042,12 +1029,26 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
                               bch2_csum_types[crc.csum_type],
                               bch2_compression_types[crc.compression_type]);
                        break;
-               case BCH_EXTENT_ENTRY_stripe_ptr:
-                       ec = &entry->stripe_ptr;
+               }
+               case BCH_EXTENT_ENTRY_stripe_ptr: {
+                       const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr;
 
                        prt_printf(out, "ec: idx %llu block %u",
                               (u64) ec->idx, ec->block);
                        break;
+               }
+               case BCH_EXTENT_ENTRY_rebalance: {
+                       const struct bch_extent_rebalance *r = &entry->rebalance;
+
+                       prt_str(out, "rebalance: target ");
+                       if (c)
+                               bch2_target_to_text(out, c, r->target);
+                       else
+                               prt_printf(out, "%u", r->target);
+                       prt_str(out, " compression ");
+                       bch2_compression_opt_to_text(out, r->compression);
+                       break;
+               }
                default:
                        prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
                        return;
@@ -1207,6 +1208,14 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
                                return -BCH_ERR_invalid_bkey;
                        }
                        crc_since_last_ptr = true;
+
+                       if (crc_is_encoded(crc) &&
+                           (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
+                           (flags & (BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT))) {
+                               prt_printf(err, "too large encoded extent");
+                               return -BCH_ERR_invalid_bkey;
+                       }
+
                        break;
                case BCH_EXTENT_ENTRY_stripe_ptr:
                        if (have_ec) {
@@ -1215,9 +1224,18 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
                        }
                        have_ec = true;
                        break;
-               case BCH_EXTENT_ENTRY_rebalance:
+               case BCH_EXTENT_ENTRY_rebalance: {
+                       const struct bch_extent_rebalance *r = &entry->rebalance;
+
+                       if (!bch2_compression_opt_valid(r->compression)) {
+                               struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
+                               prt_printf(err, "invalid compression opt %u:%u",
+                                          opt.type, opt.level);
+                               return -BCH_ERR_invalid_bkey;
+                       }
                        break;
                }
+               }
        }
 
        if (!nr_ptrs) {
@@ -1281,6 +1299,125 @@ void bch2_ptr_swab(struct bkey_s k)
        }
 }
 
+const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const union bch_extent_entry *entry;
+
+       bkey_extent_entry_for_each(ptrs, entry)
+               if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
+                       return &entry->rebalance;
+
+       return NULL;
+}
+
+unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
+                                      unsigned target, unsigned compression)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       unsigned rewrite_ptrs = 0;
+
+       if (compression) {
+               unsigned compression_type = bch2_compression_opt_to_type(compression);
+               const union bch_extent_entry *entry;
+               struct extent_ptr_decoded p;
+               unsigned i = 0;
+
+               bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+                       if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) {
+                               rewrite_ptrs = 0;
+                               goto incompressible;
+                       }
+
+                       if (!p.ptr.cached && p.crc.compression_type != compression_type)
+                               rewrite_ptrs |= 1U << i;
+                       i++;
+               }
+       }
+incompressible:
+       if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
+               const struct bch_extent_ptr *ptr;
+               unsigned i = 0;
+
+               bkey_for_each_ptr(ptrs, ptr) {
+                       if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target))
+                               rewrite_ptrs |= 1U << i;
+                       i++;
+               }
+       }
+
+       return rewrite_ptrs;
+}
+
+bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
+{
+       const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
+
+       /*
+        * If it's an indirect extent, we don't delete the rebalance entry when
+        * done so that we know what options were applied - check if it still
+        * needs work done:
+        */
+       if (r &&
+           k.k->type == KEY_TYPE_reflink_v &&
+           !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression))
+               r = NULL;
+
+       return r != NULL;
+}
+
+int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
+                                 unsigned target, unsigned compression)
+{
+       struct bkey_s k = bkey_i_to_s(_k);
+       struct bch_extent_rebalance *r;
+       bool needs_rebalance;
+
+       if (!bkey_extent_is_direct_data(k.k))
+               return 0;
+
+       /* get existing rebalance entry: */
+       r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
+       if (r) {
+               if (k.k->type == KEY_TYPE_reflink_v) {
+                       /*
+                        * indirect extents: existing options take precedence,
+                        * so that we don't move extents back and forth if
+                        * they're referenced by different inodes with different
+                        * options:
+                        */
+                       if (r->target)
+                               target = r->target;
+                       if (r->compression)
+                               compression = r->compression;
+               }
+
+               r->target       = target;
+               r->compression  = compression;
+       }
+
+       needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression);
+
+       if (needs_rebalance && !r) {
+               union bch_extent_entry *new = bkey_val_end(k);
+
+               new->rebalance.type             = 1U << BCH_EXTENT_ENTRY_rebalance;
+               new->rebalance.compression      = compression;
+               new->rebalance.target           = target;
+               new->rebalance.unused           = 0;
+               k.k->u64s += extent_entry_u64s(new);
+       } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) {
+               /*
+                * For indirect extents, don't delete the rebalance entry when
+                * we're finished so that we know we specifically moved it or
+                * compressed it to its current location/compression type
+                */
+               extent_entry_drop(k, (union bch_extent_entry *) r);
+       }
+
+       return 0;
+}
+
 /* Generic extent code: */
 
 int bch2_cut_front_s(struct bpos where, struct bkey_s k)
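
bch2_bkey_ptrs_need_rebalance() returns a bitmask over pointer indices
rather than a bool, so callers can rewrite only the offending replicas.
A worked example under assumed state:

    /* Assume an extent with three pointers:
     *   ptr 0: cached          -> never selected
     *   ptr 1: inside target   -> ok
     *   ptr 2: outside target  -> bit 2 set
     */
    unsigned rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, 0);
    /* rewrite_ptrs == 0x4: only pointer 2 needs to move */
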
index 879e7d218b6a4baf58b7a567266d1941f72de1fe..9110acae7e3ca94fea1e56d805405a4326ab73a1 100644 (file)
@@ -89,6 +89,18 @@ static inline void __extent_entry_insert(struct bkey_i *k,
        memcpy_u64s_small(dst, new, extent_entry_u64s(new));
 }
 
+static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
+{
+       union bch_extent_entry *next = extent_entry_next(entry);
+
+       /* stripes have ptrs, but their layout doesn't work with this code */
+       BUG_ON(k.k->type == KEY_TYPE_stripe);
+
+       memmove_u64s_down(entry, next,
+                         (u64 *) bkey_val_end(k) - (u64 *) next);
+       k.k->u64s -= (u64 *) next - (u64 *) entry;
+}
+
 static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
 {
        return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
@@ -190,6 +202,11 @@ static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc)
                crc.compression_type != BCH_COMPRESSION_TYPE_incompressible);
 }
 
+static inline bool crc_is_encoded(struct bch_extent_crc_unpacked crc)
+{
+       return crc.csum_type != BCH_CSUM_none || crc_is_compressed(crc);
+}
+
 /* bkey_ptrs: generically over any key type that has ptrs */
 
 struct bkey_ptrs_c {
@@ -693,6 +710,14 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c,
 
 void bch2_ptr_swab(struct bkey_s);
 
+const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
+unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
+                                      unsigned, unsigned);
+bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
+
+int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
+                                 unsigned, unsigned);
+
 /* Generic extent code: */
 
 enum bch_extent_overlap {
@@ -737,22 +762,4 @@ static inline void bch2_key_resize(struct bkey *k, unsigned new_size)
        k->size = new_size;
 }
 
-/*
- * In extent_sort_fix_overlapping(), insert_fixup_extent(),
- * extent_merge_inline() - we're modifying keys in place that are packed. To do
- * that we have to unpack the key, modify the unpacked key - then this
- * copies/repacks the unpacked to the original as necessary.
- */
-static inline void extent_save(struct btree *b, struct bkey_packed *dst,
-                              struct bkey *src)
-{
-       struct bkey_format *f = &b->format;
-       struct bkey_i *dst_unpacked;
-
-       if ((dst_unpacked = packed_to_bkey(dst)))
-               dst_unpacked->k = *src;
-       else
-               BUG_ON(!bch2_bkey_pack_key(dst, src, f));
-}
-
 #endif /* _BCACHEFS_EXTENTS_H */
index 6a9557e7ecabb47d1a30b7c665367decb3fecbe4..5b42a76c4796f90062bb86e2914d0301e52cf7d0 100644 (file)
@@ -113,6 +113,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
        } else {
                atomic_set(&dio->cl.remaining,
                           CLOSURE_REMAINING_INITIALIZER + 1);
+               dio->cl.closure_get_happened = true;
        }
 
        dio->req        = req;
index f26b824e70a84bedd1d16c45ea966b4f8589971c..328cb3b3e21338878f3ea594cf7342983cf15168 100644 (file)
@@ -1299,6 +1299,28 @@ err:
        return ret;
 }
 
+static int check_extent_overbig(struct btree_trans *trans, struct btree_iter *iter,
+                               struct bkey_s_c k)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       struct bch_extent_crc_unpacked crc;
+       const union bch_extent_entry *i;
+       unsigned encoded_extent_max_sectors = c->opts.encoded_extent_max >> 9;
+
+       bkey_for_each_crc(k.k, ptrs, crc, i)
+               if (crc_is_encoded(crc) &&
+                   crc.uncompressed_size > encoded_extent_max_sectors) {
+                       struct printbuf buf = PRINTBUF;
+
+                       bch2_bkey_val_to_text(&buf, c, k);
+                       bch_err(c, "overbig encoded extent, please report this:\n  %s", buf.buf);
+                       printbuf_exit(&buf);
+               }
+
+       return 0;
+}
+
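
(A worked example of the threshold above, assuming encoded_extent_max is at its 64 KiB default; the option is per-filesystem, so treat the numbers as illustrative:)

    #include <assert.h>

    static void overbig_threshold_example(void)
    {
            unsigned encoded_extent_max = 64 << 10;         /* bytes, assumed default */
            unsigned max_sectors = encoded_extent_max >> 9; /* 512-byte sectors */

            assert(max_sectors == 128);     /* crcs with uncompressed_size > 128 get reported */
    }
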
 static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
                        struct bkey_s_c k,
                        struct inode_walker *inode,
@@ -1434,7 +1456,8 @@ int bch2_check_extents(struct bch_fs *c)
                        &res, NULL,
                        BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
                bch2_disk_reservation_put(c, &res);
-               check_extent(trans, &iter, k, &w, &s, &extent_ends);
+               check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
+               check_extent_overbig(trans, &iter, k);
        })) ?:
        check_i_sectors(trans, &w);
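
(The check_extent() ?: check_extent_overbig() chain relies on the GNU ?: extension: the right-hand call runs only when the left returned 0, so errors short-circuit. Expanded equivalent, for reference:)

    int ret = check_extent(trans, &iter, k, &w, &s, &extent_ends);
    if (!ret)
            ret = check_extent_overbig(trans, &iter, k);
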
 
@@ -1448,6 +1471,30 @@ int bch2_check_extents(struct bch_fs *c)
        return ret;
 }
 
+int bch2_check_indirect_extents(struct bch_fs *c)
+{
+       struct btree_trans *trans = bch2_trans_get(c);
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct disk_reservation res = { 0 };
+       int ret = 0;
+
+       ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
+                       POS_MIN,
+                       BTREE_ITER_PREFETCH, k,
+                       &res, NULL,
+                       BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
+               bch2_disk_reservation_put(c, &res);
+               check_extent_overbig(trans, &iter, k);
+       }));
+
+       bch2_disk_reservation_put(c, &res);
+       bch2_trans_put(trans);
+
+       bch_err_fn(c, ret);
+       return ret;
+}
+
 static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
 {
        struct bch_fs *c = trans->c;
diff --git a/libbcachefs/fsck.h b/libbcachefs/fsck.h
index 90c87b5089a01403bceabeeed390fa3adfea760e..da991e8cf27eb493ed5aac5a3e3da606ae089968 100644
@@ -4,6 +4,7 @@
 
 int bch2_check_inodes(struct bch_fs *);
 int bch2_check_extents(struct bch_fs *);
+int bch2_check_indirect_extents(struct bch_fs *);
 int bch2_check_dirents(struct bch_fs *);
 int bch2_check_xattrs(struct bch_fs *);
 int bch2_check_root(struct bch_fs *);
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index bb3f443d8381cc1dd087e961a593a989146e6014..23fcd442c5145aca5fdb754b78bff2d2c67044a1 100644
@@ -6,6 +6,7 @@
 #include "bkey_methods.h"
 #include "btree_update.h"
 #include "buckets.h"
+#include "compress.h"
 #include "error.h"
 #include "extents.h"
 #include "extent_update.h"
@@ -422,9 +423,10 @@ static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err)
                return -BCH_ERR_invalid_bkey;
        }
 
-       if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) {
-               prt_printf(err, "invalid data checksum type (%u >= %u)",
-                      unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1);
+       if (unpacked.bi_compression &&
+           !bch2_compression_opt_valid(unpacked.bi_compression - 1)) {
+               prt_printf(err, "invalid compression opt %u",
+                          unpacked.bi_compression - 1);
                return -BCH_ERR_invalid_bkey;
        }
 
@@ -979,6 +981,18 @@ void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c,
                opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0;
 }
 
+int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_io_opts *opts)
+{
+       struct bch_inode_unpacked inode;
+       int ret = lockrestart_do(trans, bch2_inode_find_by_inum_trans(trans, inum, &inode));
+
+       if (ret)
+               return ret;
+
+       bch2_inode_opts_get(opts, trans->c, &inode);
+       return 0;
+}
+
 int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
 {
        struct bch_fs *c = trans->c;
diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h
index a7464e1b696046a074f10f2a72cc38163718f041..2781e328158375322dad3b6d7be0858960f0471a 100644
@@ -200,6 +200,7 @@ void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *);
 struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *);
 void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *,
                         struct bch_inode_unpacked *);
+int bch2_inum_opts_get(struct btree_trans *, subvol_inum, struct bch_io_opts *);
 
 int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32);
 int bch2_delete_dead_inodes(struct bch_fs *);
diff --git a/libbcachefs/io_misc.c b/libbcachefs/io_misc.c
index 119834cb8f9ee7f80c10b504ca8602ae5614ac37..0979d5e05713e4769af3b8ff299d7b0a737a7ce7 100644
@@ -16,6 +16,7 @@
 #include "io_misc.h"
 #include "io_write.h"
 #include "logged_ops.h"
+#include "rebalance.h"
 #include "subvolume.h"
 
 /* Overwrites whatever was present with zeroes: */
@@ -355,6 +356,7 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
        struct btree_iter iter;
        struct bkey_i_logged_op_finsert *op = bkey_i_to_logged_op_finsert(op_k);
        subvol_inum inum = { le32_to_cpu(op->v.subvol), le64_to_cpu(op->v.inum) };
+       struct bch_io_opts opts;
        u64 dst_offset = le64_to_cpu(op->v.dst_offset);
        u64 src_offset = le64_to_cpu(op->v.src_offset);
        s64 shift = dst_offset - src_offset;
@@ -363,6 +365,10 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
        bool insert = shift > 0;
        int ret = 0;
 
+       ret = bch2_inum_opts_get(trans, inum, &opts);
+       if (ret)
+               return ret;
+
        bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
                             POS(inum.inum, 0),
                             BTREE_ITER_INTENT);
@@ -443,7 +449,10 @@ case LOGGED_OP_FINSERT_shift_extents:
 
                op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
 
-               ret =   bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
+               ret =   bch2_bkey_set_needs_rebalance(c, copy,
+                                       opts.background_target,
+                                       opts.background_compression) ?:
+                       bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
                        bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
                        bch2_logged_op_update(trans, &op->k_i) ?:
                        bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL);
diff --git a/libbcachefs/io_write.c b/libbcachefs/io_write.c
index 6e4f85eb6ec8ddbaa81b4201d7660712f65474a0..6d9c777213e3ca56305f37123d7e1f82549a9c1a 100644
@@ -351,10 +351,13 @@ static int bch2_write_index_default(struct bch_write_op *op)
                                     bkey_start_pos(&sk.k->k),
                                     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
-               ret = bch2_extent_update(trans, inum, &iter, sk.k,
-                                        &op->res,
-                                        op->new_i_size, &op->i_sectors_delta,
-                                        op->flags & BCH_WRITE_CHECK_ENOSPC);
+               ret =   bch2_bkey_set_needs_rebalance(c, sk.k,
+                                       op->opts.background_target,
+                                       op->opts.background_compression) ?:
+                       bch2_extent_update(trans, inum, &iter, sk.k,
+                                       &op->res,
+                                       op->new_i_size, &op->i_sectors_delta,
+                                       op->flags & BCH_WRITE_CHECK_ENOSPC);
                bch2_trans_iter_exit(trans, &iter);
 
                if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -495,7 +498,6 @@ static void __bch2_write_index(struct bch_write_op *op)
 {
        struct bch_fs *c = op->c;
        struct keylist *keys = &op->insert_keys;
-       struct bkey_i *k;
        unsigned dev;
        int ret = 0;
 
@@ -505,14 +507,6 @@ static void __bch2_write_index(struct bch_write_op *op)
                        goto err;
        }
 
-       /*
-        * probably not the ideal place to hook this in, but I don't
-        * particularly want to plumb io_opts all the way through the btree
-        * update stack right now
-        */
-       for_each_keylist_key(keys, k)
-               bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
-
        if (!bch2_keylist_empty(keys)) {
                u64 sectors_start = keylist_sectors(keys);
 
@@ -816,6 +810,7 @@ static enum prep_encoded_ret {
 
        /* Can we just write the entire extent as is? */
        if (op->crc.uncompressed_size == op->crc.live_size &&
+           op->crc.uncompressed_size <= c->opts.encoded_extent_max >> 9 &&
            op->crc.compressed_size <= wp->sectors_free &&
            (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
             op->incompressible)) {
@@ -1091,9 +1086,7 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op,
 
        e = bkey_s_c_to_extent(k);
        extent_for_each_ptr_decode(e, p, entry) {
-               if (p.crc.csum_type ||
-                   crc_is_compressed(p.crc) ||
-                   p.has_ec)
+               if (crc_is_encoded(p.crc) || p.has_ec)
                        return false;
 
                replicas += bch2_extent_ptr_durability(c, &p);
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 0e7a9ffa3671f729459a3f1a6032021e09937925..5b5d69f2316b216746c0c08db2346c2c8c95ff16 100644
@@ -1019,6 +1019,25 @@ err:
        return ret;
 }
 
+int bch2_fs_journal_alloc(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       unsigned i;
+
+       for_each_online_member(ca, c, i) {
+               if (ca->journal.nr)
+                       continue;
+
+               int ret = bch2_dev_journal_alloc(ca);
+               if (ret) {
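+                       /* for_each_online_member() holds ca->io_ref; drop it before the early return */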
+                       percpu_ref_put(&ca->io_ref);
+                       return ret;
+               }
+       }
+
+       return 0;
+}
+
 /* startup/shutdown: */
 
 static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index 491133cc52f3bf38f9c80cf94daf7d0d5b0cc6c1..011711e99c8d825ec968cf513f82c08a66ecabc5 100644
@@ -534,6 +534,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *);
 int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *,
                                unsigned nr);
 int bch2_dev_journal_alloc(struct bch_dev *);
+int bch2_fs_journal_alloc(struct bch_fs *);
 
 void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
 
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 82f60c7883ba6b08396e763c4eeaea0328d07b44..1b15b010461ae19b7b52dfda1f7bcbcbad63bb93 100644
@@ -20,6 +20,7 @@
 #include "keylist.h"
 #include "move.h"
 #include "replicas.h"
+#include "snapshot.h"
 #include "super-io.h"
 #include "trace.h"
 
@@ -59,20 +60,6 @@ static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c
        }
 }
 
-static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats)
-{
-       mutex_lock(&c->data_progress_lock);
-       list_add(&stats->list, &c->data_progress_list);
-       mutex_unlock(&c->data_progress_lock);
-}
-
-static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats)
-{
-       mutex_lock(&c->data_progress_lock);
-       list_del(&stats->list);
-       mutex_unlock(&c->data_progress_lock);
-}
-
 struct moving_io {
        struct list_head                read_list;
        struct list_head                io_list;
@@ -156,13 +143,11 @@ static void move_read_endio(struct bio *bio)
        closure_put(&ctxt->cl);
 }
 
-void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
-                                       struct btree_trans *trans)
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt)
 {
        struct moving_io *io;
 
-       if (trans)
-               bch2_trans_unlock(trans);
+       bch2_trans_unlock(ctxt->trans);
 
        while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) {
                list_del(&io->read_list);
@@ -170,21 +155,20 @@ void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt,
        }
 }
 
-static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt,
-                                      struct btree_trans *trans)
+void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
 {
        unsigned sectors_pending = atomic_read(&ctxt->write_sectors);
 
-       move_ctxt_wait_event(ctxt, trans,
+       move_ctxt_wait_event(ctxt,
                !atomic_read(&ctxt->write_sectors) ||
                atomic_read(&ctxt->write_sectors) != sectors_pending);
 }
 
 void bch2_moving_ctxt_exit(struct moving_context *ctxt)
 {
-       struct bch_fs *c = ctxt->c;
+       struct bch_fs *c = ctxt->trans->c;
 
-       move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads));
+       move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
        closure_sync(&ctxt->cl);
 
        EBUG_ON(atomic_read(&ctxt->write_sectors));
@@ -192,16 +176,12 @@ void bch2_moving_ctxt_exit(struct moving_context *ctxt)
        EBUG_ON(atomic_read(&ctxt->read_sectors));
        EBUG_ON(atomic_read(&ctxt->read_ios));
 
-       if (ctxt->stats) {
-               progress_list_del(c, ctxt->stats);
-               trace_move_data(c,
-                               atomic64_read(&ctxt->stats->sectors_moved),
-                               atomic64_read(&ctxt->stats->keys_moved));
-       }
-
        mutex_lock(&c->moving_context_lock);
        list_del(&ctxt->list);
        mutex_unlock(&c->moving_context_lock);
+
+       bch2_trans_put(ctxt->trans);
+       memset(ctxt, 0, sizeof(*ctxt));
 }
 
 void bch2_moving_ctxt_init(struct moving_context *ctxt,
@@ -213,7 +193,7 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt,
 {
        memset(ctxt, 0, sizeof(*ctxt));
 
-       ctxt->c         = c;
+       ctxt->trans     = bch2_trans_get(c);
        ctxt->fn        = (void *) _RET_IP_;
        ctxt->rate      = rate;
        ctxt->stats     = stats;
@@ -230,16 +210,17 @@ void bch2_moving_ctxt_init(struct moving_context *ctxt,
        mutex_lock(&c->moving_context_lock);
        list_add(&ctxt->list, &c->moving_context_list);
        mutex_unlock(&c->moving_context_lock);
+}
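
(With this refactor the moving_context owns its btree_trans, taken in bch2_moving_ctxt_init() and put in bch2_moving_ctxt_exit(), so the caller pattern becomes, as in bch2_move_data() below:)

    struct moving_context ctxt;

    bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
    ret = __bch2_move_data(&ctxt, start, end, pred, arg);   /* uses ctxt.trans */
    bch2_moving_ctxt_exit(&ctxt);                           /* puts ctxt.trans */
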
 
-       if (stats) {
-               progress_list_add(c, stats);
-               stats->data_type = BCH_DATA_user;
-       }
+void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
+{
+       trace_move_data(c, stats);
 }
 
 void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
 {
        memset(stats, 0, sizeof(*stats));
+       stats->data_type = BCH_DATA_user;
        scnprintf(stats->name, sizeof(stats->name), "%s", name);
 }
 
@@ -286,15 +267,14 @@ static int bch2_extent_drop_ptrs(struct btree_trans *trans,
                bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
 }
 
-static int bch2_move_extent(struct btree_trans *trans,
-                           struct btree_iter *iter,
-                           struct moving_context *ctxt,
-                           struct move_bucket_in_flight *bucket_in_flight,
-                           struct bch_io_opts io_opts,
-                           enum btree_id btree_id,
-                           struct bkey_s_c k,
-                           struct data_update_opts data_opts)
+int bch2_move_extent(struct moving_context *ctxt,
+                    struct move_bucket_in_flight *bucket_in_flight,
+                    struct btree_iter *iter,
+                    struct bkey_s_c k,
+                    struct bch_io_opts io_opts,
+                    struct data_update_opts data_opts)
 {
+       struct btree_trans *trans = ctxt->trans;
        struct bch_fs *c = trans->c;
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        struct moving_io *io;
@@ -303,6 +283,8 @@ static int bch2_move_extent(struct btree_trans *trans,
        unsigned sectors = k.k->size, pages;
        int ret = -ENOMEM;
 
+       if (ctxt->stats)
+               ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
        trace_move_extent2(c, k);
 
        bch2_data_update_opts_normalize(k, &data_opts);
@@ -355,7 +337,7 @@ static int bch2_move_extent(struct btree_trans *trans,
        io->rbio.bio.bi_end_io          = move_read_endio;
 
        ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp,
-                                   io_opts, data_opts, btree_id, k);
+                                   io_opts, data_opts, iter->btree_id, k);
        if (ret && ret != -BCH_ERR_unwritten_extent_update)
                goto err_free_pages;
 
@@ -367,9 +349,11 @@ static int bch2_move_extent(struct btree_trans *trans,
 
        BUG_ON(ret);
 
-       io->write.ctxt = ctxt;
        io->write.op.end_io = move_write_done;
 
+       if (ctxt->rate)
+               bch2_ratelimit_increment(ctxt->rate, k.k->size);
+
        if (ctxt->stats) {
                atomic64_inc(&ctxt->stats->keys_moved);
                atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
@@ -399,7 +383,7 @@ static int bch2_move_extent(struct btree_trans *trans,
        closure_get(&ctxt->cl);
        bch2_read_extent(trans, &io->rbio,
                         bkey_start_pos(k.k),
-                        btree_id, k, 0,
+                        iter->btree_id, k, 0,
                         BCH_READ_NODECODE|
                         BCH_READ_LAST_FRAGMENT);
        return 0;
@@ -413,45 +397,96 @@ err:
        return ret;
 }
 
-static int lookup_inode(struct btree_trans *trans, struct bpos pos,
-                       struct bch_inode_unpacked *inode)
+struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
+                         struct per_snapshot_io_opts *io_opts,
+                         struct bkey_s_c extent_k)
+{
+       struct bch_fs *c = trans->c;
+       u32 restart_count = trans->restart_count;
+       int ret = 0;
+
+       if (io_opts->cur_inum != extent_k.k->p.inode) {
+               struct btree_iter iter;
+               struct bkey_s_c k;
+
+               io_opts->d.nr = 0;
+
+               for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
+                                  BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+                       if (k.k->p.offset != extent_k.k->p.inode)
+                               break;
+
+                       if (!bkey_is_inode(k.k))
+                               continue;
+
+                       struct bch_inode_unpacked inode;
+                       BUG_ON(bch2_inode_unpack(k, &inode));
+
+                       struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
+                       bch2_inode_opts_get(&e.io_opts, trans->c, &inode);
+
+                       ret = darray_push(&io_opts->d, e);
+                       if (ret)
+                               break;
+               }
+               bch2_trans_iter_exit(trans, &iter);
+               io_opts->cur_inum = extent_k.k->p.inode;
+       }
+
+       ret = ret ?: trans_was_restarted(trans, restart_count);
+       if (ret)
+               return ERR_PTR(ret);
+
+       if (extent_k.k->p.snapshot) {
+               struct snapshot_io_opts_entry *i;
+               darray_for_each(io_opts->d, i)
+                       if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
+                               return &i->io_opts;
+       }
+
+       return &io_opts->fs_io_opts;
+}
+
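
(A standalone sketch of the snapshot-aware lookup above; is_ancestor() is a hypothetical stand-in for bch2_snapshot_is_ancestor(), and the darray is modeled as a plain array:)

    #include <stdbool.h>
    #include <stddef.h>

    struct opts_entry { unsigned snapshot; int opts; };

    /* return the cached entry whose snapshot is an ancestor of the
     * extent's snapshot, else fall back to filesystem-wide options */
    static const int *lookup_opts(const struct opts_entry *d, size_t nr,
                                  unsigned extent_snapshot, const int *fs_opts,
                                  bool (*is_ancestor)(unsigned child, unsigned parent))
    {
            for (size_t i = 0; i < nr; i++)
                    if (is_ancestor(extent_snapshot, d[i].snapshot))
                            return &d[i].opts;
            return fs_opts;
    }
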
+int bch2_move_get_io_opts_one(struct btree_trans *trans,
+                             struct bch_io_opts *io_opts,
+                             struct bkey_s_c extent_k)
 {
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;
 
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos,
-                            BTREE_ITER_ALL_SNAPSHOTS);
-       k = bch2_btree_iter_peek(&iter);
+       /* reflink btree? */
+       if (!extent_k.k->p.inode) {
+               *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
+               return 0;
+       }
+
+       k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+                              SPOS(0, extent_k.k->p.inode, extent_k.k->p.snapshot),
+                              BTREE_ITER_CACHED);
        ret = bkey_err(k);
-       if (ret)
-               goto err;
+       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+               return ret;
 
-       if (!k.k || !bkey_eq(k.k->p, pos)) {
-               ret = -BCH_ERR_ENOENT_inode;
-               goto err;
+       if (!ret && bkey_is_inode(k.k)) {
+               struct bch_inode_unpacked inode;
+               bch2_inode_unpack(k, &inode);
+               bch2_inode_opts_get(io_opts, trans->c, &inode);
+       } else {
+               *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
        }
 
-       ret = bkey_is_inode(k.k) ? 0 : -EIO;
-       if (ret)
-               goto err;
-
-       ret = bch2_inode_unpack(k, inode);
-       if (ret)
-               goto err;
-err:
        bch2_trans_iter_exit(trans, &iter);
-       return ret;
+       return 0;
 }
 
-static int move_ratelimit(struct btree_trans *trans,
-                         struct moving_context *ctxt)
+int bch2_move_ratelimit(struct moving_context *ctxt)
 {
-       struct bch_fs *c = trans->c;
+       struct bch_fs *c = ctxt->trans->c;
        u64 delay;
 
        if (ctxt->wait_on_copygc) {
-               bch2_trans_unlock(trans);
+               bch2_trans_unlock(ctxt->trans);
                wait_event_killable(c->copygc_running_wq,
                                    !c->copygc_running ||
                                    kthread_should_stop());
@@ -461,7 +496,7 @@ static int move_ratelimit(struct btree_trans *trans,
                delay = ctxt->rate ? bch2_ratelimit_delay(ctxt->rate) : 0;
 
                if (delay) {
-                       bch2_trans_unlock(trans);
+                       bch2_trans_unlock(ctxt->trans);
                        set_current_state(TASK_INTERRUPTIBLE);
                }
 
@@ -474,7 +509,7 @@ static int move_ratelimit(struct btree_trans *trans,
                        schedule_timeout(delay);
 
                if (unlikely(freezing(current))) {
-                       move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads));
+                       move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
                        try_to_freeze();
                }
        } while (delay);
@@ -483,7 +518,7 @@ static int move_ratelimit(struct btree_trans *trans,
         * XXX: these limits really ought to be per device, SSDs and hard drives
         * will want different limits
         */
-       move_ctxt_wait_event(ctxt, trans,
+       move_ctxt_wait_event(ctxt,
                atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 &&
                atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 &&
                atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight &&
@@ -492,52 +527,28 @@ static int move_ratelimit(struct btree_trans *trans,
        return 0;
 }
 
-static int move_get_io_opts(struct btree_trans *trans,
-                           struct bch_io_opts *io_opts,
-                           struct bkey_s_c k, u64 *cur_inum)
-{
-       struct bch_inode_unpacked inode;
-       int ret;
-
-       if (*cur_inum == k.k->p.inode)
-               return 0;
-
-       ret = lookup_inode(trans,
-                          SPOS(0, k.k->p.inode, k.k->p.snapshot),
-                          &inode);
-       if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
-               return ret;
-
-       if (!ret)
-               bch2_inode_opts_get(io_opts, trans->c, &inode);
-       else
-               *io_opts = bch2_opts_to_inode_opts(trans->c->opts);
-       *cur_inum = k.k->p.inode;
-       return 0;
-}
-
-static int __bch2_move_data(struct moving_context *ctxt,
-                           struct bpos start,
-                           struct bpos end,
-                           move_pred_fn pred, void *arg,
-                           enum btree_id btree_id)
+static int bch2_move_data_btree(struct moving_context *ctxt,
+                               struct bpos start,
+                               struct bpos end,
+                               move_pred_fn pred, void *arg,
+                               enum btree_id btree_id)
 {
-       struct bch_fs *c = ctxt->c;
-       struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
+       struct btree_trans *trans = ctxt->trans;
+       struct bch_fs *c = trans->c;
+       struct per_snapshot_io_opts snapshot_io_opts;
+       struct bch_io_opts *io_opts;
        struct bkey_buf sk;
-       struct btree_trans *trans = bch2_trans_get(c);
        struct btree_iter iter;
        struct bkey_s_c k;
        struct data_update_opts data_opts;
-       u64 cur_inum = U64_MAX;
        int ret = 0, ret2;
 
+       per_snapshot_io_opts_init(&snapshot_io_opts, c);
        bch2_bkey_buf_init(&sk);
 
        if (ctxt->stats) {
                ctxt->stats->data_type  = BCH_DATA_user;
-               ctxt->stats->btree_id   = btree_id;
-               ctxt->stats->pos        = start;
+               ctxt->stats->pos        = BBPOS(btree_id, start);
        }
 
        bch2_trans_iter_init(trans, &iter, btree_id, start,
@@ -547,7 +558,7 @@ static int __bch2_move_data(struct moving_context *ctxt,
        if (ctxt->rate)
                bch2_ratelimit_reset(ctxt->rate);
 
-       while (!move_ratelimit(trans, ctxt)) {
+       while (!bch2_move_ratelimit(ctxt)) {
                bch2_trans_begin(trans);
 
                k = bch2_btree_iter_peek(&iter);
@@ -564,17 +575,18 @@ static int __bch2_move_data(struct moving_context *ctxt,
                        break;
 
                if (ctxt->stats)
-                       ctxt->stats->pos = iter.pos;
+                       ctxt->stats->pos = BBPOS(iter.btree_id, iter.pos);
 
                if (!bkey_extent_is_direct_data(k.k))
                        goto next_nondata;
 
-               ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
+               io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k);
+               ret = PTR_ERR_OR_ZERO(io_opts);
                if (ret)
                        continue;
 
                memset(&data_opts, 0, sizeof(data_opts));
-               if (!pred(c, arg, k, &io_opts, &data_opts))
+               if (!pred(c, arg, k, io_opts, &data_opts))
                        goto next;
 
                /*
@@ -584,24 +596,20 @@ static int __bch2_move_data(struct moving_context *ctxt,
                bch2_bkey_buf_reassemble(&sk, c, k);
                k = bkey_i_to_s_c(sk.k);
 
-               ret2 = bch2_move_extent(trans, &iter, ctxt, NULL,
-                                       io_opts, btree_id, k, data_opts);
+               ret2 = bch2_move_extent(ctxt, NULL, &iter, k, *io_opts, data_opts);
                if (ret2) {
                        if (bch2_err_matches(ret2, BCH_ERR_transaction_restart))
                                continue;
 
                        if (ret2 == -ENOMEM) {
                                /* memory allocation failure, wait for some IO to finish */
-                               bch2_move_ctxt_wait_for_io(ctxt, trans);
+                               bch2_move_ctxt_wait_for_io(ctxt);
                                continue;
                        }
 
                        /* XXX signal failure */
                        goto next;
                }
-
-               if (ctxt->rate)
-                       bch2_ratelimit_increment(ctxt->rate, k.k->size);
 next:
                if (ctxt->stats)
                        atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
@@ -610,59 +618,68 @@ next_nondata:
        }
 
        bch2_trans_iter_exit(trans, &iter);
-       bch2_trans_put(trans);
        bch2_bkey_buf_exit(&sk, c);
+       per_snapshot_io_opts_exit(&snapshot_io_opts);
 
        return ret;
 }
 
-int bch2_move_data(struct bch_fs *c,
-                  enum btree_id start_btree_id, struct bpos start_pos,
-                  enum btree_id end_btree_id,   struct bpos end_pos,
-                  struct bch_ratelimit *rate,
-                  struct bch_move_stats *stats,
-                  struct write_point_specifier wp,
-                  bool wait_on_copygc,
-                  move_pred_fn pred, void *arg)
+int __bch2_move_data(struct moving_context *ctxt,
+                    struct bbpos start,
+                    struct bbpos end,
+                    move_pred_fn pred, void *arg)
 {
-       struct moving_context ctxt;
+       struct bch_fs *c = ctxt->trans->c;
        enum btree_id id;
        int ret = 0;
 
-       bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
-
-       for (id = start_btree_id;
-            id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
+       for (id = start.btree;
+            id <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
             id++) {
-               stats->btree_id = id;
+               ctxt->stats->pos = BBPOS(id, POS_MIN);
 
-               if (id != BTREE_ID_extents &&
-                   id != BTREE_ID_reflink)
+               if (!btree_type_has_ptrs(id) ||
+                   !bch2_btree_id_root(c, id)->b)
                        continue;
 
-               if (!bch2_btree_id_root(c, id)->b)
-                       continue;
-
-               ret = __bch2_move_data(&ctxt,
-                                      id == start_btree_id ? start_pos : POS_MIN,
-                                      id == end_btree_id   ? end_pos   : POS_MAX,
+               ret = bch2_move_data_btree(ctxt,
+                                      id == start.btree ? start.pos : POS_MIN,
+                                      id == end.btree   ? end.pos   : POS_MAX,
                                       pred, arg, id);
                if (ret)
                        break;
        }
 
+       return ret;
+}
+
+int bch2_move_data(struct bch_fs *c,
+                  struct bbpos start,
+                  struct bbpos end,
+                  struct bch_ratelimit *rate,
+                  struct bch_move_stats *stats,
+                  struct write_point_specifier wp,
+                  bool wait_on_copygc,
+                  move_pred_fn pred, void *arg)
+{
+       struct moving_context ctxt;
+       int ret;
+
+       bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
+       ret = __bch2_move_data(&ctxt, start, end, pred, arg);
        bch2_moving_ctxt_exit(&ctxt);
 
        return ret;
 }
 
-int __bch2_evacuate_bucket(struct btree_trans *trans,
-                          struct moving_context *ctxt,
+int __bch2_evacuate_bucket(struct moving_context *ctxt,
                           struct move_bucket_in_flight *bucket_in_flight,
                           struct bpos bucket, int gen,
                           struct data_update_opts _data_opts)
 {
-       struct bch_fs *c = ctxt->c;
+       struct btree_trans *trans = ctxt->trans;
+       struct bch_fs *c = trans->c;
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        struct btree_iter iter;
        struct bkey_buf sk;
@@ -673,7 +690,6 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
        struct data_update_opts data_opts;
        unsigned dirty_sectors, bucket_size;
        u64 fragmentation;
-       u64 cur_inum = U64_MAX;
        struct bpos bp_pos = POS_MIN;
        int ret = 0;
 
@@ -708,7 +724,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
                goto err;
        }
 
-       while (!(ret = move_ratelimit(trans, ctxt))) {
+       while (!(ret = bch2_move_ratelimit(ctxt))) {
                bch2_trans_begin(trans);
 
                ret = bch2_get_next_backpointer(trans, bucket, gen,
@@ -737,7 +753,7 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
                        bch2_bkey_buf_reassemble(&sk, c, k);
                        k = bkey_i_to_s_c(sk.k);
 
-                       ret = move_get_io_opts(trans, &io_opts, k, &cur_inum);
+                       ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
                        if (ret) {
                                bch2_trans_iter_exit(trans, &iter);
                                continue;
@@ -758,23 +774,20 @@ int __bch2_evacuate_bucket(struct btree_trans *trans,
                                i++;
                        }
 
-                       ret = bch2_move_extent(trans, &iter, ctxt,
-                                       bucket_in_flight,
-                                       io_opts, bp.btree_id, k, data_opts);
+                       ret = bch2_move_extent(ctxt, bucket_in_flight,
+                                              &iter, k, io_opts, data_opts);
                        bch2_trans_iter_exit(trans, &iter);
 
                        if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                                continue;
                        if (ret == -ENOMEM) {
                                /* memory allocation failure, wait for some IO to finish */
-                               bch2_move_ctxt_wait_for_io(ctxt, trans);
+                               bch2_move_ctxt_wait_for_io(ctxt);
                                continue;
                        }
                        if (ret)
                                goto err;
 
-                       if (ctxt->rate)
-                               bch2_ratelimit_increment(ctxt->rate, k.k->size);
                        if (ctxt->stats)
                                atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
                } else {
@@ -825,14 +838,12 @@ int bch2_evacuate_bucket(struct bch_fs *c,
                         struct write_point_specifier wp,
                         bool wait_on_copygc)
 {
-       struct btree_trans *trans = bch2_trans_get(c);
        struct moving_context ctxt;
        int ret;
 
        bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
-       ret = __bch2_evacuate_bucket(trans, &ctxt, NULL, bucket, gen, data_opts);
+       ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts);
        bch2_moving_ctxt_exit(&ctxt);
-       bch2_trans_put(trans);
 
        return ret;
 }
@@ -849,21 +860,25 @@ static int bch2_move_btree(struct bch_fs *c,
 {
        bool kthread = (current->flags & PF_KTHREAD) != 0;
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
-       struct btree_trans *trans = bch2_trans_get(c);
+       struct moving_context ctxt;
+       struct btree_trans *trans;
        struct btree_iter iter;
        struct btree *b;
        enum btree_id id;
        struct data_update_opts data_opts;
        int ret = 0;
 
-       progress_list_add(c, stats);
+       bch2_moving_ctxt_init(&ctxt, c, NULL, stats,
+                             writepoint_ptr(&c->btree_write_point),
+                             true);
+       trans = ctxt.trans;
 
        stats->data_type = BCH_DATA_btree;
 
        for (id = start_btree_id;
             id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
             id++) {
-               stats->btree_id = id;
+               stats->pos = BBPOS(id, POS_MIN);
 
                if (!bch2_btree_id_root(c, id)->b)
                        continue;
@@ -882,7 +897,7 @@ retry:
                             bpos_cmp(b->key.k.p, end_pos)) > 0)
                                break;
 
-                       stats->pos = iter.pos;
+                       stats->pos = BBPOS(iter.btree_id, iter.pos);
 
                        if (!pred(c, arg, b, &io_opts, &data_opts))
                                goto next;
@@ -904,14 +919,10 @@ next:
                        break;
        }
 
-       bch2_trans_put(trans);
-
-       if (ret)
-               bch_err_fn(c, ret);
-
+       bch_err_fn(c, ret);
+       bch2_moving_ctxt_exit(&ctxt);
        bch2_btree_interior_updates_flush(c);
 
-       progress_list_del(c, stats);
        return ret;
 }
 
@@ -1032,8 +1043,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
                mutex_unlock(&c->sb_lock);
        }
 
-       if (ret)
-               bch_err_fn(c, ret);
+       bch_err_fn(c, ret);
        return ret;
 }
 
@@ -1056,14 +1066,16 @@ int bch2_data_job(struct bch_fs *c,
                ret = bch2_replicas_gc2(c) ?: ret;
 
                ret = bch2_move_data(c,
-                                    op.start_btree,    op.start_pos,
-                                    op.end_btree,      op.end_pos,
+                                    (struct bbpos) { op.start_btree,   op.start_pos },
+                                    (struct bbpos) { op.end_btree,     op.end_pos },
                                     NULL,
                                     stats,
                                     writepoint_hashed((unsigned long) current),
                                     true,
                                     rereplicate_pred, c) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;
+
+               bch2_move_stats_exit(stats, c);
                break;
        case BCH_DATA_OP_MIGRATE:
                if (op.migrate.dev >= c->sb.nr_devices)
@@ -1080,18 +1092,21 @@ int bch2_data_job(struct bch_fs *c,
                ret = bch2_replicas_gc2(c) ?: ret;
 
                ret = bch2_move_data(c,
-                                    op.start_btree,    op.start_pos,
-                                    op.end_btree,      op.end_pos,
+                                    (struct bbpos) { op.start_btree,   op.start_pos },
+                                    (struct bbpos) { op.end_btree,     op.end_pos },
                                     NULL,
                                     stats,
                                     writepoint_hashed((unsigned long) current),
                                     true,
                                     migrate_pred, &op) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;
+
+               bch2_move_stats_exit(stats, c);
                break;
        case BCH_DATA_OP_REWRITE_OLD_NODES:
                bch2_move_stats_init(stats, "rewrite_old_nodes");
                ret = bch2_scan_old_btree_nodes(c, stats);
+               bch2_move_stats_exit(stats, c);
                break;
        default:
                ret = -EINVAL;
@@ -1100,19 +1115,43 @@ int bch2_data_job(struct bch_fs *c,
        return ret;
 }
 
-static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
+void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
 {
-       struct bch_move_stats *stats = ctxt->stats;
-       struct moving_io *io;
+       prt_printf(out, "%s: data type=%s pos=",
+                  stats->name,
+                  bch2_data_types[stats->data_type]);
+       bch2_bbpos_to_text(out, stats->pos);
+       prt_newline(out);
+       printbuf_indent_add(out, 2);
+
+       prt_str(out, "keys moved:  ");
+       prt_u64(out, atomic64_read(&stats->keys_moved));
+       prt_newline(out);
+
+       prt_str(out, "keys raced:  ");
+       prt_u64(out, atomic64_read(&stats->keys_raced));
+       prt_newline(out);
+
+       prt_str(out, "bytes seen:  ");
+       prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9);
+       prt_newline(out);
 
-       prt_printf(out, "%s (%ps):", stats->name, ctxt->fn);
+       prt_str(out, "bytes moved: ");
+       prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9);
        prt_newline(out);
 
-       prt_printf(out, " data type %s btree_id %s position: ",
-                  bch2_data_types[stats->data_type],
-                  bch2_btree_id_str(stats->btree_id));
-       bch2_bpos_to_text(out, stats->pos);
+       prt_str(out, "bytes raced: ");
+       prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9);
        prt_newline(out);
+
+       printbuf_indent_sub(out, 2);
+}
+
+static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt)
+{
+       struct moving_io *io;
+
+       bch2_move_stats_to_text(out, ctxt->stats);
        printbuf_indent_add(out, 2);
 
        prt_printf(out, "reads: ios %u/%u sectors %u/%u",
@@ -1153,7 +1192,4 @@ void bch2_fs_move_init(struct bch_fs *c)
 {
        INIT_LIST_HEAD(&c->moving_context_list);
        mutex_init(&c->moving_context_lock);
-
-       INIT_LIST_HEAD(&c->data_progress_list);
-       mutex_init(&c->data_progress_lock);
 }
diff --git a/libbcachefs/move.h b/libbcachefs/move.h
index cbdd58db8782b24e06f03fbecd23efeaf8aaace7..1b1e8678bfaef452f3d8ccd456fc679bdc8b46c5 100644
@@ -2,6 +2,7 @@
 #ifndef _BCACHEFS_MOVE_H
 #define _BCACHEFS_MOVE_H
 
+#include "bbpos.h"
 #include "bcachefs_ioctl.h"
 #include "btree_iter.h"
 #include "buckets.h"
@@ -11,7 +12,7 @@
 struct bch_read_bio;
 
 struct moving_context {
-       struct bch_fs           *c;
+       struct btree_trans      *trans;
        struct list_head        list;
        void                    *fn;
 
@@ -37,10 +38,10 @@ struct moving_context {
        wait_queue_head_t       wait;
 };
 
-#define move_ctxt_wait_event(_ctxt, _trans, _cond)                     \
+#define move_ctxt_wait_event(_ctxt, _cond)                             \
 do {                                                                   \
        bool cond_finished = false;                                     \
-       bch2_moving_ctxt_do_pending_writes(_ctxt, _trans);              \
+       bch2_moving_ctxt_do_pending_writes(_ctxt);                      \
                                                                        \
        if (_cond)                                                      \
                break;                                                  \
@@ -59,22 +60,60 @@ void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
                           struct bch_ratelimit *, struct bch_move_stats *,
                           struct write_point_specifier, bool);
 struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *);
-void bch2_moving_ctxt_do_pending_writes(struct moving_context *,
-                                       struct btree_trans *);
+void bch2_moving_ctxt_do_pending_writes(struct moving_context *);
+void bch2_move_ctxt_wait_for_io(struct moving_context *);
+int bch2_move_ratelimit(struct moving_context *);
+
+/* Inodes in different snapshots may have different IO options: */
+struct snapshot_io_opts_entry {
+       u32                     snapshot;
+       struct bch_io_opts      io_opts;
+};
+
+struct per_snapshot_io_opts {
+       u64                     cur_inum;
+       struct bch_io_opts      fs_io_opts;
+       DARRAY(struct snapshot_io_opts_entry) d;
+};
+
+static inline void per_snapshot_io_opts_init(struct per_snapshot_io_opts *io_opts, struct bch_fs *c)
+{
+       memset(io_opts, 0, sizeof(*io_opts));
+       io_opts->fs_io_opts = bch2_opts_to_inode_opts(c->opts);
+}
+
+static inline void per_snapshot_io_opts_exit(struct per_snapshot_io_opts *io_opts)
+{
+       darray_exit(&io_opts->d);
+}
+
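
(Assumed calling pattern for these helpers, mirroring bch2_move_data_btree() in this patch:)

    struct per_snapshot_io_opts snapshot_io_opts;
    struct bch_io_opts *io_opts;

    per_snapshot_io_opts_init(&snapshot_io_opts, c);

    /* per extent: */
    io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, k);
    ret = PTR_ERR_OR_ZERO(io_opts);

    /* ... */

    per_snapshot_io_opts_exit(&snapshot_io_opts);
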
+struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *,
+                               struct per_snapshot_io_opts *, struct bkey_s_c);
+int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, struct bkey_s_c);
 
 int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *);
 
+int bch2_move_extent(struct moving_context *,
+                    struct move_bucket_in_flight *,
+                    struct btree_iter *,
+                    struct bkey_s_c,
+                    struct bch_io_opts,
+                    struct data_update_opts);
+
+int __bch2_move_data(struct moving_context *,
+                    struct bbpos,
+                    struct bbpos,
+                    move_pred_fn, void *);
 int bch2_move_data(struct bch_fs *,
-                  enum btree_id, struct bpos,
-                  enum btree_id, struct bpos,
+                  struct bbpos start,
+                  struct bbpos end,
                   struct bch_ratelimit *,
                   struct bch_move_stats *,
                   struct write_point_specifier,
                   bool,
                   move_pred_fn, void *);
 
-int __bch2_evacuate_bucket(struct btree_trans *,
-                          struct moving_context *,
+int __bch2_evacuate_bucket(struct moving_context *,
                           struct move_bucket_in_flight *,
                           struct bpos, int,
                           struct data_update_opts);
@@ -88,7 +127,10 @@ int bch2_data_job(struct bch_fs *,
                  struct bch_move_stats *,
                  struct bch_ioctl_data);
 
-void bch2_move_stats_init(struct bch_move_stats *stats, char *name);
+void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *);
+void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *);
+void bch2_move_stats_init(struct bch_move_stats *, char *);
+
 void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
 
 void bch2_fs_move_init(struct bch_fs *);
diff --git a/libbcachefs/move_types.h b/libbcachefs/move_types.h
index baf1f8570b3fe05f1733ae9be1cfa28c9e3db933..e22841ef31e475fdfa11d8dcc7d48adb8d333897 100644
@@ -2,17 +2,17 @@
 #ifndef _BCACHEFS_MOVE_TYPES_H
 #define _BCACHEFS_MOVE_TYPES_H
 
+#include "bbpos_types.h"
+
 struct bch_move_stats {
        enum bch_data_type      data_type;
-       enum btree_id           btree_id;
-       struct bpos             pos;
-       struct list_head        list;
+       struct bbpos            pos;
        char                    name[32];
 
        atomic64_t              keys_moved;
        atomic64_t              keys_raced;
-       atomic64_t              sectors_moved;
        atomic64_t              sectors_seen;
+       atomic64_t              sectors_moved;
        atomic64_t              sectors_raced;
 };
 
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 4017120baeeebddecee6180522fe99f84f291db1..f73b9b7f4bf7ee111e586754d868e13c5fc1d0bb 100644
@@ -101,8 +101,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
        return ret;
 }
 
-static void move_buckets_wait(struct btree_trans *trans,
-                             struct moving_context *ctxt,
+static void move_buckets_wait(struct moving_context *ctxt,
                              struct buckets_in_flight *list,
                              bool flush)
 {
@@ -111,7 +110,7 @@ static void move_buckets_wait(struct btree_trans *trans,
 
        while ((i = list->first)) {
                if (flush)
-                       move_ctxt_wait_event(ctxt, trans, !atomic_read(&i->count));
+                       move_ctxt_wait_event(ctxt, !atomic_read(&i->count));
 
                if (atomic_read(&i->count))
                        break;
@@ -129,7 +128,7 @@ static void move_buckets_wait(struct btree_trans *trans,
                kfree(i);
        }
 
-       bch2_trans_unlock(trans);
+       bch2_trans_unlock(ctxt->trans);
 }
 
 static bool bucket_in_flight(struct buckets_in_flight *list,
@@ -140,11 +139,11 @@ static bool bucket_in_flight(struct buckets_in_flight *list,
 
 typedef DARRAY(struct move_bucket) move_buckets;
 
-static int bch2_copygc_get_buckets(struct btree_trans *trans,
-                       struct moving_context *ctxt,
+static int bch2_copygc_get_buckets(struct moving_context *ctxt,
                        struct buckets_in_flight *buckets_in_flight,
                        move_buckets *buckets)
 {
+       struct btree_trans *trans = ctxt->trans;
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
        struct bkey_s_c k;
@@ -152,7 +151,7 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans,
        size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
        int ret;
 
-       move_buckets_wait(trans, ctxt, buckets_in_flight, false);
+       move_buckets_wait(ctxt, buckets_in_flight, false);
 
        ret = bch2_btree_write_buffer_flush(trans);
        if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()",
@@ -188,10 +187,10 @@ static int bch2_copygc_get_buckets(struct btree_trans *trans,
 }
 
 noinline
-static int bch2_copygc(struct btree_trans *trans,
-                      struct moving_context *ctxt,
+static int bch2_copygc(struct moving_context *ctxt,
                       struct buckets_in_flight *buckets_in_flight)
 {
+       struct btree_trans *trans = ctxt->trans;
        struct bch_fs *c = trans->c;
        struct data_update_opts data_opts = {
                .btree_insert_flags = BCH_WATERMARK_copygc,
@@ -202,7 +201,7 @@ static int bch2_copygc(struct btree_trans *trans,
        u64 moved = atomic64_read(&ctxt->stats->sectors_moved);
        int ret = 0;
 
-       ret = bch2_copygc_get_buckets(trans, ctxt, buckets_in_flight, &buckets);
+       ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets);
        if (ret)
                goto err;
 
@@ -221,7 +220,7 @@ static int bch2_copygc(struct btree_trans *trans,
                        break;
                }
 
-               ret = __bch2_evacuate_bucket(trans, ctxt, f, f->bucket.k.bucket,
+               ret = __bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket,
                                             f->bucket.k.gen, data_opts);
                if (ret)
                        goto err;
@@ -300,7 +299,6 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c)
 static int bch2_copygc_thread(void *arg)
 {
        struct bch_fs *c = arg;
-       struct btree_trans *trans;
        struct moving_context ctxt;
        struct bch_move_stats move_stats;
        struct io_clock *clock = &c->io_clock[WRITE];
@@ -317,7 +315,6 @@ static int bch2_copygc_thread(void *arg)
        }
 
        set_freezable();
-       trans = bch2_trans_get(c);
 
        bch2_move_stats_init(&move_stats, "copygc");
        bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats,
@@ -325,16 +322,16 @@ static int bch2_copygc_thread(void *arg)
                              false);
 
        while (!ret && !kthread_should_stop()) {
-               bch2_trans_unlock(trans);
+               bch2_trans_unlock(ctxt.trans);
                cond_resched();
 
                if (!c->copy_gc_enabled) {
-                       move_buckets_wait(trans, &ctxt, &buckets, true);
+                       move_buckets_wait(&ctxt, &buckets, true);
                        kthread_wait_freezable(c->copy_gc_enabled);
                }
 
                if (unlikely(freezing(current))) {
-                       move_buckets_wait(trans, &ctxt, &buckets, true);
+                       move_buckets_wait(&ctxt, &buckets, true);
                        __refrigerator(false);
                        continue;
                }
@@ -345,7 +342,7 @@ static int bch2_copygc_thread(void *arg)
                if (wait > clock->max_slop) {
                        c->copygc_wait_at = last;
                        c->copygc_wait = last + wait;
-                       move_buckets_wait(trans, &ctxt, &buckets, true);
+                       move_buckets_wait(&ctxt, &buckets, true);
                        trace_and_count(c, copygc_wait, c, wait, last + wait);
                        bch2_kthread_io_clock_wait(clock, last + wait,
                                        MAX_SCHEDULE_TIMEOUT);
@@ -355,16 +352,16 @@ static int bch2_copygc_thread(void *arg)
                c->copygc_wait = 0;
 
                c->copygc_running = true;
-               ret = bch2_copygc(trans, &ctxt, &buckets);
+               ret = bch2_copygc(&ctxt, &buckets);
                c->copygc_running = false;
 
                wake_up(&c->copygc_running_wq);
        }
 
-       move_buckets_wait(trans, &ctxt, &buckets, true);
+       move_buckets_wait(&ctxt, &buckets, true);
        rhashtable_destroy(&buckets.table);
-       bch2_trans_put(trans);
        bch2_moving_ctxt_exit(&ctxt);
+       bch2_move_stats_exit(&move_stats, c);
 
        return 0;
 }
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index 8294f56e45d5a503821e1bf1d15539f3bee8ae89..b7722b6236978cd692a478cd4ec324ade4456bb4 100644
@@ -294,6 +294,9 @@ int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
                return -EINVAL;
        }
 
+       if (opt->fn.validate)
+               return opt->fn.validate(v, err);
+
        return 0;
 }
 
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 16dd0f0622bcbbbb72d3222810b603a12b3ce5ad..2307cdd2a23cd18ad324d223aefef231ea7cb857 100644
@@ -74,6 +74,7 @@ enum opt_type {
 struct bch_opt_fn {
        int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *);
        void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64);
+       int (*validate)(u64, struct printbuf *);
 };
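
(A hypothetical validate hook, names invented, to show the contract the new member implies: return 0 to accept, or an error after printing a reason:)

    static int example_opt_validate(u64 v, struct printbuf *err)
    {
            if (v > 15) {
                    prt_printf(err, "%llu: too big (max 15)", v);
                    return -EINVAL;
            }
            return 0;
    }
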
 
 /**
diff --git a/libbcachefs/printbuf.c b/libbcachefs/printbuf.c
index de41f9a144920b83fb816182a4c97fecc1df87bf..5e653eb81d54f8fdfcca37038eeaf5a1febdb8e7 100644
@@ -415,11 +415,11 @@ void bch2_prt_bitflags(struct printbuf *out,
        while (list[nr])
                nr++;
 
-       while (flags && (bit = __ffs(flags)) < nr) {
+       while (flags && (bit = __ffs64(flags)) < nr) {
                if (!first)
                        bch2_prt_printf(out, ",");
                first = false;
                bch2_prt_printf(out, "%s", list[bit]);
-               flags ^= 1 << bit;
+               flags ^= BIT_ULL(bit);
        }
 }
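
(The two changes above fix a width bug: flags is a u64, but __ffs() takes an unsigned long, only 32 bits on 32-bit builds, and "1 << bit" is an int shift, undefined for bit >= 31, so flags above bit 31 were not cleared correctly. A standalone illustration:)

    #include <stdint.h>

    static uint64_t clear_bit_broken(uint64_t flags, unsigned bit)
    {
            return flags ^ (1 << bit);      /* int shift: wrong for bit >= 31 */
    }

    static uint64_t clear_bit_fixed(uint64_t flags, unsigned bit)
    {
            return flags ^ (1ULL << bit);   /* what BIT_ULL(bit) expands to */
    }
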
diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c
index 568f1e8e7507e73913ff70c2e1769b9750f3289e..6ba8574b4a69f8ea5f093a65fe9105700aee9678 100644
@@ -3,13 +3,18 @@
 #include "bcachefs.h"
 #include "alloc_foreground.h"
 #include "btree_iter.h"
+#include "btree_update.h"
+#include "btree_write_buffer.h"
 #include "buckets.h"
 #include "clock.h"
 #include "compress.h"
 #include "disk_groups.h"
 #include "errcode.h"
+#include "error.h"
+#include "inode.h"
 #include "move.h"
 #include "rebalance.h"
+#include "subvolume.h"
 #include "super-io.h"
 #include "trace.h"
 
 #include <linux/kthread.h>
 #include <linux/sched/cputime.h>
 
-/*
- * Check if an extent should be moved:
- * returns -1 if it should not be moved, or
- * device of pointer that should be moved, if known, or INT_MAX if unknown
- */
-static bool rebalance_pred(struct bch_fs *c, void *arg,
-                          struct bkey_s_c k,
-                          struct bch_io_opts *io_opts,
-                          struct data_update_opts *data_opts)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       unsigned i;
+#define REBALANCE_WORK_SCAN_OFFSET     (U64_MAX - 1)
 
-       data_opts->rewrite_ptrs         = 0;
-       data_opts->target               = io_opts->background_target;
-       data_opts->extra_replicas       = 0;
-       data_opts->btree_insert_flags   = 0;
-
-       if (io_opts->background_compression &&
-           !bch2_bkey_is_incompressible(k)) {
-               const union bch_extent_entry *entry;
-               struct extent_ptr_decoded p;
-
-               i = 0;
-               bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-                       if (!p.ptr.cached &&
-                           p.crc.compression_type !=
-                           bch2_compression_opt_to_type(io_opts->background_compression))
-                               data_opts->rewrite_ptrs |= 1U << i;
-                       i++;
-               }
-       }
-
-       if (io_opts->background_target) {
-               const struct bch_extent_ptr *ptr;
+static const char * const bch2_rebalance_state_strs[] = {
+#define x(t) #t,
+       BCH_REBALANCE_STATES()
+       NULL
+#undef x
+};
 
-               i = 0;
-               bkey_for_each_ptr(ptrs, ptr) {
-                       if (!ptr->cached &&
-                           !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) &&
-                           bch2_target_accepts_data(c, BCH_DATA_user, io_opts->background_target))
-                               data_opts->rewrite_ptrs |= 1U << i;
-                       i++;
-               }
-       }
+static int __bch2_set_rebalance_needs_scan(struct btree_trans *trans, u64 inum)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_i_cookie *cookie;
+       u64 v;
+       int ret;
 
-       return data_opts->rewrite_ptrs != 0;
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
+                            SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
+                            BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       v = k.k->type == KEY_TYPE_cookie
+               ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
+               : 0;
+
+       cookie = bch2_trans_kmalloc(trans, sizeof(*cookie));
+       ret = PTR_ERR_OR_ZERO(cookie);
+       if (ret)
+               goto err;
+
+       bkey_cookie_init(&cookie->k_i);
+       cookie->k.p = iter.pos;
+       cookie->v.cookie = cpu_to_le64(v + 1);
+
+       ret = bch2_trans_update(trans, &iter, &cookie->k_i, 0);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
 }
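
(The scan trigger is a KEY_TYPE_cookie at (inum, REBALANCE_WORK_SCAN_OFFSET) whose value is bumped on every request; the clear path below deletes the key only if the cookie still matches the value sampled when the scan started, so a request racing in mid-scan keeps the key alive. A standalone sketch of that protocol:)

    #include <stdbool.h>
    #include <stdint.h>

    struct scan_request { uint64_t cookie; bool present; };

    /* __bch2_set_rebalance_needs_scan() analogue */
    static uint64_t request_scan(struct scan_request *r)
    {
            r->present = true;
            return ++r->cookie;
    }

    /* bch2_clear_rebalance_needs_scan() analogue: drop the request
     * only if no new request arrived since "seen" was sampled */
    static void scan_done(struct scan_request *r, uint64_t seen)
    {
            if (r->present && r->cookie == seen)
                    r->present = false;
    }
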
 
-void bch2_rebalance_add_key(struct bch_fs *c,
-                           struct bkey_s_c k,
-                           struct bch_io_opts *io_opts)
+int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
 {
-       struct data_update_opts update_opts = { 0 };
-       struct bkey_ptrs_c ptrs;
-       const struct bch_extent_ptr *ptr;
-       unsigned i;
+       int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+                           __bch2_set_rebalance_needs_scan(trans, inum));
+       rebalance_wakeup(c);
+       return ret;
+}
 
-       if (!rebalance_pred(c, NULL, k, io_opts, &update_opts))
-               return;
-
-       i = 0;
-       ptrs = bch2_bkey_ptrs_c(k);
-       bkey_for_each_ptr(ptrs, ptr) {
-               if ((1U << i) && update_opts.rewrite_ptrs)
-                       if (atomic64_add_return(k.k->size,
-                                       &bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) ==
-                           k.k->size)
-                               rebalance_wakeup(c);
-               i++;
-       }
+int bch2_set_fs_needs_rebalance(struct bch_fs *c)
+{
+       return bch2_set_rebalance_needs_scan(c, 0);
 }
 
-void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors)
+static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, u64 cookie)
 {
-       if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) ==
-           sectors)
-               rebalance_wakeup(c);
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u64 v;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work,
+                            SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX),
+                            BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       v = k.k->type == KEY_TYPE_cookie
+               ? le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie)
+               : 0;
+
+       if (v == cookie)
+               ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
 }
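
The set/clear pair above implements a versioned scan request: every request bumps the cookie, and a completed scan deletes the cookie only if it still holds the value snapshotted when the scan began, so a request that arrives mid-scan is never lost. A minimal standalone sketch of the same pattern in C11 atomics (names and types here are illustrative, not bcachefs API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static _Atomic uint64_t scan_cookie;    /* 0 means "no scan requested" */

/* analogous to __bch2_set_rebalance_needs_scan(): bump, return new version */
static uint64_t request_scan(void)
{
        return atomic_fetch_add(&scan_cookie, 1) + 1;
}

/* analogous to bch2_clear_rebalance_needs_scan(): clear only if no newer
 * request arrived while the scan was running */
static bool clear_scan(uint64_t seen)
{
        uint64_t expected = seen;
        return atomic_compare_exchange_strong(&scan_cookie, &expected, 0);
}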
 
-struct rebalance_work {
-       int             dev_most_full_idx;
-       unsigned        dev_most_full_percent;
-       u64             dev_most_full_work;
-       u64             dev_most_full_capacity;
-       u64             total_work;
-};
+static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans,
+                                           struct btree_iter *work_iter)
+{
+       return !kthread_should_stop()
+               ? bch2_btree_iter_peek(work_iter)
+               : bkey_s_c_null;
+}
 
-static void rebalance_work_accumulate(struct rebalance_work *w,
-               u64 dev_work, u64 unknown_dev, u64 capacity, int idx)
+static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
+                                          struct btree_iter *iter,
+                                          struct bkey_s_c k)
 {
-       unsigned percent_full;
-       u64 work = dev_work + unknown_dev;
+       struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0);
+       int ret = PTR_ERR_OR_ZERO(n);
+       if (ret)
+               return ret;
 
-       /* avoid divide by 0 */
-       if (!capacity)
-               return;
+       extent_entry_drop(bkey_i_to_s(n),
+                         (void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
+       return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+}
+
+static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
+                       struct bpos work_pos,
+                       struct btree_iter *extent_iter,
+                       struct data_update_opts *data_opts)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_s_c k;
+
+       bch2_trans_iter_exit(trans, extent_iter);
+       bch2_trans_iter_init(trans, extent_iter,
+                            work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink,
+                            work_pos,
+                            BTREE_ITER_ALL_SNAPSHOTS);
+       k = bch2_btree_iter_peek_slot(extent_iter);
+       if (bkey_err(k))
+               return k;
+
+       const struct bch_extent_rebalance *r = k.k ? bch2_bkey_rebalance_opts(k) : NULL;
+       if (!r) {
+               /* stale entry (btree write buffer race), nothing to do */
+               return bkey_s_c_null;
+       }
 
-       if (work < dev_work || work < unknown_dev)
-               work = U64_MAX;
-       work = min(work, capacity);
+       memset(data_opts, 0, sizeof(*data_opts));
 
-       percent_full = div64_u64(work * 100, capacity);
+       data_opts->rewrite_ptrs         =
+               bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression);
+       data_opts->target               = r->target;
 
-       if (percent_full >= w->dev_most_full_percent) {
-               w->dev_most_full_idx            = idx;
-               w->dev_most_full_percent        = percent_full;
-               w->dev_most_full_work           = work;
-               w->dev_most_full_capacity       = capacity;
+       if (!data_opts->rewrite_ptrs) {
+               /*
+                * Is the device we would want to write to offline? Have the
+                * devices in the target changed?
+                *
+                * We'll now need a full scan before this extent is picked up
+                * again:
+                */
+               int ret = bch2_bkey_clear_needs_rebalance(trans, extent_iter, k);
+               if (ret)
+                       return bkey_s_c_err(ret);
+               return bkey_s_c_null;
        }
 
-       if (w->total_work + dev_work >= w->total_work &&
-           w->total_work + dev_work >= dev_work)
-               w->total_work += dev_work;
+       return k;
 }
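
For orientation, the BTREE_ID_rebalance_work keyspace that next_rebalance_extent() resolves appears to be laid out as follows (inferred from the code above and from REBALANCE_WORK_SCAN_OFFSET; a reading aid, not authoritative documentation):

/*
 * BTREE_ID_rebalance_work key positions, as implied by this patch:
 *
 *   (0,    offset)         extent in BTREE_ID_reflink needing rebalance
 *   (inum, offset)         extent in BTREE_ID_extents needing rebalance
 *   (inum, U64_MAX - 1)    KEY_TYPE_cookie: scan of inum requested
 *                          (inum == 0 requests a whole-filesystem scan)
 */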
 
-static struct rebalance_work rebalance_work(struct bch_fs *c)
+noinline_for_stack
+static int do_rebalance_extent(struct moving_context *ctxt,
+                              struct bpos work_pos,
+                              struct btree_iter *extent_iter)
 {
-       struct bch_dev *ca;
-       struct rebalance_work ret = { .dev_most_full_idx = -1 };
-       u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev);
-       unsigned i;
+       struct btree_trans *trans = ctxt->trans;
+       struct bch_fs *c = trans->c;
+       struct bch_fs_rebalance *r = &trans->c->rebalance;
+       struct data_update_opts data_opts;
+       struct bch_io_opts io_opts;
+       struct bkey_s_c k;
+       struct bkey_buf sk;
+       int ret;
+
+       ctxt->stats = &r->work_stats;
+       r->state = BCH_REBALANCE_working;
 
-       for_each_online_member(ca, c, i)
-               rebalance_work_accumulate(&ret,
-                       atomic64_read(&ca->rebalance_work),
-                       unknown_dev,
-                       bucket_to_sector(ca, ca->mi.nbuckets -
-                                        ca->mi.first_bucket),
-                       i);
+       bch2_bkey_buf_init(&sk);
 
-       rebalance_work_accumulate(&ret,
-               unknown_dev, 0, c->capacity, -1);
+       ret = bkey_err(k = next_rebalance_extent(trans, work_pos,
+                                                extent_iter, &data_opts));
+       if (ret || !k.k)
+               goto out;
 
+       ret = bch2_move_get_io_opts_one(trans, &io_opts, k);
+       if (ret)
+               goto out;
+
+       atomic64_add(k.k->size, &ctxt->stats->sectors_seen);
+
+       /*
+        * The iterator gets unlocked by __bch2_read_extent - need to
+        * save a copy of @k elsewhere:
+        */
+       bch2_bkey_buf_reassemble(&sk, c, k);
+       k = bkey_i_to_s_c(sk.k);
+
+       ret = bch2_move_extent(ctxt, NULL, extent_iter, k, io_opts, data_opts);
+       if (ret) {
+               if (bch2_err_matches(ret, ENOMEM)) {
+                       /* memory allocation failure, wait for some IO to finish */
+                       bch2_move_ctxt_wait_for_io(ctxt);
+                       ret = -BCH_ERR_transaction_restart_nested;
+               }
+
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                       goto out;
+
+               /* skip it and continue, XXX signal failure */
+               ret = 0;
+       }
+out:
+       bch2_bkey_buf_exit(&sk, c);
        return ret;
 }
 
-static void rebalance_work_reset(struct bch_fs *c)
+static bool rebalance_pred(struct bch_fs *c, void *arg,
+                          struct bkey_s_c k,
+                          struct bch_io_opts *io_opts,
+                          struct data_update_opts *data_opts)
 {
-       struct bch_dev *ca;
-       unsigned i;
+       unsigned target, compression;
 
-       for_each_online_member(ca, c, i)
-               atomic64_set(&ca->rebalance_work, 0);
+       if (k.k->p.inode) {
+               target          = io_opts->background_target;
+               compression     = io_opts->background_compression ?: io_opts->compression;
+       } else {
+               const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
+
+               target          = r ? r->target : io_opts->background_target;
+               compression     = r ? r->compression :
+                       (io_opts->background_compression ?: io_opts->compression);
+       }
 
-       atomic64_set(&c->rebalance.work_unknown_dev, 0);
+       data_opts->rewrite_ptrs         = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
+       data_opts->target               = target;
+       return data_opts->rewrite_ptrs != 0;
 }
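
Note the GNU ?: extension in the compression fallback: a ?: b evaluates a once and yields it if nonzero, otherwise b. Longhand equivalent, as a self-contained helper (hypothetical name):

/* longhand for: compression = background_compression ?: compression */
static unsigned pick_compression(unsigned background_compression,
                                 unsigned compression)
{
        return background_compression ? background_compression : compression;
}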
 
-static unsigned long curr_cputime(void)
+static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
 {
-       u64 utime, stime;
+       struct btree_trans *trans = ctxt->trans;
+       struct bch_fs_rebalance *r = &trans->c->rebalance;
+       int ret;
+
+       bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
+       ctxt->stats = &r->scan_stats;
 
-       task_cputime_adjusted(current, &utime, &stime);
-       return nsecs_to_jiffies(utime + stime);
+       if (!inum) {
+               r->scan_start   = BBPOS_MIN;
+               r->scan_end     = BBPOS_MAX;
+       } else {
+               r->scan_start   = BBPOS(BTREE_ID_extents, POS(inum, 0));
+               r->scan_end     = BBPOS(BTREE_ID_extents, POS(inum, U64_MAX));
+       }
+
+       r->state = BCH_REBALANCE_scanning;
+
+       ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
+               commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+                         bch2_clear_rebalance_needs_scan(trans, inum, cookie));
+
+       bch2_move_stats_exit(&r->scan_stats, trans->c);
+       return ret;
 }
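
As a concrete example of the scan bounds set above (the inode number is hypothetical):

/*
 * Scan request for inum 4096:
 *   scan_start = BBPOS(BTREE_ID_extents, POS(4096, 0))
 *   scan_end   = BBPOS(BTREE_ID_extents, POS(4096, U64_MAX))
 *
 * Filesystem-wide request (inum == 0):
 *   scan_start = BBPOS_MIN, scan_end = BBPOS_MAX, i.e. every btree.
 */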
 
-static int bch2_rebalance_thread(void *arg)
+static void rebalance_wait(struct bch_fs *c)
 {
-       struct bch_fs *c = arg;
        struct bch_fs_rebalance *r = &c->rebalance;
+       struct bch_dev *ca;
        struct io_clock *clock = &c->io_clock[WRITE];
-       struct rebalance_work w, p;
-       struct bch_move_stats move_stats;
-       unsigned long start, prev_start;
-       unsigned long prev_run_time, prev_run_cputime;
-       unsigned long cputime, prev_cputime;
-       u64 io_start;
-       long throttle;
+       u64 now = atomic64_read(&clock->now);
+       u64 min_member_capacity = 128 * 2048;
+       unsigned i;
 
-       set_freezable();
+       for_each_rw_member(ca, c, i)
+               min_member_capacity = min(min_member_capacity,
+                                         ca->mi.nbuckets * ca->mi.bucket_size);
+
+       r->wait_iotime_end              = now + (min_member_capacity >> 6);
+
+       if (r->state != BCH_REBALANCE_waiting) {
+               r->wait_iotime_start    = now;
+               r->wait_wallclock_start = ktime_get_real_ns();
+               r->state                = BCH_REBALANCE_waiting;
+       }
 
-       io_start        = atomic64_read(&clock->now);
-       p               = rebalance_work(c);
-       prev_start      = jiffies;
-       prev_cputime    = curr_cputime();
+       bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT);
+}
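
Worked numbers for the wait computed above: min_member_capacity starts at 128 * 2048 sectors (128 MiB) and only shrinks in the min() loop, so for any rw member of at least 128 MiB:

/*
 *   wait_iotime_end - now = (128 * 2048) >> 6
 *                         = 4096 sectors = 2 MiB of write io-clock advance
 *
 * Smaller members shorten the wait proportionally (capacity / 64), and the
 * initializer doubles as the fallback when there are no rw members.
 */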
 
-       bch2_move_stats_init(&move_stats, "rebalance");
-       while (!kthread_wait_freezable(r->enabled)) {
-               cond_resched();
+static int do_rebalance(struct moving_context *ctxt)
+{
+       struct btree_trans *trans = ctxt->trans;
+       struct bch_fs *c = trans->c;
+       struct bch_fs_rebalance *r = &c->rebalance;
+       struct btree_iter rebalance_work_iter, extent_iter = { NULL };
+       struct bkey_s_c k;
+       int ret = 0;
 
-               start                   = jiffies;
-               cputime                 = curr_cputime();
+       bch2_move_stats_init(&r->work_stats, "rebalance_work");
+       bch2_move_stats_init(&r->scan_stats, "rebalance_scan");
 
-               prev_run_time           = start - prev_start;
-               prev_run_cputime        = cputime - prev_cputime;
+       bch2_trans_iter_init(trans, &rebalance_work_iter,
+                            BTREE_ID_rebalance_work, POS_MIN,
+                            BTREE_ITER_ALL_SNAPSHOTS);
 
-               w                       = rebalance_work(c);
-               BUG_ON(!w.dev_most_full_capacity);
+       while (!bch2_move_ratelimit(ctxt) &&
+              !kthread_wait_freezable(r->enabled)) {
+               bch2_trans_begin(trans);
 
-               if (!w.total_work) {
-                       r->state = REBALANCE_WAITING;
-                       kthread_wait_freezable(rebalance_work(c).total_work);
+               ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
                        continue;
-               }
+               if (ret || !k.k)
+                       break;
 
-               /*
-                * If there isn't much work to do, throttle cpu usage:
-                */
-               throttle = prev_run_cputime * 100 /
-                       max(1U, w.dev_most_full_percent) -
-                       prev_run_time;
-
-               if (w.dev_most_full_percent < 20 && throttle > 0) {
-                       r->throttled_until_iotime = io_start +
-                               div_u64(w.dev_most_full_capacity *
-                                       (20 - w.dev_most_full_percent),
-                                       50);
-
-                       if (atomic64_read(&clock->now) + clock->max_slop <
-                           r->throttled_until_iotime) {
-                               r->throttled_until_cputime = start + throttle;
-                               r->state = REBALANCE_THROTTLED;
-
-                               bch2_kthread_io_clock_wait(clock,
-                                       r->throttled_until_iotime,
-                                       throttle);
-                               continue;
-                       }
-               }
+               ret = k.k->type == KEY_TYPE_cookie
+                       ? do_rebalance_scan(ctxt, k.k->p.inode,
+                                           le64_to_cpu(bkey_s_c_to_cookie(k).v->cookie))
+                       : do_rebalance_extent(ctxt, k.k->p, &extent_iter);
+
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+                       continue;
+               if (ret)
+                       break;
 
-               /* minimum 1 mb/sec: */
-               r->pd.rate.rate =
-                       max_t(u64, 1 << 11,
-                             r->pd.rate.rate *
-                             max(p.dev_most_full_percent, 1U) /
-                             max(w.dev_most_full_percent, 1U));
-
-               io_start        = atomic64_read(&clock->now);
-               p               = w;
-               prev_start      = start;
-               prev_cputime    = cputime;
-
-               r->state = REBALANCE_RUNNING;
-               memset(&move_stats, 0, sizeof(move_stats));
-               rebalance_work_reset(c);
-
-               bch2_move_data(c,
-                              0,               POS_MIN,
-                              BTREE_ID_NR,     POS_MAX,
-                              /* ratelimiting disabled for now */
-                              NULL, /*  &r->pd.rate, */
-                              &move_stats,
-                              writepoint_ptr(&c->rebalance_write_point),
-                              true,
-                              rebalance_pred, NULL);
+               bch2_btree_iter_advance(&rebalance_work_iter);
        }
 
-       return 0;
+       bch2_trans_iter_exit(trans, &extent_iter);
+       bch2_trans_iter_exit(trans, &rebalance_work_iter);
+       bch2_move_stats_exit(&r->scan_stats, c);
+
+       if (!ret &&
+           !kthread_should_stop() &&
+           !atomic64_read(&r->work_stats.sectors_seen) &&
+           !atomic64_read(&r->scan_stats.sectors_seen)) {
+               bch2_trans_unlock(trans);
+               rebalance_wait(c);
+       }
+
+       bch_err_fn(c, ret);
+       return ret;
 }
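
Summarizing the dispatch in the loop above (a reading aid for the two entry types found in rebalance_work):

/*
 * do_rebalance(), per rebalance_work entry:
 *
 *   KEY_TYPE_cookie at (inum, U64_MAX - 1)  ->  do_rebalance_scan()
 *   any other key                           ->  do_rebalance_extent()
 *
 * If a full pass sees no sectors in either stats counter, the thread parks in
 * rebalance_wait() until the write io-clock advances far enough.
 */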
 
-void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
+static int bch2_rebalance_thread(void *arg)
 {
+       struct bch_fs *c = arg;
        struct bch_fs_rebalance *r = &c->rebalance;
-       struct rebalance_work w = rebalance_work(c);
+       struct moving_context ctxt;
+       int ret;
 
-       if (!out->nr_tabstops)
-               printbuf_tabstop_push(out, 20);
+       set_freezable();
 
-       prt_printf(out, "fullest_dev (%i):", w.dev_most_full_idx);
-       prt_tab(out);
+       bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats,
+                             writepoint_ptr(&c->rebalance_write_point),
+                             true);
 
-       prt_human_readable_u64(out, w.dev_most_full_work << 9);
-       prt_printf(out, "/");
-       prt_human_readable_u64(out, w.dev_most_full_capacity << 9);
-       prt_newline(out);
+       while (!kthread_should_stop() &&
+              !(ret = do_rebalance(&ctxt)))
+               ;
 
-       prt_printf(out, "total work:");
-       prt_tab(out);
+       bch2_moving_ctxt_exit(&ctxt);
 
-       prt_human_readable_u64(out, w.total_work << 9);
-       prt_printf(out, "/");
-       prt_human_readable_u64(out, c->capacity << 9);
-       prt_newline(out);
+       return 0;
+}
+
+void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
+{
+       struct bch_fs_rebalance *r = &c->rebalance;
 
-       prt_printf(out, "rate:");
-       prt_tab(out);
-       prt_printf(out, "%u", r->pd.rate.rate);
+       prt_str(out, bch2_rebalance_state_strs[r->state]);
        prt_newline(out);
+       printbuf_indent_add(out, 2);
 
        switch (r->state) {
-       case REBALANCE_WAITING:
-               prt_printf(out, "waiting");
+       case BCH_REBALANCE_waiting: {
+               u64 now = atomic64_read(&c->io_clock[WRITE].now);
+
+               prt_str(out, "io wait duration:  ");
+               bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start);
+               prt_newline(out);
+
+               prt_str(out, "io wait remaining: ");
+               bch2_prt_human_readable_s64(out, r->wait_iotime_end - now);
+               prt_newline(out);
+
+               prt_str(out, "duration waited:   ");
+               bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start);
+               prt_newline(out);
                break;
-       case REBALANCE_THROTTLED:
-               prt_printf(out, "throttled for %lu sec or ",
-                      (r->throttled_until_cputime - jiffies) / HZ);
-               prt_human_readable_u64(out,
-                           (r->throttled_until_iotime -
-                            atomic64_read(&c->io_clock[WRITE].now)) << 9);
-               prt_printf(out, " io");
+       }
+       case BCH_REBALANCE_working:
+               bch2_move_stats_to_text(out, &r->work_stats);
                break;
-       case REBALANCE_RUNNING:
-               prt_printf(out, "running");
+       case BCH_REBALANCE_scanning:
+               bch2_move_stats_to_text(out, &r->scan_stats);
                break;
        }
        prt_newline(out);
+       printbuf_indent_sub(out, 2);
 }
 
 void bch2_rebalance_stop(struct bch_fs *c)
@@ -361,6 +462,4 @@ int bch2_rebalance_start(struct bch_fs *c)
 void bch2_fs_rebalance_init(struct bch_fs *c)
 {
        bch2_pd_controller_init(&c->rebalance.pd);
-
-       atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX);
 }
index 7ade0bb81cce8d1ac0a12819a44f3eb2d2e8f79d..28a52638f16cc113848cf3925758e12d510dc247 100644 (file)
@@ -4,6 +4,9 @@
 
 #include "rebalance_types.h"
 
+int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum);
+int bch2_set_fs_needs_rebalance(struct bch_fs *);
+
 static inline void rebalance_wakeup(struct bch_fs *c)
 {
        struct task_struct *p;
@@ -15,11 +18,7 @@ static inline void rebalance_wakeup(struct bch_fs *c)
        rcu_read_unlock();
 }
 
-void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c,
-                           struct bch_io_opts *);
-void bch2_rebalance_add_work(struct bch_fs *, u64);
-
-void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *);
+void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *);
 
 void bch2_rebalance_stop(struct bch_fs *);
 int bch2_rebalance_start(struct bch_fs *);
index 7462a92e95985d91cdc454485d045659240dd0fc..0fffb536c1d0c1b65d1a2a68730cab49f7535db2 100644 (file)
@@ -2,25 +2,36 @@
 #ifndef _BCACHEFS_REBALANCE_TYPES_H
 #define _BCACHEFS_REBALANCE_TYPES_H
 
+#include "bbpos_types.h"
 #include "move_types.h"
 
-enum rebalance_state {
-       REBALANCE_WAITING,
-       REBALANCE_THROTTLED,
-       REBALANCE_RUNNING,
+#define BCH_REBALANCE_STATES()         \
+       x(waiting)                      \
+       x(working)                      \
+       x(scanning)
+
+enum bch_rebalance_states {
+#define x(t)   BCH_REBALANCE_##t,
+       BCH_REBALANCE_STATES()
+#undef x
 };
 
 struct bch_fs_rebalance {
-       struct task_struct __rcu *thread;
+       struct task_struct __rcu        *thread;
        struct bch_pd_controller pd;
 
-       atomic64_t              work_unknown_dev;
+       enum bch_rebalance_states       state;
+       u64                             wait_iotime_start;
+       u64                             wait_iotime_end;
+       u64                             wait_wallclock_start;
+
+       struct bch_move_stats           work_stats;
 
-       enum rebalance_state    state;
-       u64                     throttled_until_iotime;
-       unsigned long           throttled_until_cputime;
+       struct bbpos                    scan_start;
+       struct bbpos                    scan_end;
+       struct bch_move_stats           scan_stats;
 
-       unsigned                enabled:1;
+       unsigned                        enabled:1;
 };
 
 #endif /* _BCACHEFS_REBALANCE_TYPES_H */
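
The BCH_REBALANCE_STATES() x-macro above is expanded twice, once for the enum here and once for the name table in rebalance.c; a sketch of both expansions:

/* enum expansion (rebalance_types.h) */
enum bch_rebalance_states {
        BCH_REBALANCE_waiting,
        BCH_REBALANCE_working,
        BCH_REBALANCE_scanning,
};

/* string-table expansion (rebalance.c), NULL-terminated */
static const char * const bch2_rebalance_state_strs[] = {
        "waiting", "working", "scanning", NULL
};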
index 55663253c9d3359615d4668173d09fea7bd182e7..02025099c38fc5e2b284d4fff065c20622611711 100644 (file)
@@ -23,6 +23,7 @@
 #include "logged_ops.h"
 #include "move.h"
 #include "quota.h"
+#include "rebalance.h"
 #include "recovery.h"
 #include "replicas.h"
 #include "sb-clean.h"
@@ -946,16 +947,12 @@ int bch2_fs_initialize(struct bch_fs *c)
        for (i = 0; i < BTREE_ID_NR; i++)
                bch2_btree_root_alloc(c, i);
 
-       for_each_online_member(ca, c, i)
+       for_each_member_device(ca, c, i)
                bch2_dev_usage_init(ca);
 
-       for_each_online_member(ca, c, i) {
-               ret = bch2_dev_journal_alloc(ca);
-               if (ret) {
-                       percpu_ref_put(&ca->io_ref);
-                       goto err;
-               }
-       }
+       ret = bch2_fs_journal_alloc(c);
+       if (ret)
+               goto err;
 
        /*
         * journal_res_get() will crash if called before this has
@@ -973,15 +970,13 @@ int bch2_fs_initialize(struct bch_fs *c)
         * btree updates
         */
        bch_verbose(c, "marking superblocks");
-       for_each_member_device(ca, c, i) {
-               ret = bch2_trans_mark_dev_sb(c, ca);
-               if (ret) {
-                       percpu_ref_put(&ca->ref);
-                       goto err;
-               }
+       ret = bch2_trans_mark_dev_sbs(c);
+       bch_err_msg(c, ret, "marking superblocks");
+       if (ret)
+               goto err;
 
+       for_each_online_member(ca, c, i)
                ca->new_fs_bucket_idx = 0;
-       }
 
        ret = bch2_fs_freespace_init(c);
        if (ret)
index 4c1cea2a601d2f2ae0d2c9e8746a6ed763ce52e5..515e3d62c2ac9ec481694985ddaa7b1722760d6f 100644 (file)
@@ -14,6 +14,8 @@
        x(snapshots_read,               PASS_ALWAYS)                                            \
        x(check_topology,               0)                                                      \
        x(check_allocations,            PASS_FSCK)                                              \
+       x(trans_mark_dev_sbs,           PASS_ALWAYS|PASS_SILENT)                                \
+       x(fs_journal_alloc,             PASS_ALWAYS|PASS_SILENT)                                \
        x(set_may_go_rw,                PASS_ALWAYS|PASS_SILENT)                                \
        x(journal_replay,               PASS_ALWAYS)                                            \
        x(check_alloc_info,             PASS_FSCK)                                              \
@@ -32,6 +34,7 @@
        x(resume_logged_ops,            PASS_ALWAYS)                                            \
        x(check_inodes,                 PASS_FSCK)                                              \
        x(check_extents,                PASS_FSCK)                                              \
+       x(check_indirect_extents,       PASS_FSCK)                                              \
        x(check_dirents,                PASS_FSCK)                                              \
        x(check_xattrs,                 PASS_FSCK)                                              \
        x(check_root,                   PASS_FSCK)                                              \
@@ -39,6 +42,7 @@
        x(check_nlinks,                 PASS_FSCK)                                              \
        x(delete_dead_inodes,           PASS_FSCK|PASS_UNCLEAN)                                 \
        x(fix_reflink_p,                0)                                                      \
+       x(set_fs_needs_rebalance,       0)                                                      \
 
 enum bch_recovery_pass {
 #define x(n, when)     BCH_RECOVERY_PASS_##n,
index d77d0ea9affffe14b71a7a5a377a38c1a4143672..dbbdf1955f7664ea43b74f8e902eb5de9dd48209 100644 (file)
@@ -7,6 +7,7 @@
 #include "inode.h"
 #include "io_misc.h"
 #include "io_write.h"
+#include "rebalance.h"
 #include "reflink.h"
 #include "subvolume.h"
 #include "super-io.h"
@@ -103,21 +104,22 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
 }
 #endif
 
+static inline void check_indirect_extent_deleting(struct bkey_i *new, unsigned *flags)
+{
+       if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) {
+               new->k.type = KEY_TYPE_deleted;
+               new->k.size = 0;
+               set_bkey_val_u64s(&new->k, 0);
+               *flags &= ~BTREE_TRIGGER_INSERT;
+       }
+}
+
 int bch2_trans_mark_reflink_v(struct btree_trans *trans,
                              enum btree_id btree_id, unsigned level,
                              struct bkey_s_c old, struct bkey_i *new,
                              unsigned flags)
 {
-       if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
-               struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new);
-
-               if (!r->v.refcount) {
-                       r->k.type = KEY_TYPE_deleted;
-                       r->k.size = 0;
-                       set_bkey_val_u64s(&r->k, 0);
-                       return 0;
-               }
-       }
+       check_indirect_extent_deleting(new, &flags);
 
        return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags);
 }
@@ -132,7 +134,7 @@ int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k,
 }
 
 void bch2_indirect_inline_data_to_text(struct printbuf *out,
-                                       struct bch_fs *c, struct bkey_s_c k)
+                                      struct bch_fs *c, struct bkey_s_c k)
 {
        struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k);
        unsigned datalen = bkey_inline_data_bytes(k.k);
@@ -147,16 +149,7 @@ int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans,
                              struct bkey_s_c old, struct bkey_i *new,
                              unsigned flags)
 {
-       if (!(flags & BTREE_TRIGGER_OVERWRITE)) {
-               struct bkey_i_indirect_inline_data *r =
-                       bkey_i_to_indirect_inline_data(new);
-
-               if (!r->v.refcount) {
-                       r->k.type = KEY_TYPE_deleted;
-                       r->k.size = 0;
-                       set_bkey_val_u64s(&r->k, 0);
-               }
-       }
+       check_indirect_extent_deleting(new, &flags);
 
        return 0;
 }
@@ -260,6 +253,7 @@ s64 bch2_remap_range(struct bch_fs *c,
        struct bpos dst_start = POS(dst_inum.inum, dst_offset);
        struct bpos src_start = POS(src_inum.inum, src_offset);
        struct bpos dst_end = dst_start, src_end = src_start;
+       struct bch_io_opts opts;
        struct bpos src_want;
        u64 dst_done;
        u32 dst_snapshot, src_snapshot;
@@ -277,6 +271,10 @@ s64 bch2_remap_range(struct bch_fs *c,
        bch2_bkey_buf_init(&new_src);
        trans = bch2_trans_get(c);
 
+       ret = bch2_inum_opts_get(trans, src_inum, &opts);
+       if (ret)
+               goto err;
+
        bch2_trans_iter_init(trans, &src_iter, BTREE_ID_extents, src_start,
                             BTREE_ITER_INTENT);
        bch2_trans_iter_init(trans, &dst_iter, BTREE_ID_extents, dst_start,
@@ -360,10 +358,13 @@ s64 bch2_remap_range(struct bch_fs *c,
                                min(src_k.k->p.offset - src_want.offset,
                                    dst_end.offset - dst_iter.pos.offset));
 
-               ret = bch2_extent_update(trans, dst_inum, &dst_iter,
-                                        new_dst.k, &disk_res,
-                                        new_i_size, i_sectors_delta,
-                                        true);
+               ret =   bch2_bkey_set_needs_rebalance(c, new_dst.k,
+                                       opts.background_target,
+                                       opts.background_compression) ?:
+                       bch2_extent_update(trans, dst_inum, &dst_iter,
+                                       new_dst.k, &disk_res,
+                                       new_i_size, i_sectors_delta,
+                                       true);
                bch2_disk_reservation_put(c, &disk_res);
        }
        bch2_trans_iter_exit(trans, &dst_iter);
@@ -394,7 +395,7 @@ s64 bch2_remap_range(struct bch_fs *c,
 
                bch2_trans_iter_exit(trans, &inode_iter);
        } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart));
-
+err:
        bch2_trans_put(trans);
        bch2_bkey_buf_exit(&new_src, c);
        bch2_bkey_buf_exit(&new_dst, c);
index 0e85c22672be85fb09a1ce51ed457d65ab1d2d5c..ce59018b27acc92d70b75ba68ba6e2b1c4db643a 100644 (file)
@@ -948,9 +948,6 @@ int bch2_fs_start(struct bch_fs *c)
                goto err;
        }
 
-       for_each_online_member(ca, c, i)
-               bch2_sb_from_fs(c, ca);
-
        for_each_online_member(ca, c, i)
                bch2_members_v2_get_mut(c->disk_sb.sb, i)->last_mount = cpu_to_le64(now);
 
@@ -960,12 +957,6 @@ int bch2_fs_start(struct bch_fs *c)
                bch2_dev_allocator_add(c, ca);
        bch2_recalc_capacity(c);
 
-       for (i = 0; i < BCH_TRANSACTIONS_NR; i++) {
-               mutex_lock(&c->btree_transaction_stats[i].lock);
-               bch2_time_stats_init(&c->btree_transaction_stats[i].lock_hold_times);
-               mutex_unlock(&c->btree_transaction_stats[i].lock);
-       }
-
        ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
                ? bch2_fs_recovery(c)
                : bch2_fs_initialize(c);
@@ -1591,7 +1582,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
        dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
 
        if (BCH_MEMBER_GROUP(&dev_mi)) {
-               bch2_disk_path_to_text(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
+               bch2_disk_path_to_text_sb(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1);
                if (label.allocation_failure) {
                        ret = -ENOMEM;
                        goto err;
@@ -1689,13 +1680,13 @@ have_slot:
 
        ret = bch2_trans_mark_dev_sb(c, ca);
        if (ret) {
-               bch_err_msg(c, ret, "marking new superblock");
+               bch_err_msg(ca, ret, "marking new superblock");
                goto err_late;
        }
 
        ret = bch2_fs_freespace_init(c);
        if (ret) {
-               bch_err_msg(c, ret, "initializing free space");
+               bch_err_msg(ca, ret, "initializing free space");
                goto err_late;
        }
 
@@ -1763,19 +1754,26 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
        if (ca->mi.state == BCH_MEMBER_STATE_rw)
                __bch2_dev_read_write(c, ca);
 
-       mutex_lock(&c->sb_lock);
-       struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+       if (!ca->mi.freespace_initialized) {
+               ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets);
+               bch_err_msg(ca, ret, "initializing free space");
+               if (ret)
+                       goto err;
+       }
 
-       m->last_mount =
-               cpu_to_le64(ktime_get_real_seconds());
+       if (!ca->journal.nr) {
+               ret = bch2_dev_journal_alloc(ca);
+               bch_err_msg(ca, ret, "allocating journal");
+               if (ret)
+                       goto err;
+       }
 
+       mutex_lock(&c->sb_lock);
+       bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount =
+               cpu_to_le64(ktime_get_real_seconds());
        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
 
-       ret = bch2_fs_freespace_init(c);
-       if (ret)
-               bch_err_msg(c, ret, "initializing free space");
-
        up_write(&c->state_lock);
        return 0;
 err:
index 78d6138db62d7e4a1f1fe07bb043ca7ada417986..7dda4985b99fe6cfdde52c6df869e3df446d48d0 100644 (file)
@@ -37,16 +37,4 @@ struct bch_member_cpu {
        u8                      valid;
 };
 
-struct bch_disk_group_cpu {
-       bool                            deleted;
-       u16                             parent;
-       struct bch_devs_mask            devs;
-};
-
-struct bch_disk_groups_cpu {
-       struct rcu_head                 rcu;
-       unsigned                        nr;
-       struct bch_disk_group_cpu       entries[] __counted_by(nr);
-};
-
 #endif /* _BCACHEFS_SUPER_TYPES_H */
index 89544dadcfd0c6c8e55a3bf65f1996627d3ce1a1..db2727e5cc5feec1a9c4d9a4c53a689bd8a91201 100644 (file)
@@ -212,7 +212,7 @@ read_attribute(copy_gc_wait);
 
 rw_attribute(rebalance_enabled);
 sysfs_pd_controller_attribute(rebalance);
-read_attribute(rebalance_work);
+read_attribute(rebalance_status);
 rw_attribute(promote_whole_extents);
 
 read_attribute(new_stripes);
@@ -386,8 +386,8 @@ SHOW(bch2_fs)
        if (attr == &sysfs_copy_gc_wait)
                bch2_copygc_wait_to_text(out, c);
 
-       if (attr == &sysfs_rebalance_work)
-               bch2_rebalance_work_to_text(out, c);
+       if (attr == &sysfs_rebalance_status)
+               bch2_rebalance_status_to_text(out, c);
 
        sysfs_print(promote_whole_extents,      c->promote_whole_extents);
 
@@ -646,7 +646,7 @@ struct attribute *bch2_fs_internal_files[] = {
        &sysfs_copy_gc_wait,
 
        &sysfs_rebalance_enabled,
-       &sysfs_rebalance_work,
+       &sysfs_rebalance_status,
        sysfs_pd_controller_files(rebalance),
 
        &sysfs_moving_ctxts,
@@ -707,10 +707,8 @@ STORE(bch2_fs_opts_dir)
        bch2_opt_set_by_id(&c->opts, id, v);
 
        if ((id == Opt_background_target ||
-            id == Opt_background_compression) && v) {
-               bch2_rebalance_add_work(c, S64_MAX);
-               rebalance_wakeup(c);
-       }
+            id == Opt_background_compression) && v)
+               bch2_set_rebalance_needs_scan(c, 0);
 
        ret = size;
 err:
@@ -910,13 +908,8 @@ SHOW(bch2_dev)
        sysfs_print(discard,            ca->mi.discard);
 
        if (attr == &sysfs_label) {
-               if (ca->mi.group) {
-                       mutex_lock(&c->sb_lock);
-                       bch2_disk_path_to_text(out, c->disk_sb.sb,
-                                              ca->mi.group - 1);
-                       mutex_unlock(&c->sb_lock);
-               }
-
+               if (ca->mi.group)
+                       bch2_disk_path_to_text(out, c, ca->mi.group - 1);
                prt_char(out, '\n');
        }
 
index 33efa6005c6f2b1f0885a1f07f146bfd5de0a0a4..dc48b52b01b49c4ed7af877921dd7e2b446d75a8 100644 (file)
@@ -7,6 +7,7 @@
 #include "btree_locking.h"
 #include "btree_update_interior.h"
 #include "keylist.h"
+#include "move_types.h"
 #include "opts.h"
 #include "six.h"
 
index 2308f49f3b2e2316cbe027c5a79812f92dfcc328..81f72b2add09a5dc250cfe8646d2b9a37cbaa45b 100644 (file)
@@ -767,25 +767,36 @@ DEFINE_EVENT(bkey, move_extent_alloc_mem_fail,
 );
 
 TRACE_EVENT(move_data,
-       TP_PROTO(struct bch_fs *c, u64 sectors_moved,
-                u64 keys_moved),
-       TP_ARGS(c, sectors_moved, keys_moved),
+       TP_PROTO(struct bch_fs *c,
+                struct bch_move_stats *stats),
+       TP_ARGS(c, stats),
 
        TP_STRUCT__entry(
-               __field(dev_t,          dev                     )
-               __field(u64,            sectors_moved   )
+               __field(dev_t,          dev             )
                __field(u64,            keys_moved      )
+               __field(u64,            keys_raced      )
+               __field(u64,            sectors_seen    )
+               __field(u64,            sectors_moved   )
+               __field(u64,            sectors_raced   )
        ),
 
        TP_fast_assign(
-               __entry->dev                    = c->dev;
-               __entry->sectors_moved = sectors_moved;
-               __entry->keys_moved = keys_moved;
+               __entry->dev            = c->dev;
+               __entry->keys_moved     = atomic64_read(&stats->keys_moved);
+               __entry->keys_raced     = atomic64_read(&stats->keys_raced);
+               __entry->sectors_seen   = atomic64_read(&stats->sectors_seen);
+               __entry->sectors_moved  = atomic64_read(&stats->sectors_moved);
+               __entry->sectors_raced  = atomic64_read(&stats->sectors_raced);
        ),
 
-       TP_printk("%d,%d sectors_moved %llu keys_moved %llu",
+       TP_printk("%d,%d keys moved %llu raced %llu "
+                 "sectors seen %llu moved %llu raced %llu",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->sectors_moved, __entry->keys_moved)
+                 __entry->keys_moved,
+                 __entry->keys_raced,
+                 __entry->sectors_seen,
+                 __entry->sectors_moved,
+                 __entry->sectors_raced)
 );
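
With the missing space restored in the format string above, a move_data event renders along these lines (illustrative values):

        move_data: 8,16 keys moved 117 raced 0 sectors seen 14336 moved 14336 raced 0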
 
 TRACE_EVENT(evacuate_bucket,
index b069b1a62e25186be7fb068255080ca179593868..74b41f567ab808f7c4a581cd0812e9de3867b46d 100644 (file)
@@ -590,7 +590,7 @@ err:
        if (value &&
            (opt_id == Opt_background_compression ||
             opt_id == Opt_background_target))
-               bch2_rebalance_add_work(c, inode->v.i_blocks);
+               bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
 
        return bch2_err_class(ret);
 }
index 2958169ce833efc1b838bad6c6453e01334904b1..1faa24d6400e6097c8a0640d34768e32a9cd68b8 100644 (file)
@@ -22,6 +22,10 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
                panic("closure_put_after_sub: bogus flags %x remaining %i", flags, r);
 
        if (!r) {
+               smp_acquire__after_ctrl_dep();
+
+               cl->closure_get_happened = false;
+
                if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
                        atomic_set(&cl->remaining,
                                   CLOSURE_REMAINING_INITIALIZER);
@@ -44,7 +48,7 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
 /* For clearing flags with the same atomic op as a put */
 void closure_sub(struct closure *cl, int v)
 {
-       closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
+       closure_put_after_sub(cl, atomic_sub_return_release(v, &cl->remaining));
 }
 EXPORT_SYMBOL(closure_sub);
 
@@ -53,7 +57,7 @@ EXPORT_SYMBOL(closure_sub);
  */
 void closure_put(struct closure *cl)
 {
-       closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
+       closure_put_after_sub(cl, atomic_dec_return_release(&cl->remaining));
 }
 EXPORT_SYMBOL(closure_put);
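
For context on the ordering change in this file: every closure_put() now drops its reference with release semantics, and only the putter that observes the count reach zero upgrades to acquire (the smp_acquire__after_ctrl_dep() above) before running the continuation, so it sees all writes made prior to the other threads' puts. A minimal standalone analogue in C11 atomics (illustrative, not the kernel API):

#include <stdatomic.h>

struct ref {
        atomic_int remaining;
        void (*fn)(struct ref *);
};

static void ref_put(struct ref *r)
{
        /* release: publish this thread's prior writes */
        if (atomic_fetch_sub_explicit(&r->remaining, 1,
                                      memory_order_release) == 1) {
                /* only the final putter pays for the acquire; pairs with the
                 * releases above, like smp_acquire__after_ctrl_dep() after a
                 * control dependency */
                atomic_thread_fence(memory_order_acquire);
                r->fn(r);
        }
}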
 
@@ -91,6 +95,7 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
        if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
                return false;
 
+       cl->closure_get_happened = true;
        closure_set_waiting(cl, _RET_IP_);
        atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
        llist_add(&cl->list, &waitlist->list);