Update bcachefs sources to d5e561b3cc bcachefs: BCH_DATA ioctl
author     Kent Overstreet <kent.overstreet@gmail.com>
           Wed, 7 Feb 2018 13:08:24 +0000 (08:08 -0500)
committer  Kent Overstreet <kent.overstreet@gmail.com>
           Thu, 8 Feb 2018 20:33:42 +0000 (15:33 -0500)
24 files changed:
.bcachefs_revision
include/linux/bio.h
include/linux/completion.h
include/linux/random.h
libbcachefs.c
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bcachefs_ioctl.h
libbcachefs/chardev.c
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/journal.c
libbcachefs/journal.h
libbcachefs/migrate.c
libbcachefs/move.c
libbcachefs/move.h
libbcachefs/movinggc.c
libbcachefs/super-io.c
libbcachefs/super-io.h
libbcachefs/super.c
libbcachefs/super.h
libbcachefs/super_types.h
libbcachefs/tier.c
linux/bio.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index f35d38b80a5ab611f610ff86b756d8becb98199c..274236e3701e179353092fb7a86ad3c3a247036a 100644
@@ -1 +1 @@
-496cbe9474173ec41bf221dc8ab1f5d70a128c3b
+d5e561b3cc023dd247d2b3d08b680709ec21b477
diff --git a/include/linux/bio.h b/include/linux/bio.h
index dcaffedb563e473bab14972db9a641dad9ee2dce..1bd21ee3ac8e330e596792e25cac489c44643b1b 100644
@@ -257,6 +257,8 @@ static inline void bio_flush_dcache_pages(struct bio *bi)
 extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
                               struct bio *src, struct bvec_iter *src_iter);
 extern void bio_copy_data(struct bio *dst, struct bio *src);
+
+void bio_free_pages(struct bio *bio);
 extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);
 
 void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter);
diff --git a/include/linux/completion.h b/include/linux/completion.h
index 1808d21e25a047d3a130d15f05200725d7cd7592..d11a8dd0df298da13d7e014fc289a6ddc99c815f 100644
@@ -37,4 +37,6 @@ static inline void reinit_completion(struct completion *x)
 void complete(struct completion *);
 void wait_for_completion(struct completion *);
 
+#define wait_for_completion_interruptible(x) (wait_for_completion(x), 0)
+
 #endif
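
In this userspace shim, wait_for_completion_interruptible() simply calls the uninterruptible wait_for_completion(); the comma operator makes the macro evaluate to 0, so kernel-style callers that check for -ERESTARTSYS still compile and always see success. A minimal sketch of what that means for a caller:

    /* Written against the kernel API; under the shim above the wait
     * can never be interrupted, so the error branch is dead code. */
    if (wait_for_completion_interruptible(&done))
            return -ERESTARTSYS;    /* never taken in userspace */
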
diff --git a/include/linux/random.h b/include/linux/random.h
index 243c0602bbc6cc190c874d6aafcc508621f73777..90fe574921ba28cccb7441ecf0b3a1c73cf88d7d 100644
@@ -29,6 +29,11 @@ static inline void get_random_bytes(void *buf, int nbytes)
        BUG_ON(getrandom(buf, nbytes, 0) != nbytes);
 }
 
+static inline void prandom_bytes(void *buf, int nbytes)
+{
+       return get_random_bytes(buf, nbytes);
+}
+
 #define get_random_type(type)                          \
 static inline type get_random_##type(void)             \
 {                                                      \
diff --git a/libbcachefs.c b/libbcachefs.c
index 3632e30d4fa861f6004cee05d98ba7f86f5b87e1..238cca9986b733464cb0cf2483fd114bf4a60810 100644
@@ -459,6 +459,11 @@ static void bch2_sb_print_quota(struct bch_sb *sb, struct bch_sb_field *f,
 {
 }
 
+static void bch2_sb_print_disk_groups(struct bch_sb *sb, struct bch_sb_field *f,
+                                     enum units units)
+{
+}
+
 typedef void (*sb_field_print_fn)(struct bch_sb *, struct bch_sb_field *, enum units);
 
 struct bch_sb_field_ops {
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 298f26d4161d84bbf5818dc2c176b8d555a87c1b..cb9906c5bd22ba4aa96bf8f151840441228b844b 100644
@@ -496,6 +496,8 @@ struct bch_fs {
        struct bch_replicas_cpu __rcu *replicas_gc;
        struct mutex            replicas_gc_lock;
 
+       struct bch_disk_groups_cpu __rcu *disk_groups;
+
        struct bch_opts         opts;
 
        /* Updated by bch2_sb_update():*/
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index cb9e450ba2868455839280066cbd8f8b72201058..85f728f21295023df3c06d34b3ac2e7ca566d0c7 100644
@@ -782,7 +782,8 @@ struct bch_sb_field {
        x(members,      1)      \
        x(crypt,        2)      \
        x(replicas,     3)      \
-       x(quota,        4)
+       x(quota,        4)      \
+       x(disk_groups,  5)
 
 enum bch_sb_field_type {
 #define x(f, nr)       BCH_SB_FIELD_##f = nr,
@@ -815,8 +816,9 @@ LE64_BITMASK(BCH_MEMBER_STATE,              struct bch_member, flags[0],  0,  4)
 LE64_BITMASK(BCH_MEMBER_TIER,          struct bch_member, flags[0],  4,  8)
 /* 8-10 unused, was HAS_(META)DATA */
 LE64_BITMASK(BCH_MEMBER_REPLACEMENT,   struct bch_member, flags[0], 10, 14)
-LE64_BITMASK(BCH_MEMBER_DISCARD,       struct bch_member, flags[0], 14, 15);
-LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED,  struct bch_member, flags[0], 15, 20);
+LE64_BITMASK(BCH_MEMBER_DISCARD,       struct bch_member, flags[0], 14, 15)
+LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED,  struct bch_member, flags[0], 15, 20)
+LE64_BITMASK(BCH_MEMBER_GROUP,         struct bch_member, flags[0], 20, 28)
 
 #if 0
 LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS,        struct bch_member, flags[1], 0,  20);
@@ -933,6 +935,23 @@ struct bch_sb_field_quota {
        struct bch_sb_quota_type        q[QTYP_NR];
 } __attribute__((packed, aligned(8)));
 
+/* BCH_SB_FIELD_disk_groups: */
+
+#define BCH_SB_LABEL_SIZE              32
+
+struct bch_disk_group {
+       __u8                    label[BCH_SB_LABEL_SIZE];
+       __le64                  flags[2];
+};
+
+LE64_BITMASK(BCH_GROUP_DELETED,                struct bch_disk_group, flags[0], 0, 1)
+LE64_BITMASK(BCH_GROUP_DATA_ALLOWED,   struct bch_disk_group, flags[0], 1, 6)
+
+struct bch_sb_field_disk_groups {
+       struct bch_sb_field     field;
+       struct bch_disk_group   entries[0];
+};
+
 /* Superblock: */
 
 /*
@@ -947,7 +966,6 @@ struct bch_sb_field_quota {
 #define BCH_SB_VERSION_MAX             9
 
 #define BCH_SB_SECTOR                  8
-#define BCH_SB_LABEL_SIZE              32
 #define BCH_SB_MEMBERS_MAX             64 /* XXX kill */
 
 struct bch_sb_layout {
@@ -1069,20 +1087,6 @@ enum bch_sb_features {
 
 #define BCH_REPLICAS_MAX               4U
 
-#if 0
-#define BCH_ERROR_ACTIONS()                                    \
-       x(BCH_ON_ERROR_CONTINUE,        0, "continue")          \
-       x(BCH_ON_ERROR_RO,              1, "remount-ro")        \
-       x(BCH_ON_ERROR_PANIC,           2, "panic")             \
-       x(BCH_NR_ERROR_ACTIONS,         3, NULL)
-
-enum bch_error_actions {
-#define x(_opt, _nr, _str)     _opt = _nr,
-       BCH_ERROR_ACTIONS()
-#undef x
-};
-#endif
-
 enum bch_error_actions {
        BCH_ON_ERROR_CONTINUE           = 0,
        BCH_ON_ERROR_RO                 = 1,
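
The new disk-groups section is a variable-length array of fixed-size bch_disk_group entries following the bch_sb_field header, with per-group state packed into flags[0] by the LE64_BITMASK accessors. A sketch of how a label lookup over the section might look, assuming disk_groups_nr() from the super-io.h hunk below; the lookup helper itself is hypothetical, not part of this commit:

    /* Hypothetical helper: find a live group by label.  Labels are
     * fixed-width, NUL-padded BCH_SB_LABEL_SIZE-byte fields. */
    static int group_idx_by_label(struct bch_sb_field_disk_groups *groups,
                                  const char *label)
    {
            unsigned i, nr = disk_groups_nr(groups);

            for (i = 0; i < nr; i++) {
                    struct bch_disk_group *g = &groups->entries[i];

                    if (!BCH_GROUP_DELETED(g) &&
                        !strncmp((char *) g->label, label, BCH_SB_LABEL_SIZE))
                            return i;
            }

            return -1;
    }
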
diff --git a/libbcachefs/bcachefs_ioctl.h b/libbcachefs/bcachefs_ioctl.h
index aa2a20504a9f6d8750665b2aa6d2f53f027bccd7..6578847b50c4cdf25539fcad05afabb470999f21 100644
@@ -46,7 +46,6 @@ struct bch_ioctl_incremental {
 #define BCH_IOCTL_DISK_ONLINE  _IOW(0xbc,      6,  struct bch_ioctl_disk)
 #define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc,      7,  struct bch_ioctl_disk)
 #define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc,    8,  struct bch_ioctl_disk_set_state)
-#define BCH_IOCTL_DISK_EVACUATE        _IOW(0xbc,      9,  struct bch_ioctl_disk)
 #define BCH_IOCTL_DATA         _IOW(0xbc,      10, struct bch_ioctl_data)
 #define BCH_IOCTL_USAGE                _IOWR(0xbc,     11, struct bch_ioctl_usage)
 #define BCH_IOCTL_READ_SUPER   _IOW(0xbc,      12, struct bch_ioctl_read_super)
@@ -75,30 +74,37 @@ struct bch_ioctl_disk_set_state {
        __u64                   dev;
 };
 
-#define BCH_REWRITE_INCREASE_REPLICAS  (1 << 0)
-#define BCH_REWRITE_DECREASE_REPLICAS  (1 << 1)
-
-#define BCH_REWRITE_RECOMPRESS         (1 << 0)
-#define BCH_REWRITE_DECREASE_REPLICAS  (1 << 1)
-
 enum bch_data_ops {
-       BCH_DATA_SCRUB,
-};
-
-struct bch_data_op {
-       __u8                    type;
+       BCH_DATA_OP_SCRUB       = 0,
+       BCH_DATA_OP_REREPLICATE = 1,
+       BCH_DATA_OP_MIGRATE     = 2,
+       BCH_DATA_OP_NR          = 3,
 };
 
 struct bch_ioctl_data {
+       __u32                   op;
        __u32                   flags;
-       __u32                   pad;
-
-       __u64                   start_inode;
-       __u64                   start_offset;
 
-       __u64                   end_inode;
-       __u64                   end_offset;
-};
+       struct bpos             start;
+       struct bpos             end;
+
+       union {
+       struct {
+               __u32           dev;
+               __u32           pad;
+       }                       migrate;
+       };
+} __attribute__((packed, aligned(8)));
+
+struct bch_ioctl_data_progress {
+       __u8                    data_type;
+       __u8                    btree_id;
+       __u8                    pad[2];
+       struct bpos             pos;
+
+       __u64                   sectors_done;
+       __u64                   sectors_total;
+} __attribute__((packed, aligned(8)));
 
 struct bch_ioctl_dev_usage {
        __u8                    state;
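
BCH_IOCTL_DATA is now asynchronous: as the chardev.c hunk below shows, the ioctl spawns a kthread and returns an anonymous file descriptor, and each read() of exactly sizeof(struct bch_ioctl_data_progress) from that descriptor snapshots the job's progress (data_type is set to U8_MAX when the job finishes). A hedged userspace sketch; the /dev/bcachefs0-ctl path follows the "bcachefs%u-ctl" device name below, and POS_MIN/POS_MAX stand in for a zero-to-maximum bpos range:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    int run_migrate(void)
    {
            struct bch_ioctl_data arg = {
                    .op             = BCH_DATA_OP_MIGRATE,
                    .start          = POS_MIN,      /* whole keyspace */
                    .end            = POS_MAX,
                    .migrate.dev    = 1,            /* device index to drain */
            };
            struct bch_ioctl_data_progress p;
            int ctl, job;

            ctl = open("/dev/bcachefs0-ctl", O_RDONLY);
            if (ctl < 0)
                    return -1;

            job = ioctl(ctl, BCH_IOCTL_DATA, &arg); /* returns a job fd */
            if (job < 0)
                    return -1;

            /* reads must be exactly sizeof(p); 0xff == U8_MAX means done */
            while (read(job, &p, sizeof(p)) == sizeof(p) &&
                   p.data_type != 0xff) {
                    printf("%llu/%llu sectors\n",
                           (unsigned long long) p.sectors_done,
                           (unsigned long long) p.sectors_total);
                    sleep(1);
            }

            close(job);
            close(ctl);
            return 0;
    }
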
diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c
index 1498832ba28f6a2893dee4424b2626507c40a76a..5ff90cc0015fca9e13afbfde0dd6dd649ebac4b6 100644
@@ -1,20 +1,25 @@
 #ifndef NO_BCACHEFS_CHARDEV
 
 #include "bcachefs.h"
+#include "alloc.h"
 #include "bcachefs_ioctl.h"
 #include "buckets.h"
 #include "chardev.h"
+#include "move.h"
 #include "super.h"
 #include "super-io.h"
 
-#include <linux/module.h>
-#include <linux/fs.h>
-#include <linux/major.h>
+#include <linux/anon_inodes.h>
 #include <linux/cdev.h>
 #include <linux/device.h>
+#include <linux/file.h>
+#include <linux/fs.h>
 #include <linux/ioctl.h>
-#include <linux/uaccess.h>
+#include <linux/kthread.h>
+#include <linux/major.h>
+#include <linux/sched/task.h>
 #include <linux/slab.h>
+#include <linux/uaccess.h>
 
 /* returns with ref on ca->ref */
 static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
@@ -266,23 +271,108 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c,
        return ret;
 }
 
-static long bch2_ioctl_disk_evacuate(struct bch_fs *c,
-                                    struct bch_ioctl_disk arg)
+struct bch_data_ctx {
+       struct bch_fs                   *c;
+       struct bch_ioctl_data           arg;
+       struct bch_move_stats           stats;
+
+       int                             ret;
+
+       struct task_struct              *thread;
+};
+
+static int bch2_data_thread(void *arg)
 {
-       struct bch_dev *ca;
-       int ret;
+       struct bch_data_ctx *ctx = arg;
 
-       if ((arg.flags & ~BCH_BY_INDEX) ||
-           arg.pad)
+       ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
+
+       ctx->stats.data_type = U8_MAX;
+       return 0;
+}
+
+static int bch2_data_job_release(struct inode *inode, struct file *file)
+{
+       struct bch_data_ctx *ctx = file->private_data;
+
+       kthread_stop(ctx->thread);
+       put_task_struct(ctx->thread);
+       kfree(ctx);
+       return 0;
+}
+
+static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
+                                 size_t len, loff_t *ppos)
+{
+       struct bch_data_ctx *ctx = file->private_data;
+       struct bch_fs *c = ctx->c;
+       struct bch_ioctl_data_progress p = {
+               .data_type      = ctx->stats.data_type,
+               .btree_id       = ctx->stats.iter.btree_id,
+               .pos            = ctx->stats.iter.pos,
+               .sectors_done   = atomic64_read(&ctx->stats.sectors_seen),
+               .sectors_total  = bch2_fs_sectors_used(c, bch2_fs_usage_read(c)),
+       };
+
+       if (len != sizeof(p))
                return -EINVAL;
 
-       ca = bch2_device_lookup(c, arg.dev, arg.flags);
-       if (IS_ERR(ca))
-               return PTR_ERR(ca);
+       return copy_to_user(buf, &p, sizeof(p)) ?: sizeof(p);
+}
 
-       ret = bch2_dev_evacuate(c, ca);
+static const struct file_operations bcachefs_data_ops = {
+       .release        = bch2_data_job_release,
+       .read           = bch2_data_job_read,
+       .llseek         = no_llseek,
+};
 
-       percpu_ref_put(&ca->ref);
+static long bch2_ioctl_data(struct bch_fs *c,
+                           struct bch_ioctl_data arg)
+{
+       struct bch_data_ctx *ctx = NULL;
+       struct file *file = NULL;
+       unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK;
+       int ret, fd = -1;
+
+       if (arg.op >= BCH_DATA_OP_NR || arg.flags)
+               return -EINVAL;
+
+       ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+       if (!ctx)
+               return -ENOMEM;
+
+       ctx->c = c;
+       ctx->arg = arg;
+
+       ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]");
+       if (IS_ERR(ctx->thread)) {
+               ret = PTR_ERR(ctx->thread);
+               goto err;
+       }
+
+       ret = get_unused_fd_flags(flags);
+       if (ret < 0)
+               goto err;
+       fd = ret;
+
+       file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags);
+       if (IS_ERR(file)) {
+               ret = PTR_ERR(file);
+               goto err;
+       }
+
+       fd_install(fd, file);
+
+       get_task_struct(ctx->thread);
+       wake_up_process(ctx->thread);
+
+       return fd;
+err:
+       if (fd >= 0)
+               put_unused_fd(fd);
+       if (!IS_ERR_OR_NULL(ctx->thread))
+               kthread_stop(ctx->thread);
+       kfree(ctx);
        return ret;
 }
 
@@ -474,8 +564,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
                BCH_IOCTL(disk_offline, struct bch_ioctl_disk);
        case BCH_IOCTL_DISK_SET_STATE:
                BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state);
-       case BCH_IOCTL_DISK_EVACUATE:
-               BCH_IOCTL(disk_evacuate, struct bch_ioctl_disk);
+       case BCH_IOCTL_DATA:
+               BCH_IOCTL(data, struct bch_ioctl_data);
        case BCH_IOCTL_READ_SUPER:
                BCH_IOCTL(read_super, struct bch_ioctl_read_super);
        case BCH_IOCTL_DISK_GET_IDX:
@@ -488,9 +578,12 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
        }
 }
 
+static DEFINE_IDR(bch_chardev_minor);
+
 static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v)
 {
-       struct bch_fs *c = filp->private_data;
+       unsigned minor = iminor(file_inode(filp));
+       struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL;
        void __user *arg = (void __user *) v;
 
        return c
@@ -507,7 +600,6 @@ static const struct file_operations bch_chardev_fops = {
 static int bch_chardev_major;
 static struct class *bch_chardev_class;
 static struct device *bch_chardev;
-static DEFINE_IDR(bch_chardev_minor);
 
 void bch2_fs_chardev_exit(struct bch_fs *c)
 {
@@ -524,7 +616,7 @@ int bch2_fs_chardev_init(struct bch_fs *c)
                return c->minor;
 
        c->chardev = device_create(bch_chardev_class, NULL,
-                                  MKDEV(bch_chardev_major, c->minor), NULL,
+                                  MKDEV(bch_chardev_major, c->minor), c,
                                   "bcachefs%u-ctl", c->minor);
        if (IS_ERR(c->chardev))
                return PTR_ERR(c->chardev);
@@ -536,7 +628,7 @@ void bch2_chardev_exit(void)
 {
        if (!IS_ERR_OR_NULL(bch_chardev_class))
                device_destroy(bch_chardev_class,
-                              MKDEV(bch_chardev_major, 255));
+                              MKDEV(bch_chardev_major, U8_MAX));
        if (!IS_ERR_OR_NULL(bch_chardev_class))
                class_destroy(bch_chardev_class);
        if (bch_chardev_major > 0)
@@ -554,7 +646,7 @@ int __init bch2_chardev_init(void)
                return PTR_ERR(bch_chardev_class);
 
        bch_chardev = device_create(bch_chardev_class, NULL,
-                                   MKDEV(bch_chardev_major, 255),
+                                   MKDEV(bch_chardev_major, U8_MAX),
                                    NULL, "bcachefs-ctl");
        if (IS_ERR(bch_chardev))
                return PTR_ERR(bch_chardev);
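
The ioctl's lifetime handling hinges on one detail: a reference to the job thread's task_struct is taken before the thread is woken, so the kthread_stop() in the release hook stays safe even if the thread has long since returned. The pattern in isolation, with hypothetical names:

    /* Creation: create stopped, publish via fd, take a ref, then run. */
    t = kthread_create(worker_fn, ctx, "worker");
    fd_install(fd, file);   /* fd now owns ctx via file->private_data */
    get_task_struct(t);     /* keeps *t valid after the thread exits */
    wake_up_process(t);

    /* Teardown (->release): safe even if worker_fn already returned. */
    kthread_stop(t);        /* reaps the thread, returns its exit code */
    put_task_struct(t);
    kfree(ctx);
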
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index c2469167efeab03a4b0bed6eeeed33878c639e52..f5dccfad15d6572d4470e859a023e9d4a59cf4de 100644
@@ -139,6 +139,34 @@ bool bch2_extent_drop_device(struct bkey_s_extent e, unsigned dev)
        return dropped;
 }
 
+const struct bch_extent_ptr *
+bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group)
+{
+       const struct bch_extent_ptr *ptr;
+
+       extent_for_each_ptr(e, ptr) {
+               struct bch_dev *ca = c->devs[ptr->dev];
+
+               if (ca->mi.group &&
+                   ca->mi.group == group)
+                       return ptr;
+       }
+
+       return NULL;
+}
+
+const struct bch_extent_ptr *
+bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target)
+{
+       const struct bch_extent_ptr *ptr;
+
+       extent_for_each_ptr(e, ptr)
+               if (dev_in_target(c->devs[ptr->dev], target))
+                       return ptr;
+
+       return NULL;
+}
+
 unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent e)
 {
        const struct bch_extent_ptr *ptr;
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index eda34381001e21f69a350bf8edc2f026ae7bf85b..e8f54f2e9acb89ff17a795ef11f853e713268520 100644
@@ -43,6 +43,10 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent);
 const struct bch_extent_ptr *
 bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
 bool bch2_extent_drop_device(struct bkey_s_extent, unsigned);
+const struct bch_extent_ptr *
+bch2_extent_has_group(struct bch_fs *, struct bkey_s_c_extent, unsigned);
+const struct bch_extent_ptr *
+bch2_extent_has_target(struct bch_fs *, struct bkey_s_c_extent, unsigned);
 
 unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
 unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index 1870534df6d0ee9beec5f003840b287e40699efe..a1e45625704afd42509f8f3a20c8132554b5dbf0 100644
@@ -54,6 +54,13 @@ static inline u64 journal_last_seq(struct journal *j)
        return j->pin.front;
 }
 
+static inline u64 journal_cur_seq(struct journal *j)
+{
+       BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq));
+
+       return j->pin.back - 1;
+}
+
 static inline u64 journal_pin_seq(struct journal *j,
                                  struct journal_entry_pin_list *pin_list)
 {
@@ -264,7 +271,9 @@ int bch2_journal_seq_should_ignore(struct bch_fs *c, u64 seq, struct btree *b)
        if (!seq)
                return 0;
 
-       journal_seq = atomic64_read(&j->seq);
+       spin_lock(&j->lock);
+       journal_seq = journal_cur_seq(j);
+       spin_unlock(&j->lock);
 
 /* Interior updates aren't journalled: */
        BUG_ON(b->level);
@@ -989,6 +998,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
        u64 cur_seq, end_seq, seq;
        unsigned iter, keys = 0, entries = 0;
        size_t nr;
+       bool degraded = false;
        int ret = 0;
 
        closure_init_stack(&jlist.cl);
@@ -996,12 +1006,19 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
        jlist.head = list;
        jlist.ret = 0;
 
-       for_each_readable_member(ca, c, iter) {
-               percpu_ref_get(&ca->io_ref);
-               closure_call(&ca->journal.read,
-                            bch2_journal_read_device,
-                            system_unbound_wq,
-                            &jlist.cl);
+       for_each_member_device(ca, c, iter) {
+               if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL)))
+                       continue;
+
+               if ((ca->mi.state == BCH_MEMBER_STATE_RW ||
+                    ca->mi.state == BCH_MEMBER_STATE_RO) &&
+                   percpu_ref_tryget(&ca->io_ref))
+                       closure_call(&ca->journal.read,
+                                    bch2_journal_read_device,
+                                    system_unbound_wq,
+                                    &jlist.cl);
+               else
+                       degraded = true;
        }
 
        closure_sync(&jlist.cl);
@@ -1022,11 +1039,17 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
                if (ret)
                        goto fsck_err;
 
-               if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
-                   fsck_err_on(!bch2_sb_has_replicas(c, BCH_DATA_JOURNAL,
-                                                     i->devs), c,
-                               "superblock not marked as containing replicas (type %u)",
-                               BCH_DATA_JOURNAL)) {
+               /*
+                * If we're mounting in degraded mode - if we didn't read all
+                * the devices - this is wrong:
+                */
+
+               if (!degraded &&
+                   (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
+                    fsck_err_on(!bch2_sb_has_replicas(c, BCH_DATA_JOURNAL,
+                                                      i->devs), c,
+                                "superblock not marked as containing replicas (type %u)",
+                                BCH_DATA_JOURNAL))) {
                        ret = bch2_check_mark_super(c, BCH_DATA_JOURNAL,
                                                    i->devs);
                        if (ret)
@@ -1111,7 +1134,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
        }
 
        bch_info(c, "journal read done, %i keys in %i entries, seq %llu",
-                keys, entries, (u64) atomic64_read(&j->seq));
+                keys, entries, journal_cur_seq(j));
 fsck_err:
        return ret;
 }
@@ -1174,9 +1197,6 @@ static void journal_pin_new_entry(struct journal *j, int count)
        atomic64_inc(&j->seq);
        p = fifo_push_ref(&j->pin);
 
-       EBUG_ON(journal_seq_pin(j, atomic64_read(&j->seq)) !=
-               &fifo_peek_back(&j->pin));
-
        INIT_LIST_HEAD(&p->list);
        INIT_LIST_HEAD(&p->flushed);
        atomic_set(&p->count, count);
@@ -1190,7 +1210,7 @@ static void bch2_journal_buf_init(struct journal *j)
        memset(buf->has_inode, 0, sizeof(buf->has_inode));
 
        memset(buf->data, 0, sizeof(*buf->data));
-       buf->data->seq  = cpu_to_le64(atomic64_read(&j->seq));
+       buf->data->seq  = cpu_to_le64(journal_cur_seq(j));
        buf->data->u64s = 0;
 }
 
@@ -1472,7 +1492,7 @@ void bch2_journal_start(struct bch_fs *c)
 
        set_bit(JOURNAL_STARTED, &j->flags);
 
-       while (atomic64_read(&j->seq) < new_seq)
+       while (journal_cur_seq(j) < new_seq)
                journal_pin_new_entry(j, 0);
 
        /*
@@ -2015,9 +2035,11 @@ static void journal_reclaim_work(struct work_struct *work)
                mutex_unlock(&j->reclaim_lock);
 
        /* Also flush if the pin fifo is more than half full */
+       spin_lock(&j->lock);
        seq_to_flush = max_t(s64, seq_to_flush,
-                            (s64) atomic64_read(&j->seq) -
+                            (s64) journal_cur_seq(j) -
                             (j->pin.size >> 1));
+       spin_unlock(&j->lock);
 
        /*
         * If it's been longer than j->reclaim_delay_ms since we last flushed,
@@ -2110,7 +2132,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
 
                ja->sectors_free = ca->mi.bucket_size - sectors;
                ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
-               ja->bucket_seq[ja->cur_idx] = atomic64_read(&j->seq);
+               ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
 
                extent_ptr_append(bkey_i_to_extent(&j->key),
                        (struct bch_extent_ptr) {
@@ -2436,9 +2458,9 @@ u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
 
        spin_lock(&j->lock);
        if (test_bit(h, journal_cur_buf(j)->has_inode))
-               seq = atomic64_read(&j->seq);
+               seq = journal_cur_seq(j);
        else if (test_bit(h, journal_prev_buf(j)->has_inode))
-               seq = atomic64_read(&j->seq) - 1;
+               seq = journal_cur_seq(j) - 1;
        spin_unlock(&j->lock);
 
        return seq;
@@ -2547,7 +2569,7 @@ u64 bch2_journal_last_unwritten_seq(struct journal *j)
        u64 seq;
 
        spin_lock(&j->lock);
-       seq = atomic64_read(&j->seq);
+       seq = journal_cur_seq(j);
        if (j->reservations.prev_buf_unwritten)
                seq--;
        spin_unlock(&j->lock);
@@ -2560,9 +2582,9 @@ int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *pare
        int ret;
 
        spin_lock(&j->lock);
-       BUG_ON(seq > atomic64_read(&j->seq));
+       BUG_ON(seq > journal_cur_seq(j));
 
-       if (seq < atomic64_read(&j->seq) ||
+       if (seq < journal_cur_seq(j) ||
            journal_entry_is_open(j)) {
                spin_unlock(&j->lock);
                return 1;
@@ -2583,17 +2605,17 @@ void bch2_journal_wait_on_seq(struct journal *j, u64 seq, struct closure *parent
 {
        spin_lock(&j->lock);
 
-       BUG_ON(seq > atomic64_read(&j->seq));
+       BUG_ON(seq > journal_cur_seq(j));
 
        if (bch2_journal_error(j)) {
                spin_unlock(&j->lock);
                return;
        }
 
-       if (seq == atomic64_read(&j->seq)) {
+       if (seq == journal_cur_seq(j)) {
                if (!closure_wait(&journal_cur_buf(j)->wait, parent))
                        BUG();
-       } else if (seq + 1 == atomic64_read(&j->seq) &&
+       } else if (seq + 1 == journal_cur_seq(j) &&
                   j->reservations.prev_buf_unwritten) {
                if (!closure_wait(&journal_prev_buf(j)->wait, parent))
                        BUG();
@@ -2615,14 +2637,14 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *pa
 
        spin_lock(&j->lock);
 
-       BUG_ON(seq > atomic64_read(&j->seq));
+       BUG_ON(seq > journal_cur_seq(j));
 
        if (bch2_journal_error(j)) {
                spin_unlock(&j->lock);
                return;
        }
 
-       if (seq == atomic64_read(&j->seq)) {
+       if (seq == journal_cur_seq(j)) {
                bool set_need_write = false;
 
                buf = journal_cur_buf(j);
@@ -2643,7 +2665,7 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *pa
                case JOURNAL_ENTRY_CLOSED:
                        /*
                         * Journal entry hasn't been opened yet, but caller
-                        * claims it has something (seq == j->seq):
+                        * claims it has something
                         */
                        BUG();
                case JOURNAL_ENTRY_INUSE:
@@ -2652,7 +2674,7 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *pa
                        return;
                }
        } else if (parent &&
-                  seq + 1 == atomic64_read(&j->seq) &&
+                  seq + 1 == journal_cur_seq(j) &&
                   j->reservations.prev_buf_unwritten) {
                buf = journal_prev_buf(j);
 
@@ -2676,9 +2698,9 @@ static int journal_seq_flushed(struct journal *j, u64 seq)
        int ret = 1;
 
        spin_lock(&j->lock);
-       BUG_ON(seq > atomic64_read(&j->seq));
+       BUG_ON(seq > journal_cur_seq(j));
 
-       if (seq == atomic64_read(&j->seq)) {
+       if (seq == journal_cur_seq(j)) {
                bool set_need_write = false;
 
                ret = 0;
@@ -2697,7 +2719,7 @@ static int journal_seq_flushed(struct journal *j, u64 seq)
                case JOURNAL_ENTRY_CLOSED:
                        /*
                         * Journal entry hasn't been opened yet, but caller
-                        * claims it has something (seq == j->seq):
+                        * claims it has something
                         */
                        BUG();
                case JOURNAL_ENTRY_INUSE:
@@ -2705,7 +2727,7 @@ static int journal_seq_flushed(struct journal *j, u64 seq)
                case JOURNAL_UNLOCKED:
                        return 0;
                }
-       } else if (seq + 1 == atomic64_read(&j->seq) &&
+       } else if (seq + 1 == journal_cur_seq(j) &&
                   j->reservations.prev_buf_unwritten) {
                ret = bch2_journal_error(j);
        }
@@ -2762,7 +2784,7 @@ void bch2_journal_flush_async(struct journal *j, struct closure *parent)
        u64 seq, journal_seq;
 
        spin_lock(&j->lock);
-       journal_seq = atomic64_read(&j->seq);
+       journal_seq = journal_cur_seq(j);
 
        if (journal_entry_is_open(j)) {
                seq = journal_seq;
@@ -2782,7 +2804,7 @@ int bch2_journal_flush(struct journal *j)
        u64 seq, journal_seq;
 
        spin_lock(&j->lock);
-       journal_seq = atomic64_read(&j->seq);
+       journal_seq = journal_cur_seq(j);
 
        if (journal_entry_is_open(j)) {
                seq = journal_seq;
@@ -2797,7 +2819,7 @@ int bch2_journal_flush(struct journal *j)
        return bch2_journal_flush_seq(j, seq);
 }
 
-int bch2_journal_flush_device(struct journal *j, unsigned dev_idx)
+int bch2_journal_flush_device(struct journal *j, int dev_idx)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_entry_pin_list *p;
@@ -2807,7 +2829,9 @@ int bch2_journal_flush_device(struct journal *j, unsigned dev_idx)
 
        spin_lock(&j->lock);
        fifo_for_each_entry_ptr(p, &j->pin, iter)
-               if (bch2_dev_list_has_dev(p->devs, dev_idx))
+               if (dev_idx >= 0
+                   ? bch2_dev_list_has_dev(p->devs, dev_idx)
+                   : p->devs.nr < c->opts.metadata_replicas)
                        seq = iter;
        spin_unlock(&j->lock);
 
@@ -2821,7 +2845,7 @@ int bch2_journal_flush_device(struct journal *j, unsigned dev_idx)
        seq = 0;
 
        spin_lock(&j->lock);
-       while (!ret && seq < atomic64_read(&j->seq)) {
+       while (!ret && seq < j->pin.back) {
                seq = max(seq, journal_last_seq(j));
                devs = journal_seq_pin(j, seq)->devs;
                seq++;
@@ -2982,7 +3006,7 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
                         "dirty:\t\t\t%i\n"
                         "replay done:\t\t%i\n",
                         fifo_used(&j->pin),
-                        (u64) atomic64_read(&j->seq),
+                        journal_cur_seq(j),
                         journal_last_seq(j),
                         j->last_seq_ondisk,
                         journal_state_count(*s, s->idx),
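
journal_cur_seq() encodes the invariant that the pin FIFO has exactly one entry per open sequence number: journal_pin_new_entry() bumps j->seq and pushes a pin together, so j->pin.back - 1 must equal j->seq, and the BUG_ON checks exactly that. Because pin.back is only stable under j->lock, the call sites converted above take the lock around the read (or already hold it):

    /* Invariant maintained by journal_pin_new_entry():
     *   j->pin.front     == journal_last_seq(j)   oldest open entry
     *   j->pin.back - 1  == journal_cur_seq(j)    newest entry
     */
    spin_lock(&j->lock);
    seq = journal_cur_seq(j);   /* == atomic64_read(&j->seq), now checked */
    spin_unlock(&j->lock);
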
diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h
index 52d74eec3bec5de0f45970e87676c4afddb70d07..46ae8f0d256dd408213d50f2d3d823efa167ac18 100644
@@ -368,7 +368,7 @@ void bch2_journal_meta_async(struct journal *, struct closure *);
 int bch2_journal_flush_seq(struct journal *, u64);
 int bch2_journal_flush(struct journal *);
 int bch2_journal_meta(struct journal *);
-int bch2_journal_flush_device(struct journal *, unsigned);
+int bch2_journal_flush_device(struct journal *, int);
 
 void bch2_journal_halt(struct journal *);
 
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index 01c8896078f2c027849cc46e69b044bcd89f1281..9c2920cff61cb874b08b23af2e80d63ad878e644 100644
@@ -40,12 +40,15 @@ static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca,
         * operations
         */
        do {
+               memset(&stats, 0, sizeof(stats));
+
                ret = bch2_move_data(c, NULL,
                                     SECTORS_IN_FLIGHT_PER_DEVICE,
                                     NULL,
                                     writepoint_hashed((unsigned long) current),
                                     0,
                                     ca->dev_idx,
+                                    POS_MIN, POS_MAX,
                                     migrate_pred, ca,
                                     &stats);
                if (ret) {
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index a67e7a451e3877d04e785e6ddb598f1cf25ee366..e5a46ba6d03fbf1236eb7b4a19834af94ce173c9 100644
@@ -87,7 +87,6 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                                            m->move_dev)))
                        bch2_extent_drop_ptr(extent_i_to_s(insert), ptr);
 
-
                extent_for_each_ptr_crc(extent_i_to_s(new), ptr, crc) {
                        if (bch2_extent_has_device(extent_i_to_s_c(insert), ptr->dev)) {
                                /*
@@ -194,6 +193,8 @@ static void move_free(struct closure *cl)
        struct bio_vec *bv;
        int i;
 
+       bch2_disk_reservation_put(io->write.op.c, &io->write.op.res);
+
        bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i)
                if (bv->bv_page)
                        __free_page(bv->bv_page);
@@ -243,20 +244,21 @@ static int bch2_move_extent(struct bch_fs *c,
                          int btree_insert_flags,
                          int move_device,
                          struct bch_io_opts opts,
-                         struct bkey_s_c k)
+                         struct bkey_s_c_extent e)
 {
        struct extent_pick_ptr pick;
        struct moving_io *io;
        const struct bch_extent_ptr *ptr;
        struct bch_extent_crc_unpacked crc;
-       unsigned sectors = k.k->size, pages;
+       unsigned sectors = e.k->size, pages, nr_good;
+       int ret = -ENOMEM;
 
-       bch2_extent_pick_ptr(c, k, NULL, &pick);
+       bch2_extent_pick_ptr(c, e.s_c, NULL, &pick);
        if (IS_ERR_OR_NULL(pick.ca))
                return pick.ca ? PTR_ERR(pick.ca) : 0;
 
        /* write path might have to decompress data: */
-       extent_for_each_ptr_crc(bkey_s_c_to_extent(k), ptr, crc)
+       extent_for_each_ptr_crc(e, ptr, crc)
                sectors = max_t(unsigned, sectors, crc.uncompressed_size);
 
        pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
@@ -266,7 +268,7 @@ static int bch2_move_extent(struct bch_fs *c,
                goto err;
 
        io->write.ctxt  = ctxt;
-       io->sectors     = k.k->size;
+       io->sectors     = e.k->size;
 
        bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages);
        bio_set_prio(&io->write.op.wbio.bio,
@@ -274,10 +276,8 @@ static int bch2_move_extent(struct bch_fs *c,
        io->write.op.wbio.bio.bi_iter.bi_size = sectors << 9;
 
        bch2_bio_map(&io->write.op.wbio.bio, NULL);
-       if (bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL)) {
-               kfree(io);
-               goto err;
-       }
+       if (bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL))
+               goto err_free;
 
        io->rbio.opts = opts;
        bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
@@ -285,7 +285,7 @@ static int bch2_move_extent(struct bch_fs *c,
        io->rbio.bio.bi_iter.bi_size = sectors << 9;
 
        bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0);
-       io->rbio.bio.bi_iter.bi_sector  = bkey_start_offset(k.k);
+       io->rbio.bio.bi_iter.bi_sector  = bkey_start_offset(e.k);
        io->rbio.bio.bi_end_io          = move_read_endio;
 
        io->write.btree_insert_flags = btree_insert_flags;
@@ -298,10 +298,22 @@ static int bch2_move_extent(struct bch_fs *c,
        io->write.op.devs       = devs;
        io->write.op.write_point = wp;
 
+       if (move_device < 0 &&
+           ((nr_good = bch2_extent_nr_good_ptrs(c, e)) <
+            c->opts.data_replicas)) {
+               io->write.op.nr_replicas = c->opts.data_replicas - nr_good;
+
+               ret = bch2_disk_reservation_get(c, &io->write.op.res,
+                                               e.k->size,
+                                               io->write.op.nr_replicas, 0);
+               if (ret)
+                       goto err_free_pages;
+       }
+
        atomic64_inc(&ctxt->stats->keys_moved);
-       atomic64_add(k.k->size, &ctxt->stats->sectors_moved);
+       atomic64_add(e.k->size, &ctxt->stats->sectors_moved);
 
-       trace_move_extent(k.k);
+       trace_move_extent(e.k);
 
        atomic_add(io->sectors, &ctxt->sectors_in_flight);
        list_add_tail(&io->list, &ctxt->reads);
@@ -311,12 +323,16 @@ static int bch2_move_extent(struct bch_fs *c,
         * ctxt when doing wakeup
         */
        closure_get(&ctxt->cl);
-       bch2_read_extent(c, &io->rbio, bkey_s_c_to_extent(k),
-                        &pick, BCH_READ_NODECODE);
+       bch2_read_extent(c, &io->rbio, e, &pick, BCH_READ_NODECODE);
        return 0;
+err_free_pages:
+       bio_free_pages(&io->write.op.wbio.bio);
+err_free:
+       kfree(io);
 err:
-       trace_move_alloc_fail(k.k);
-       return -ENOMEM;
+       percpu_ref_put(&pick.ca->io_ref);
+       trace_move_alloc_fail(e.k);
+       return ret;
 }
 
 static void do_pending_writes(struct moving_context *ctxt)
@@ -355,6 +371,8 @@ int bch2_move_data(struct bch_fs *c,
                   struct write_point_specifier wp,
                   int btree_insert_flags,
                   int move_device,
+                  struct bpos start,
+                  struct bpos end,
                   move_pred_fn pred, void *arg,
                   struct bch_move_stats *stats)
 {
@@ -363,14 +381,16 @@ int bch2_move_data(struct bch_fs *c,
        struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
        BKEY_PADDED(k) tmp;
        struct bkey_s_c k;
+       struct bkey_s_c_extent e;
        u64 cur_inum = U64_MAX;
        int ret = 0;
 
-       memset(stats, 0, sizeof(*stats));
        closure_init_stack(&ctxt.cl);
        INIT_LIST_HEAD(&ctxt.reads);
        init_waitqueue_head(&ctxt.wait);
-       bch2_btree_iter_init(&stats->iter, c, BTREE_ID_EXTENTS, POS_MIN,
+
+       stats->data_type = BCH_DATA_USER;
+       bch2_btree_iter_init(&stats->iter, c, BTREE_ID_EXTENTS, start,
                             BTREE_ITER_PREFETCH);
 
        if (rate)
@@ -396,10 +416,14 @@ peek:
                ret = btree_iter_err(k);
                if (ret)
                        break;
+               if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
+                       break;
 
                if (!bkey_extent_is_data(k.k))
                        goto next_nondata;
 
+               e = bkey_s_c_to_extent(k);
+
                if (cur_inum != k.k->p.inode) {
                        struct bch_inode_unpacked inode;
 
@@ -413,7 +437,7 @@ peek:
                        goto peek;
                }
 
-               if (!pred(arg, bkey_s_c_to_extent(k)))
+               if (!pred(arg, e))
                        goto next;
 
                /* unlock before doing IO: */
@@ -423,7 +447,8 @@ peek:
 
                if (bch2_move_extent(c, &ctxt, devs, wp,
                                     btree_insert_flags,
-                                    move_device, opts, k)) {
+                                    move_device, opts,
+                                    bkey_s_c_to_extent(k))) {
                        /* memory allocation failure, wait for some IO to finish */
                        bch2_move_ctxt_wait_for_io(&ctxt);
                        continue;
@@ -453,3 +478,157 @@ next_nondata:
 
        return ret;
 }
+
+static int bch2_gc_data_replicas(struct bch_fs *c)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       mutex_lock(&c->replicas_gc_lock);
+       bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
+
+       for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
+                          BTREE_ITER_PREFETCH, k) {
+               ret = bch2_check_mark_super(c, BCH_DATA_USER, bch2_bkey_devs(k));
+               if (ret)
+                       break;
+       }
+       ret = bch2_btree_iter_unlock(&iter) ?: ret;
+
+       bch2_replicas_gc_end(c, ret);
+       mutex_unlock(&c->replicas_gc_lock);
+
+       return ret;
+}
+
+static int bch2_gc_btree_replicas(struct bch_fs *c)
+{
+       struct btree_iter iter;
+       struct btree *b;
+       unsigned id;
+       int ret = 0;
+
+       mutex_lock(&c->replicas_gc_lock);
+       bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
+
+       for (id = 0; id < BTREE_ID_NR; id++) {
+               for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
+                       ret = bch2_check_mark_super(c, BCH_DATA_BTREE,
+                                       bch2_bkey_devs(bkey_i_to_s_c(&b->key)));
+
+                       bch2_btree_iter_cond_resched(&iter);
+               }
+
+               ret = bch2_btree_iter_unlock(&iter) ?: ret;
+       }
+
+       bch2_replicas_gc_end(c, ret);
+       mutex_unlock(&c->replicas_gc_lock);
+
+       return ret;
+}
+
+static int bch2_move_btree(struct bch_fs *c,
+                          move_pred_fn pred,
+                          void *arg,
+                          struct bch_move_stats *stats)
+{
+       struct btree *b;
+       unsigned id;
+       int ret = 0;
+
+       stats->data_type = BCH_DATA_BTREE;
+
+       for (id = 0; id < BTREE_ID_NR; id++) {
+               for_each_btree_node(&stats->iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
+                       if (pred(arg, bkey_i_to_s_c_extent(&b->key)))
+                               ret = bch2_btree_node_rewrite(c, &stats->iter,
+                                               b->data->keys.seq, 0) ?: ret;
+
+                       bch2_btree_iter_cond_resched(&stats->iter);
+               }
+
+               ret = bch2_btree_iter_unlock(&stats->iter) ?: ret;
+       }
+
+       return ret;
+}
+
+#if 0
+static bool scrub_data_pred(void *arg, struct bkey_s_c_extent e)
+{
+}
+#endif
+
+static bool rereplicate_metadata_pred(void *arg, struct bkey_s_c_extent e)
+{
+       struct bch_fs *c = arg;
+       unsigned nr_good = bch2_extent_nr_good_ptrs(c, e);
+
+       return nr_good && nr_good < c->opts.metadata_replicas;
+}
+
+static bool rereplicate_data_pred(void *arg, struct bkey_s_c_extent e)
+{
+       struct bch_fs *c = arg;
+       unsigned nr_good = bch2_extent_nr_good_ptrs(c, e);
+
+       return nr_good && nr_good < c->opts.data_replicas;
+}
+
+static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
+{
+       struct bch_ioctl_data *op = arg;
+
+       return bch2_extent_has_device(e, op->migrate.dev);
+}
+
+int bch2_data_job(struct bch_fs *c,
+                 struct bch_move_stats *stats,
+                 struct bch_ioctl_data op)
+{
+       int ret = 0;
+
+       switch (op.op) {
+       case BCH_DATA_OP_REREPLICATE:
+               stats->data_type = BCH_DATA_JOURNAL;
+               ret = bch2_journal_flush_device(&c->journal, -1);
+
+               ret = bch2_move_btree(c, rereplicate_metadata_pred, c, stats) ?: ret;
+               ret = bch2_gc_btree_replicas(c) ?: ret;
+
+               ret = bch2_move_data(c, NULL, SECTORS_IN_FLIGHT_PER_DEVICE,
+                                    NULL,
+                                    writepoint_hashed((unsigned long) current),
+                                    0, -1,
+                                    op.start,
+                                    op.end,
+                                    rereplicate_data_pred, c, stats) ?: ret;
+               ret = bch2_gc_data_replicas(c) ?: ret;
+               break;
+       case BCH_DATA_OP_MIGRATE:
+               if (op.migrate.dev >= c->sb.nr_devices)
+                       return -EINVAL;
+
+               stats->data_type = BCH_DATA_JOURNAL;
+               ret = bch2_journal_flush_device(&c->journal, op.migrate.dev);
+
+               ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret;
+               ret = bch2_gc_btree_replicas(c) ?: ret;
+
+               ret = bch2_move_data(c, NULL, SECTORS_IN_FLIGHT_PER_DEVICE,
+                                    NULL,
+                                    writepoint_hashed((unsigned long) current),
+                                    0, -1,
+                                    op.start,
+                                    op.end,
+                                    migrate_pred, &op, stats) ?: ret;
+               ret = bch2_gc_data_replicas(c) ?: ret;
+               break;
+       default:
+               ret = -EINVAL;
+       }
+
+       return ret;
+}
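
bch2_data_job() chains its stages with the GNU ?: operator so that one failing stage records its error without short-circuiting the rest of the job: x ?: ret yields x when x is nonzero and the previous ret otherwise, so the most recent failure wins. A tiny illustration with hypothetical stages:

    int ret = 0;

    ret = step_a() ?: ret;  /* a step_a error replaces ret */
    ret = step_b() ?: ret;  /* step_b still runs; its error takes over */
    return ret;             /* 0 only if every stage returned 0 */
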
diff --git a/libbcachefs/move.h b/libbcachefs/move.h
index 24d6ddfa9637c401fecfbd2c25c5d1a56e2347e2..07aa5669524c8da0a4786f193524c84f02d48a0b 100644
@@ -27,6 +27,7 @@ void bch2_migrate_write_init(struct migrate_write *, struct bch_read_bio *);
 typedef bool (*move_pred_fn)(void *, struct bkey_s_c_extent);
 
 struct bch_move_stats {
+       enum bch_data_type      data_type;
        struct btree_iter       iter;
 
        atomic64_t              keys_moved;
@@ -38,7 +39,12 @@ struct bch_move_stats {
 int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
                   unsigned, struct bch_devs_mask *,
                   struct write_point_specifier,
-                  int, int, move_pred_fn, void *,
+                  int, int, struct bpos, struct bpos,
+                  move_pred_fn, void *,
                   struct bch_move_stats *);
 
+int bch2_data_job(struct bch_fs *,
+                 struct bch_move_stats *,
+                 struct bch_ioctl_data);
+
 #endif /* _BCACHEFS_MOVE_H */
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index d6f2968ec4b1742cbe86ad51e19e7aee1236f108..515d5001aec890f7f23354c1645472036b919b9c 100644
@@ -106,6 +106,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
        size_t b;
        int ret;
 
+       memset(&move_stats, 0, sizeof(move_stats));
        closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
 
        /*
@@ -166,6 +167,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
                             writepoint_ptr(&ca->copygc_write_point),
                             BTREE_INSERT_USE_RESERVE,
                             ca->dev_idx,
+                            POS_MIN, POS_MAX,
                             copygc_pred, ca,
                             &move_stats);
 
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index 1f266ba37386ba7763aa13d5c1dff115762861c4..f333b8fad58adb907398e84e67c1464be31148dd 100644
@@ -13,6 +13,7 @@
 static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
 static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
                                            struct bch_replicas_cpu *);
+static int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
 
 /* superblock fields (optional/variable size sections): */
 
@@ -43,6 +44,7 @@ static const struct bch_sb_field_ops bch2_sb_field_ops[] = {
 
 static const char *bch2_sb_field_validate(struct bch_sb *sb,
                                          struct bch_sb_field *f)
+
 {
        unsigned type = le32_to_cpu(f->type);
 
@@ -297,7 +299,7 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
        if (!sb->nr_devices ||
            sb->nr_devices <= sb->dev_idx ||
            sb->nr_devices > BCH_SB_MEMBERS_MAX)
-               return "Bad cache device number in set";
+               return "Bad number of member devices";
 
        if (!BCH_SB_META_REPLICAS_WANT(sb) ||
            BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
@@ -458,6 +460,10 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
        if (ret)
                return ret;
 
+       ret = bch2_sb_disk_groups_to_cpu(c);
+       if (ret)
+               return ret;
+
        bch2_sb_update(c);
        return 0;
 }
@@ -1557,3 +1563,129 @@ static const char *bch2_sb_validate_quota(struct bch_sb *sb,
 
        return NULL;
 }
+
+/* Disk groups: */
+
+#if 0
+static size_t trim_nulls(const char *str, size_t len)
+{
+       while (len && !str[len - 1])
+               --len;
+       return len;
+}
+#endif
+
+static const char *bch2_sb_validate_disk_groups(struct bch_sb *sb,
+                                               struct bch_sb_field *f)
+{
+       struct bch_sb_field_disk_groups *groups =
+               field_to_type(f, disk_groups);
+       struct bch_sb_field_members *mi;
+       struct bch_member *m;
+       struct bch_disk_group *g;
+       unsigned nr_groups;
+
+       mi              = bch2_sb_get_members(sb);
+       groups          = bch2_sb_get_disk_groups(sb);
+       nr_groups       = disk_groups_nr(groups);
+
+       for (m = mi->members;
+            m < mi->members + sb->nr_devices;
+            m++) {
+               if (!BCH_MEMBER_GROUP(m))
+                       continue;
+
+               if (BCH_MEMBER_GROUP(m) >= nr_groups)
+                       return "disk has invalid group";
+
+               g = &groups->entries[BCH_MEMBER_GROUP(m)];
+               if (BCH_GROUP_DELETED(g))
+                       return "disk has invalid group";
+       }
+#if 0
+       if (!groups)
+               return NULL;
+
+       char **labels;
+       labels = kcalloc(nr_groups, sizeof(char *), GFP_KERNEL);
+       if (!labels)
+               return "cannot allocate memory";
+
+       for (g = groups->groups;
+            g < groups->groups + nr_groups;
+            g++) {
+
+       }
+#endif
+       return NULL;
+}
+
+static int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
+{
+       struct bch_sb_field_members *mi;
+       struct bch_sb_field_disk_groups *groups;
+       struct bch_disk_groups_cpu *cpu_g, *old_g;
+       unsigned i, nr_groups;
+
+       lockdep_assert_held(&c->sb_lock);
+
+       mi              = bch2_sb_get_members(c->disk_sb);
+       groups          = bch2_sb_get_disk_groups(c->disk_sb);
+       nr_groups       = disk_groups_nr(groups);
+
+       if (!groups)
+               return 0;
+
+       cpu_g = kzalloc(sizeof(*cpu_g) +
+                       sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL);
+       if (!cpu_g)
+               return -ENOMEM;
+
+       cpu_g->nr = nr_groups;
+
+       for (i = 0; i < nr_groups; i++) {
+               struct bch_disk_group *src      = &groups->entries[i];
+               struct bch_disk_group_cpu *dst  = &cpu_g->entries[i];
+
+               dst->deleted = BCH_GROUP_DELETED(src);
+       }
+
+       for (i = 0; i < c->disk_sb->nr_devices; i++) {
+               struct bch_member *m = mi->members + i;
+               struct bch_disk_group_cpu *dst =
+                       &cpu_g->entries[BCH_MEMBER_GROUP(m)];
+
+               if (!bch2_member_exists(m))
+                       continue;
+
+               __set_bit(i, dst->devs.d);
+       }
+
+       old_g = c->disk_groups;
+       rcu_assign_pointer(c->disk_groups, cpu_g);
+       if (old_g)
+               kfree_rcu(old_g, rcu);
+
+       return 0;
+}
+
+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
+{
+       struct target t = target_decode(target);
+
+       switch (t.type) {
+       case TARGET_DEV:
+               BUG_ON(t.dev >= c->sb.nr_devices && !c->devs[t.dev]);
+               return &c->devs[t.dev]->self;
+       case TARGET_GROUP: {
+               struct bch_disk_groups_cpu *g =
+                       rcu_dereference(c->disk_groups);
+
+               /* XXX: what to do here? */
+               BUG_ON(t.group >= g->nr || g->entries[t.group].deleted);
+               return &g->entries[t.group].devs;
+       }
+       default:
+               BUG();
+       }
+}
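
bch2_target_to_mask() returns a pointer into the RCU-protected bch_disk_groups_cpu array, so callers must hold the RCU read lock across the call and any use of the returned mask. A minimal sketch, using dev_mask_nr() from super.h:

    const struct bch_devs_mask *devs;
    unsigned nr;

    rcu_read_lock();
    devs = bch2_target_to_mask(c, target);
    nr   = dev_mask_nr(devs);   /* count devices in the target */
    rcu_read_unlock();
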
diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h
index 59a8b816c4679f8ba179dd43a3f67f7fc841ea06..eb85410c5f16d90a19efc3b7bcca61fdb5053d9c 100644
@@ -127,6 +127,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
                .nbuckets       = le64_to_cpu(mi->nbuckets),
                .first_bucket   = le16_to_cpu(mi->first_bucket),
                .bucket_size    = le16_to_cpu(mi->bucket_size),
+               .group          = BCH_MEMBER_GROUP(mi),
                .state          = BCH_MEMBER_STATE(mi),
                .tier           = BCH_MEMBER_TIER(mi),
                .replacement    = BCH_MEMBER_REPLACEMENT(mi),
@@ -177,4 +178,65 @@ replicas_entry_next(struct bch_replicas_entry *i)
             (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
             (_i) = replicas_entry_next(_i))
 
+/* disk groups: */
+
+static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
+{
+       return groups
+               ? (vstruct_end(&groups->field) -
+                  (void *) &groups->entries[0]) / sizeof(struct bch_disk_group)
+               : 0;
+}
+
+struct target {
+       enum {
+               TARGET_NULL,
+               TARGET_DEV,
+               TARGET_GROUP,
+       }                       type;
+       union {
+               unsigned        dev;
+               unsigned        group;
+       };
+};
+
+static inline u16 dev_to_target(unsigned dev)
+{
+       return 1 + dev;
+}
+
+static inline u16 group_to_target(unsigned group)
+{
+       return 1 + U8_MAX + group;
+}
+
+static inline struct target target_decode(unsigned target)
+{
+       if (!target)
+               return (struct target) { .type = TARGET_NULL };
+
+       --target;
+       if (target <= U8_MAX)
+               return (struct target) { .type = TARGET_DEV, .dev = target };
+
+       target -= U8_MAX;
+       return (struct target) { .type = TARGET_GROUP, .group = target };
+}
+
+static inline bool dev_in_target(struct bch_dev *ca, unsigned target)
+{
+       struct target t = target_decode(target);
+
+       switch (t.type) {
+       case TARGET_DEV:
+               return ca->dev_idx == t.dev;
+       case TARGET_GROUP:
+               return ca->mi.group && ca->mi.group == t.group;
+       default:
+               BUG();
+       }
+}
+
+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
+
 #endif /* _BCACHEFS_SUPER_IO_H */
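
The u16 target encoding reserves 0 for "no target", the next 256 values for single devices, and everything above for groups; target_decode() exactly inverts dev_to_target() and group_to_target(). A worked round trip:

    struct target t;

    t = target_decode(dev_to_target(3));    /* 1 + 3 == 4 */
    /* t.type == TARGET_DEV, t.dev == 3 */

    t = target_decode(group_to_target(2));  /* 1 + 255 + 2 == 258 */
    /* t.type == TARGET_GROUP, t.group == 2 */

    t = target_decode(0);
    /* t.type == TARGET_NULL */
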
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index 8c7a147a08331417664c62a9ad2d5b1f37f1eec3..f836c199e06b38292ecd0dd044782cde23d8f684 100644
@@ -426,6 +426,7 @@ static void bch2_fs_free(struct bch_fs *c)
        mempool_exit(&c->fill_iter);
        percpu_ref_exit(&c->writes);
        kfree(rcu_dereference_protected(c->replicas, 1));
+       kfree(rcu_dereference_protected(c->disk_groups, 1));
 
        if (c->copygc_wq)
                destroy_workqueue(c->copygc_wq);
@@ -1169,6 +1170,12 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
 
        BUG_ON(!percpu_ref_is_zero(&ca->io_ref));
 
+       if (get_capacity(sb->bdev->bd_disk) <
+           ca->mi.bucket_size * ca->mi.nbuckets) {
+               bch_err(c, "device too small");
+               return -EINVAL;
+       }
+
        ret = bch2_dev_journal_init(ca, sb->sb);
        if (ret)
                return ret;
@@ -1495,10 +1502,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
        mutex_lock(&c->state_lock);
        mutex_lock(&c->sb_lock);
 
-       /*
-        * Preserve the old cache member information (esp. tier)
-        * before we start bashing the disk stuff.
-        */
+       /* Grab member info for new disk: */
        dev_mi = bch2_sb_get_members(sb.sb);
        saved_mi = dev_mi->members[sb.sb->dev_idx];
        saved_mi.last_mount = cpu_to_le64(ktime_get_seconds());
@@ -1646,47 +1650,6 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
        return 0;
 }
 
-int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
-{
-       unsigned data;
-       int ret = 0;
-
-       mutex_lock(&c->state_lock);
-
-       if (ca->mi.state == BCH_MEMBER_STATE_RW &&
-           bch2_dev_is_online(ca)) {
-               bch_err(ca, "Cannot migrate data off RW device");
-               ret = -EINVAL;
-               goto err;
-       }
-
-       ret = bch2_dev_data_migrate(c, ca, 0);
-       if (ret) {
-               bch_err(ca, "Error migrating data: %i", ret);
-               goto err;
-       }
-
-       ret = bch2_journal_flush_device(&c->journal, ca->dev_idx);
-       if (ret) {
-               bch_err(ca, "Migrate failed: error %i flushing journal", ret);
-               goto err;
-       }
-
-       data = bch2_dev_has_data(c, ca);
-       if (data) {
-               char buf[100];
-
-               bch2_scnprint_flag_list(buf, sizeof(buf),
-                                       bch2_data_types, data);
-               bch_err(ca, "Migrate failed, still has data (%s)", buf);
-               ret = -EINVAL;
-               goto err;
-       }
-err:
-       mutex_unlock(&c->state_lock);
-       return ret;
-}
-
 int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 {
        struct bch_member *mi;
diff --git a/libbcachefs/super.h b/libbcachefs/super.h
index a35ee3db0ea2b8ce986d0436743f4b93d7e4cbb6..d0a38cf6750882e38dfcece4123e202c71f705e3 100644
@@ -30,7 +30,7 @@ static inline bool bch2_dev_is_online(struct bch_dev *ca)
        return ca->disk_sb.bdev != NULL;
 }
 
-static inline unsigned dev_mask_nr(struct bch_devs_mask *devs)
+static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
 {
        return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX);
 }
@@ -68,7 +68,7 @@ static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
 }
 
 static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
-                                             struct bch_devs_mask *mask)
+                                             const struct bch_devs_mask *mask)
 {
        struct bch_dev *ca = NULL;
 
@@ -188,7 +188,6 @@ int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int);
 int bch2_dev_add(struct bch_fs *, const char *);
 int bch2_dev_online(struct bch_fs *, const char *);
 int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int);
-int bch2_dev_evacuate(struct bch_fs *, struct bch_dev *);
 int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
 
 bool bch2_fs_emergency_read_only(struct bch_fs *);
diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h
index 35c8bebf683f3deebe212d847233fc2224f07911..966da4afbeda99fb7bc4341798ff5b5e1acfeeac 100644
@@ -22,6 +22,7 @@ struct bch_member_cpu {
        u64                     nbuckets;       /* device size */
        u16                     first_bucket;   /* index of first bucket used */
        u16                     bucket_size;    /* sectors */
+       u16                     group;
        u8                      state;
        u8                      tier;
        u8                      replacement;
@@ -42,4 +43,15 @@ struct bch_replicas_cpu {
        struct bch_replicas_cpu_entry entries[];
 };
 
+struct bch_disk_group_cpu {
+       struct bch_devs_mask            devs;
+       bool                            deleted;
+};
+
+struct bch_disk_groups_cpu {
+       struct rcu_head                 rcu;
+       unsigned                        nr;
+       struct bch_disk_group_cpu       entries[];
+};
+
 #endif /* _BCACHEFS_SUPER_TYPES_H */
diff --git a/libbcachefs/tier.c b/libbcachefs/tier.c
index 6a581097a7e6d8f961deffb3c79d4a7284c8afc3..c4625c80bbf80db449bb66ea611b1bc327ef1dd9 100644
@@ -44,6 +44,7 @@ static int bch2_tiering_thread(void *arg)
        unsigned long last;
        unsigned i, nr_devices;
 
+       memset(&move_stats, 0, sizeof(move_stats));
        set_freezable();
 
        while (!kthread_should_stop()) {
@@ -91,6 +92,7 @@ static int bch2_tiering_thread(void *arg)
                               writepoint_ptr(&tier->wp),
                               0,
                               -1,
+                              POS_MIN, POS_MAX,
                               tiering_pred, tier,
                               &move_stats);
        }
diff --git a/linux/bio.c b/linux/bio.c
index d8256989b99d7012aba6c20c8d3bfd3ce08f6fea..79f50dc28d19de917c2f0ac6b4b4e8cb6938ae9d 100644
@@ -163,6 +163,15 @@ struct bio *bio_split(struct bio *bio, int sectors,
        return split;
 }
 
+void bio_free_pages(struct bio *bio)
+{
+       struct bio_vec *bvec;
+       int i;
+
+       bio_for_each_segment_all(bvec, bio, i)
+               __free_page(bvec->bv_page);
+}
+
 int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
 {
        int i;
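
bio_free_pages() is the cleanup counterpart to bio_alloc_pages(), matching the kernel helper of the same name; the new err_free_pages path in move.c relies on this pairing. A minimal usage sketch:

    if (bio_alloc_pages(bio, GFP_KERNEL))  /* fill bvecs with fresh pages */
            return -ENOMEM;

    /* ... submit I/O, wait for completion ... */

    bio_free_pages(bio);                   /* free every bvec page */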