git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to edf5f38218 bcachefs: Refactor superblock code
author    Kent Overstreet <kent.overstreet@gmail.com>
          Tue, 10 Apr 2018 23:19:09 +0000 (19:19 -0400)
committer Kent Overstreet <kent.overstreet@gmail.com>
          Tue, 10 Apr 2018 23:23:58 +0000 (19:23 -0400)
59 files changed:
.bcachefs_revision
cmd_key.c
cmd_migrate.c
include/linux/bug.h
include/trace/events/bcachefs.h
libbcachefs.c
libbcachefs/alloc.c
libbcachefs/alloc.h
libbcachefs/alloc_types.h
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bkey_methods.c
libbcachefs/bkey_methods.h
libbcachefs/btree_gc.c
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_types.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_interior.h
libbcachefs/btree_update_leaf.c
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/buckets_types.h
libbcachefs/chardev.c
libbcachefs/checksum.c
libbcachefs/checksum.h
libbcachefs/clock_types.h
libbcachefs/compress.c
libbcachefs/debug.c
libbcachefs/dirent.c
libbcachefs/dirent.h
libbcachefs/disk_groups.c [new file with mode: 0644]
libbcachefs/disk_groups.h [new file with mode: 0644]
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/fs-io.c
libbcachefs/inode.c
libbcachefs/inode.h
libbcachefs/io.c
libbcachefs/io.h
libbcachefs/journal.c
libbcachefs/journal.h
libbcachefs/migrate.c
libbcachefs/move.c
libbcachefs/movinggc.c
libbcachefs/opts.c
libbcachefs/quota.c
libbcachefs/quota.h
libbcachefs/replicas.c [new file with mode: 0644]
libbcachefs/replicas.h [new file with mode: 0644]
libbcachefs/super-io.c
libbcachefs/super-io.h
libbcachefs/super.c
libbcachefs/super.h
libbcachefs/super_types.h
libbcachefs/sysfs.c
libbcachefs/tier.c
libbcachefs/xattr.c
libbcachefs/xattr.h

index 641ae5feed73dba2a834bbbc11a6cf332b872e97..a7c36b9e6aa19deea413b41ca2300c4fdcc7d07e 100644 (file)
@@ -1 +1 @@
-9fc6ccd8659598d4ca885220a795889071b619f4
+edf5f38218f699e53913a549465f35d36c4418f7
index 0ca591c50f909e47471643f95ad91b2d717b9b60..6052cb0061762b8ed8d6a1f4c56dfc8c7e54ee41 100644 (file)
--- a/cmd_key.c
+++ b/cmd_key.c
@@ -86,7 +86,7 @@ int cmd_set_passphrase(int argc, char *argv[])
        if (IS_ERR(c))
                die("Error opening %s: %s", argv[1], strerror(-PTR_ERR(c)));
 
-       struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb);
+       struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb.sb);
        if (!crypt)
                die("Filesystem does not have encryption enabled");
 
@@ -100,7 +100,7 @@ int cmd_set_passphrase(int argc, char *argv[])
        char *new_passphrase = read_passphrase_twice("Enter new passphrase: ");
        struct bch_key passphrase_key = derive_passphrase(crypt, new_passphrase);
 
-       if (bch2_chacha_encrypt_key(&passphrase_key, __bch2_sb_key_nonce(c->disk_sb),
+       if (bch2_chacha_encrypt_key(&passphrase_key, __bch2_sb_key_nonce(c->disk_sb.sb),
                                    &new_key, sizeof(new_key)))
                die("error encrypting key");
        crypt->key = new_key;
@@ -123,7 +123,7 @@ int cmd_remove_passphrase(int argc, char *argv[])
        if (IS_ERR(c))
                die("Error opening %s: %s", argv[1], strerror(-PTR_ERR(c)));
 
-       struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb);
+       struct bch_sb_field_crypt *crypt = bch2_sb_get_crypt(c->disk_sb.sb);
        if (!crypt)
                die("Filesystem does not have encryption enabled");
 
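Note: the c->disk_sb.sb accesses above come from the refactor's new superblock
handle: in the bcachefs.h hunk further down, struct bch_sb *disk_sb plus
disk_sb_order is replaced by a single struct bch_sb_handle disk_sb. A minimal
sketch of such a handle, assuming it bundles the superblock pointer with its
allocation size and backing device (field names beyond .sb are illustrative,
not taken from this diff):

	struct bch_sb_handle {
		struct bch_sb		*sb;		/* the superblock itself */
		struct block_device	*bdev;		/* backing device, if opened */
		unsigned		page_order;	/* log2 of pages allocated for @sb */
	};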
index a42d11ec958f551c213f11755adcff8a6eb7bb2a..db20b71c15dfe811547bec097f60430c129b450e 100644 (file)
@@ -31,6 +31,7 @@
 #include "libbcachefs/fs.h"
 #include "libbcachefs/inode.h"
 #include "libbcachefs/io.h"
+#include "libbcachefs/replicas.h"
 #include "libbcachefs/str_hash.h"
 #include "libbcachefs/super.h"
 #include "libbcachefs/xattr.h"
index e25568c848bd443569d8e279bffba474eb763812..f8929688cfd9d5855bda42bbed9a86b1e48fddbe 100644 (file)
@@ -15,7 +15,7 @@
 #define BUG_ON(cond)           assert(!(cond))
 
 #define WARN_ON_ONCE(cond)     ({ bool _r = (cond); if (_r) assert(0); _r; })
-#define WARN_ONCE(cond, msg)   ({ bool _r = (cond); if (_r) assert(0); _r; })
+#define WARN_ONCE(cond, ...)   ({ bool _r = (cond); if (_r) assert(0); _r; })
 
 #define __WARN()               assert(0)
 #define __WARN_printf(arg...)  assert(0)
index a7be2d8222d89b3a48637045d4e2033811d097bb..a34574ca8b0c3c72f606fe7157da57f1716629c3 100644 (file)
@@ -319,7 +319,7 @@ TRACE_EVENT(btree_gc_coalesce_fail,
 
        TP_fast_assign(
                __entry->reason         = reason;
-               memcpy(__entry->uuid, c->disk_sb->user_uuid.b, 16);
+               memcpy(__entry->uuid, c->disk_sb.sb->user_uuid.b, 16);
        ),
 
        TP_printk("%pU: %u", __entry->uuid, __entry->reason)
index 052ca35bcfa7dfd99828b8ba897eb666485ddb5e..a6eb9889452fff8a2636eff57c842bd2369b68e3 100644 (file)
 
 #include <uuid/uuid.h>
 
-#include "libbcachefs/bcachefs_format.h"
-#include "libbcachefs/checksum.h"
-#include "crypto.h"
 #include "libbcachefs.h"
+#include "crypto.h"
+#include "libbcachefs/bcachefs_format.h"
 #include "libbcachefs/btree_cache.h"
+#include "libbcachefs/checksum.h"
+#include "libbcachefs/disk_groups.h"
 #include "libbcachefs/opts.h"
+#include "libbcachefs/replicas.h"
 #include "libbcachefs/super-io.h"
 
 #define NSEC_PER_SEC   1000000000L
@@ -124,8 +126,8 @@ void bch2_pick_bucket_size(struct format_opts opts, struct dev_opts *dev)
 
 }
 
-static unsigned parse_target(struct dev_opts *devs, size_t nr_devs,
-                            struct bch_sb_field_disk_groups *gi,
+static unsigned parse_target(struct bch_sb_handle *sb,
+                            struct dev_opts *devs, size_t nr_devs,
                             const char *s)
 {
        struct dev_opts *i;
@@ -138,7 +140,7 @@ static unsigned parse_target(struct dev_opts *devs, size_t nr_devs,
                if (!strcmp(s, i->path))
                        return dev_to_target(i - devs);
 
-       idx = __bch2_disk_group_find(gi, s);
+       idx = bch2_disk_path_find(sb, s);
        if (idx >= 0)
                return group_to_target(idx);
 
@@ -149,11 +151,9 @@ static unsigned parse_target(struct dev_opts *devs, size_t nr_devs,
 struct bch_sb *bch2_format(struct format_opts opts,
                           struct dev_opts *devs, size_t nr_devs)
 {
-       struct bch_sb *sb;
+       struct bch_sb_handle sb = { NULL };
        struct dev_opts *i;
        struct bch_sb_field_members *mi;
-       struct bch_sb_field_disk_groups *gi = NULL;
-       unsigned u64s;
 
        /* calculate block size: */
        if (!opts.block_size)
@@ -184,58 +184,51 @@ struct bch_sb *bch2_format(struct format_opts opts,
        if (uuid_is_null(opts.uuid.b))
                uuid_generate(opts.uuid.b);
 
-       sb = calloc(1, sizeof(*sb) +
-                   sizeof(struct bch_sb_field_members) +
-                   sizeof(struct bch_member) * nr_devs +
-                   sizeof(struct bch_sb_field_disk_groups) +
-                   sizeof(struct bch_disk_group) * nr_devs +
-                   sizeof(struct bch_sb_field_crypt));
+       if (bch2_sb_realloc(&sb, 0))
+               die("insufficient memory");
 
-       sb->version     = cpu_to_le64(BCH_SB_VERSION_MAX);
-       sb->magic       = BCACHE_MAGIC;
-       sb->block_size  = cpu_to_le16(opts.block_size);
-       sb->user_uuid   = opts.uuid;
-       sb->nr_devices  = nr_devs;
+       sb.sb->version          = cpu_to_le64(BCH_SB_VERSION_MAX);
+       sb.sb->magic            = BCACHE_MAGIC;
+       sb.sb->block_size       = cpu_to_le16(opts.block_size);
+       sb.sb->user_uuid        = opts.uuid;
+       sb.sb->nr_devices       = nr_devs;
 
-       uuid_generate(sb->uuid.b);
+       uuid_generate(sb.sb->uuid.b);
 
        if (opts.label)
-               strncpy((char *) sb->label, opts.label, sizeof(sb->label));
-
-       SET_BCH_SB_CSUM_TYPE(sb,                opts.meta_csum_type);
-       SET_BCH_SB_META_CSUM_TYPE(sb,           opts.meta_csum_type);
-       SET_BCH_SB_DATA_CSUM_TYPE(sb,           opts.data_csum_type);
-       SET_BCH_SB_COMPRESSION_TYPE(sb,         opts.compression_type);
-       SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(sb, opts.background_compression_type);
-
-       SET_BCH_SB_BTREE_NODE_SIZE(sb,          opts.btree_node_size);
-       SET_BCH_SB_GC_RESERVE(sb,               8);
-       SET_BCH_SB_META_REPLICAS_WANT(sb,       opts.meta_replicas);
-       SET_BCH_SB_META_REPLICAS_REQ(sb,        opts.meta_replicas_required);
-       SET_BCH_SB_DATA_REPLICAS_WANT(sb,       opts.data_replicas);
-       SET_BCH_SB_DATA_REPLICAS_REQ(sb,        opts.data_replicas_required);
-       SET_BCH_SB_ERROR_ACTION(sb,             opts.on_error_action);
-       SET_BCH_SB_STR_HASH_TYPE(sb,            BCH_STR_HASH_SIPHASH);
-       SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb,  ilog2(opts.encoded_extent_max));
-
-       SET_BCH_SB_POSIX_ACL(sb,                1);
+               strncpy((char *) sb.sb->label, opts.label, sizeof(sb.sb->label));
+
+       SET_BCH_SB_CSUM_TYPE(sb.sb,             opts.meta_csum_type);
+       SET_BCH_SB_META_CSUM_TYPE(sb.sb,        opts.meta_csum_type);
+       SET_BCH_SB_DATA_CSUM_TYPE(sb.sb,        opts.data_csum_type);
+       SET_BCH_SB_COMPRESSION_TYPE(sb.sb,      opts.compression_type);
+       SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(sb.sb,
+                                               opts.background_compression_type);
+
+       SET_BCH_SB_BTREE_NODE_SIZE(sb.sb,       opts.btree_node_size);
+       SET_BCH_SB_GC_RESERVE(sb.sb,            8);
+       SET_BCH_SB_META_REPLICAS_WANT(sb.sb,    opts.meta_replicas);
+       SET_BCH_SB_META_REPLICAS_REQ(sb.sb,     opts.meta_replicas_required);
+       SET_BCH_SB_DATA_REPLICAS_WANT(sb.sb,    opts.data_replicas);
+       SET_BCH_SB_DATA_REPLICAS_REQ(sb.sb,     opts.data_replicas_required);
+       SET_BCH_SB_ERROR_ACTION(sb.sb,          opts.on_error_action);
+       SET_BCH_SB_STR_HASH_TYPE(sb.sb,         BCH_STR_HASH_SIPHASH);
+       SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb.sb,ilog2(opts.encoded_extent_max));
+
+       SET_BCH_SB_POSIX_ACL(sb.sb,             1);
 
        struct timespec now;
        if (clock_gettime(CLOCK_REALTIME, &now))
                die("error getting current time: %m");
 
-       sb->time_base_lo        = cpu_to_le64(now.tv_sec * NSEC_PER_SEC + now.tv_nsec);
-       sb->time_precision      = cpu_to_le32(1);
-
-       mi = vstruct_end(sb);
-       u64s = (sizeof(struct bch_sb_field_members) +
-               sizeof(struct bch_member) * nr_devs) / sizeof(u64);
-
-       le32_add_cpu(&sb->u64s, u64s);
-       le32_add_cpu(&mi->field.u64s, u64s);
-       mi->field.type = BCH_SB_FIELD_members;
+       sb.sb->time_base_lo     = cpu_to_le64(now.tv_sec * NSEC_PER_SEC + now.tv_nsec);
+       sb.sb->time_precision   = cpu_to_le32(1);
 
        /* Member info: */
+       mi = bch2_sb_resize_members(&sb,
+                       (sizeof(*mi) + sizeof(struct bch_member) *
+                        nr_devs) / sizeof(u64));
+
        for (i = devs; i < devs + nr_devs; i++) {
                struct bch_member *m = mi->members + (i - devs);
 
@@ -253,63 +246,38 @@ struct bch_sb *bch2_format(struct format_opts opts,
        /* Disk groups */
        for (i = devs; i < devs + nr_devs; i++) {
                struct bch_member *m = mi->members + (i - devs);
-               struct bch_disk_group *g;
-               size_t len;
                int idx;
 
                if (!i->group)
                        continue;
 
-               len = min_t(size_t, strlen(i->group) + 1, BCH_SB_LABEL_SIZE);
-
-               if (!gi) {
-                       gi = vstruct_end(sb);
-                       u64s = sizeof(*gi) / sizeof(u64);
-                       le32_add_cpu(&sb->u64s, u64s);
-                       le32_add_cpu(&gi->field.u64s, u64s);
-                       gi->field.type = BCH_SB_FIELD_disk_groups;
-               }
-
-               idx = __bch2_disk_group_find(gi, i->group);
-               if (idx >= 0) {
-                       g = gi->entries + idx;
-               } else {
-                       u64s = sizeof(*g) / sizeof(u64);
-                       g = vstruct_end(&gi->field);
-                       le32_add_cpu(&sb->u64s, u64s);
-                       le32_add_cpu(&gi->field.u64s, u64s);
-                       memcpy(g->label, i->group, len);
-                       SET_BCH_GROUP_DATA_ALLOWED(g, ~0);
-               }
+               idx = bch2_disk_path_find_or_create(&sb, i->group);
+               if (idx < 0)
+                       die("error creating disk path: %s", idx);
 
-               SET_BCH_MEMBER_GROUP(m, (g - gi->entries) + 1);
+               SET_BCH_MEMBER_GROUP(m, idx + 1);
        }
 
-       SET_BCH_SB_FOREGROUND_TARGET(sb,
-               parse_target(devs, nr_devs, gi, opts.foreground_target));
-       SET_BCH_SB_BACKGROUND_TARGET(sb,
-               parse_target(devs, nr_devs, gi, opts.background_target));
-       SET_BCH_SB_PROMOTE_TARGET(sb,
-               parse_target(devs, nr_devs, gi, opts.promote_target));
+       SET_BCH_SB_FOREGROUND_TARGET(sb.sb,
+               parse_target(&sb, devs, nr_devs, opts.foreground_target));
+       SET_BCH_SB_BACKGROUND_TARGET(sb.sb,
+               parse_target(&sb, devs, nr_devs, opts.background_target));
+       SET_BCH_SB_PROMOTE_TARGET(sb.sb,
+               parse_target(&sb, devs, nr_devs, opts.promote_target));
 
        /* Crypt: */
        if (opts.encrypted) {
-               struct bch_sb_field_crypt *crypt = vstruct_end(sb);
-
-               u64s = sizeof(struct bch_sb_field_crypt) / sizeof(u64);
-
-               le32_add_cpu(&sb->u64s, u64s);
-               crypt->field.u64s = cpu_to_le32(u64s);
-               crypt->field.type = BCH_SB_FIELD_crypt;
+               struct bch_sb_field_crypt *crypt =
+                       bch2_sb_resize_crypt(&sb, sizeof(*crypt) / sizeof(u64));
 
-               bch_sb_crypt_init(sb, crypt, opts.passphrase);
-               SET_BCH_SB_ENCRYPTION_TYPE(sb, 1);
+               bch_sb_crypt_init(sb.sb, crypt, opts.passphrase);
+               SET_BCH_SB_ENCRYPTION_TYPE(sb.sb, 1);
        }
 
        for (i = devs; i < devs + nr_devs; i++) {
-               sb->dev_idx = i - devs;
+               sb.sb->dev_idx = i - devs;
 
-               init_layout(&sb->layout, opts.block_size,
+               init_layout(&sb.sb->layout, opts.block_size,
                            i->sb_offset, i->sb_end);
 
                if (i->sb_offset == BCH_SB_SECTOR) {
@@ -319,11 +287,11 @@ struct bch_sb *bch2_format(struct format_opts opts,
                        xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0);
                }
 
-               bch2_super_write(i->fd, sb);
+               bch2_super_write(i->fd, sb.sb);
                close(i->fd);
        }
 
-       return sb;
+       return sb.sb;
 }
 
 void bch2_super_write(int fd, struct bch_sb *sb)
@@ -553,11 +521,11 @@ static void bch2_sb_print_disk_groups(struct bch_sb *sb, struct bch_sb_field *f,
 
 typedef void (*sb_field_print_fn)(struct bch_sb *, struct bch_sb_field *, enum units);
 
-struct bch_sb_field_ops {
+struct bch_sb_field_toolops {
        sb_field_print_fn       print;
 };
 
-static const struct bch_sb_field_ops bch2_sb_field_ops[] = {
+static const struct bch_sb_field_toolops bch2_sb_field_ops[] = {
 #define x(f, nr)                                       \
        [BCH_SB_FIELD_##f] = {                          \
                .print  = bch2_sb_print_##f,            \
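Note: the open-coded vstruct appends that bch2_format() used to do are
replaced above by typed helpers (bch2_sb_resize_members(),
bch2_sb_resize_crypt()) that grow the superblock buffer as needed. A sketch of
how such a wrapper might sit on top of a generic field-resize function,
assuming an interface along these lines (bch2_sb_field_resize() itself is not
shown in this diff):

	static inline struct bch_sb_field_members *
	bch2_sb_resize_members(struct bch_sb_handle *sb, unsigned u64s)
	{
		/* reallocates sb->sb if needed, then returns the
		 * (possibly moved) members field, resized to @u64s: */
		return (struct bch_sb_field_members *)
			bch2_sb_field_resize(sb, BCH_SB_FIELD_members, u64s);
	}

This keeps the sb->u64s and field.u64s bookkeeping in one place instead of
repeating it at every call site.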
index ede44f73b7b4d9f77f6544a969066e67e5dbb246..16bdc48c759e84c57bd2a703c90a12f93be4168c 100644 (file)
 #include "btree_cache.h"
 #include "btree_io.h"
 #include "btree_update.h"
+#include "btree_update_interior.h"
 #include "btree_gc.h"
 #include "buckets.h"
 #include "checksum.h"
 #include "clock.h"
 #include "debug.h"
+#include "disk_groups.h"
 #include "error.h"
 #include "extents.h"
 #include "io.h"
@@ -79,7 +81,7 @@
 #include <linux/sort.h>
 #include <trace/events/bcachefs.h>
 
-static void bch2_recalc_min_prio(struct bch_fs *, struct bch_dev *, int);
+static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
 
 /* Ratelimiting/PD controllers */
 
@@ -130,8 +132,7 @@ static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
        return DIV_ROUND_UP(bytes, sizeof(u64));
 }
 
-static const char *bch2_alloc_invalid(const struct bch_fs *c,
-                                     struct bkey_s_c k)
+const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
        if (k.k->p.inode >= c->sb.nr_devices ||
            !c->devs[k.k->p.inode])
@@ -152,8 +153,8 @@ static const char *bch2_alloc_invalid(const struct bch_fs *c,
        return NULL;
 }
 
-static void bch2_alloc_to_text(struct bch_fs *c, char *buf,
-                              size_t size, struct bkey_s_c k)
+void bch2_alloc_to_text(struct bch_fs *c, char *buf,
+                       size_t size, struct bkey_s_c k)
 {
        buf[0] = '\0';
 
@@ -163,11 +164,6 @@ static void bch2_alloc_to_text(struct bch_fs *c, char *buf,
        }
 }
 
-const struct bkey_ops bch2_bkey_alloc_ops = {
-       .key_invalid    = bch2_alloc_invalid,
-       .val_to_text    = bch2_alloc_to_text,
-};
-
 static inline unsigned get_alloc_field(const u8 **p, unsigned bytes)
 {
        unsigned v;
@@ -236,9 +232,9 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k)
 
        d = a.v->data;
        if (a.v->fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
-               g->prio[READ] = get_alloc_field(&d, 2);
+               g->io_time[READ] = get_alloc_field(&d, 2);
        if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
-               g->prio[WRITE] = get_alloc_field(&d, 2);
+               g->io_time[WRITE] = get_alloc_field(&d, 2);
 
        lg_local_unlock(&c->usage_lock);
 }
@@ -270,21 +266,21 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list)
                                bch2_alloc_read_key(c, bkey_i_to_s_c(k));
        }
 
-       mutex_lock(&c->prio_clock[READ].lock);
+       mutex_lock(&c->bucket_clock[READ].lock);
        for_each_member_device(ca, c, i) {
                down_read(&ca->bucket_lock);
-               bch2_recalc_min_prio(c, ca, READ);
+               bch2_recalc_oldest_io(c, ca, READ);
                up_read(&ca->bucket_lock);
        }
-       mutex_unlock(&c->prio_clock[READ].lock);
+       mutex_unlock(&c->bucket_clock[READ].lock);
 
-       mutex_lock(&c->prio_clock[WRITE].lock);
+       mutex_lock(&c->bucket_clock[WRITE].lock);
        for_each_member_device(ca, c, i) {
                down_read(&ca->bucket_lock);
-               bch2_recalc_min_prio(c, ca, WRITE);
+               bch2_recalc_oldest_io(c, ca, WRITE);
                up_read(&ca->bucket_lock);
        }
-       mutex_unlock(&c->prio_clock[WRITE].lock);
+       mutex_unlock(&c->bucket_clock[WRITE].lock);
 
        return 0;
 }
@@ -320,9 +316,9 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
 
                d = a->v.data;
                if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME))
-                       put_alloc_field(&d, 2, g->prio[READ]);
+                       put_alloc_field(&d, 2, g->io_time[READ]);
                if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME))
-                       put_alloc_field(&d, 2, g->prio[WRITE]);
+                       put_alloc_field(&d, 2, g->io_time[WRITE]);
                lg_local_unlock(&c->usage_lock);
 
                ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq,
@@ -395,38 +391,34 @@ int bch2_alloc_write(struct bch_fs *c)
 
 /* Bucket IO clocks: */
 
-static void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw)
+static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
 {
-       struct prio_clock *clock = &c->prio_clock[rw];
+       struct bucket_clock *clock = &c->bucket_clock[rw];
        struct bucket_array *buckets = bucket_array(ca);
        struct bucket *g;
-       u16 max_delta = 1;
+       u16 max_last_io = 0;
        unsigned i;
 
-       lockdep_assert_held(&c->prio_clock[rw].lock);
+       lockdep_assert_held(&c->bucket_clock[rw].lock);
 
-       /* Determine min prio for this particular device */
+       /* Recalculate max_last_io for this device: */
        for_each_bucket(g, buckets)
-               max_delta = max(max_delta, (u16) (clock->hand - g->prio[rw]));
+               max_last_io = max(max_last_io, bucket_last_io(c, g, rw));
 
-       ca->min_prio[rw] = clock->hand - max_delta;
+       ca->max_last_bucket_io[rw] = max_last_io;
 
-       /*
-        * This may possibly increase the min prio for the whole device, check
-        * that as well.
-        */
-       max_delta = 1;
+       /* Recalculate global max_last_io: */
+       max_last_io = 0;
 
        for_each_member_device(ca, c, i)
-               max_delta = max(max_delta,
-                               (u16) (clock->hand - ca->min_prio[rw]));
+               max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]);
 
-       clock->min_prio = clock->hand - max_delta;
+       clock->max_last_io = max_last_io;
 }
 
-static void bch2_rescale_prios(struct bch_fs *c, int rw)
+static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
 {
-       struct prio_clock *clock = &c->prio_clock[rw];
+       struct bucket_clock *clock = &c->bucket_clock[rw];
        struct bucket_array *buckets;
        struct bch_dev *ca;
        struct bucket *g;
@@ -439,10 +431,10 @@ static void bch2_rescale_prios(struct bch_fs *c, int rw)
                buckets = bucket_array(ca);
 
                for_each_bucket(g, buckets)
-                       g->prio[rw] = clock->hand -
-                       (clock->hand - g->prio[rw]) / 2;
+                       g->io_time[rw] = clock->hand -
+                       bucket_last_io(c, g, rw) / 2;
 
-               bch2_recalc_min_prio(c, ca, rw);
+               bch2_recalc_oldest_io(c, ca, rw);
 
                up_read(&ca->bucket_lock);
        }
@@ -450,19 +442,26 @@ static void bch2_rescale_prios(struct bch_fs *c, int rw)
 
 static void bch2_inc_clock_hand(struct io_timer *timer)
 {
-       struct prio_clock *clock = container_of(timer,
-                                               struct prio_clock, rescale);
+       struct bucket_clock *clock = container_of(timer,
+                                               struct bucket_clock, rescale);
        struct bch_fs *c = container_of(clock,
-                                       struct bch_fs, prio_clock[clock->rw]);
+                                       struct bch_fs, bucket_clock[clock->rw]);
+       struct bch_dev *ca;
        u64 capacity;
+       unsigned i;
 
        mutex_lock(&clock->lock);
 
-       clock->hand++;
-
        /* if clock cannot be advanced more, rescale prio */
-       if (clock->hand == (u16) (clock->min_prio - 1))
-               bch2_rescale_prios(c, clock->rw);
+       if (clock->max_last_io >= U16_MAX - 2)
+               bch2_rescale_bucket_io_times(c, clock->rw);
+
+       BUG_ON(clock->max_last_io >= U16_MAX - 2);
+
+       for_each_member_device(ca, c, i)
+               ca->max_last_bucket_io[clock->rw]++;
+       clock->max_last_io++;
+       clock->hand++;
 
        mutex_unlock(&clock->lock);
 
@@ -484,9 +483,9 @@ static void bch2_inc_clock_hand(struct io_timer *timer)
        bch2_io_timer_add(&c->io_clock[clock->rw], timer);
 }
 
-static void bch2_prio_timer_init(struct bch_fs *c, int rw)
+static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
 {
-       struct prio_clock *clock = &c->prio_clock[rw];
+       struct bucket_clock *clock = &c->bucket_clock[rw];
 
        clock->hand             = 1;
        clock->rw               = rw;
@@ -536,7 +535,7 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
        while (1) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (kthread_should_stop()) {
-                       ret = -1;
+                       ret = 1;
                        break;
                }
 
@@ -635,13 +634,14 @@ static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
 static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
                                     size_t b, struct bucket_mark m)
 {
+       unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
+       unsigned max_last_io = ca->max_last_bucket_io[READ];
+
        /*
         * Time since last read, scaled to [0, 8) where larger value indicates
         * more recently read data:
         */
-       unsigned long hotness =
-               (bucket(ca, b)->prio[READ]      - ca->min_prio[READ]) * 7 /
-               (c->prio_clock[READ].hand       - ca->min_prio[READ]);
+       unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io;
 
        /* How much we want to keep the data in this bucket: */
        unsigned long data_wantness =
@@ -659,23 +659,25 @@ static inline int bucket_alloc_cmp(alloc_heap *h,
                                   struct alloc_heap_entry l,
                                   struct alloc_heap_entry r)
 {
-       return (l.key > r.key) - (l.key < r.key);
+       return (l.key > r.key) - (l.key < r.key) ?:
+               (l.nr < r.nr)  - (l.nr  > r.nr) ?:
+               (l.bucket > r.bucket) - (l.bucket < r.bucket);
 }
 
 static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
 {
        struct bucket_array *buckets;
-       struct alloc_heap_entry e;
+       struct alloc_heap_entry e = { 0 };
        size_t b;
 
        ca->alloc_heap.used = 0;
 
-       mutex_lock(&c->prio_clock[READ].lock);
+       mutex_lock(&c->bucket_clock[READ].lock);
        down_read(&ca->bucket_lock);
 
        buckets = bucket_array(ca);
 
-       bch2_recalc_min_prio(c, ca, READ);
+       bch2_recalc_oldest_io(c, ca, READ);
 
        /*
         * Find buckets with lowest read priority, by building a maxheap sorted
@@ -684,30 +686,45 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
         */
        for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
                struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
+               unsigned long key = bucket_sort_key(c, ca, b, m);
 
                if (!bch2_can_invalidate_bucket(ca, b, m))
                        continue;
 
-               e = (struct alloc_heap_entry) {
-                       .bucket = b,
-                       .key    = bucket_sort_key(c, ca, b, m)
-               };
+               if (e.nr && e.bucket + e.nr == b && e.key == key) {
+                       e.nr++;
+               } else {
+                       if (e.nr)
+                               heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
+
+                       e = (struct alloc_heap_entry) {
+                               .bucket = b,
+                               .nr     = 1,
+                               .key    = key,
+                       };
+               }
 
-               heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
+               cond_resched();
        }
 
+       if (e.nr)
+               heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp);
+
        up_read(&ca->bucket_lock);
-       mutex_unlock(&c->prio_clock[READ].lock);
+       mutex_unlock(&c->bucket_clock[READ].lock);
 
        heap_resort(&ca->alloc_heap, bucket_alloc_cmp);
 
-       /*
-        * If we run out of buckets to invalidate, bch2_allocator_thread() will
-        * kick stuff and retry us
-        */
-       while (!fifo_full(&ca->free_inc) &&
-              heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp))
-               bch2_invalidate_one_bucket(c, ca, e.bucket);
+       while (heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp)) {
+               for (b = e.bucket;
+                    b < e.bucket + e.nr;
+                    b++) {
+                       if (fifo_full(&ca->free_inc))
+                               return;
+
+                       bch2_invalidate_one_bucket(c, ca, b);
+               }
+       }
 }
 
 static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
@@ -729,6 +746,8 @@ static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
 
                if (bch2_can_invalidate_bucket(ca, b, m))
                        bch2_invalidate_one_bucket(c, ca, b);
+
+               cond_resched();
        }
 }
 
@@ -749,6 +768,8 @@ static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca
 
                if (bch2_can_invalidate_bucket(ca, b, m))
                        bch2_invalidate_one_bucket(c, ca, b);
+
+               cond_resched();
        }
 }
 
@@ -850,7 +871,7 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t
 
                if ((current->flags & PF_KTHREAD) &&
                    kthread_should_stop()) {
-                       ret = -1;
+                       ret = 1;
                        break;
                }
 
@@ -880,7 +901,7 @@ static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca)
                                             ca->mi.bucket_size, GFP_NOIO, 0);
 
                if (push_invalidated_bucket(c, ca, bucket))
-                       return -1;
+                       return 1;
        }
 
        return 0;
@@ -905,17 +926,32 @@ static int bch2_allocator_thread(void *arg)
 
        while (1) {
                while (1) {
+                       cond_resched();
+
+                       pr_debug("discarding %zu invalidated buckets",
+                                ca->nr_invalidated);
+
                        ret = discard_invalidated_buckets(c, ca);
                        if (ret)
-                               return 0;
+                               goto stop;
 
                        if (fifo_empty(&ca->free_inc))
                                break;
 
+                       pr_debug("invalidating %zu buckets",
+                                fifo_used(&ca->free_inc));
+
                        journal_seq = 0;
                        ret = bch2_invalidate_free_inc(c, ca, &journal_seq, SIZE_MAX);
-                       if (ret)
-                               return 0;
+                       if (ret) {
+                               bch_err(ca, "error invalidating buckets: %i", ret);
+                               goto stop;
+                       }
+
+                       if (!ca->nr_invalidated) {
+                               bch_err(ca, "allocator thread unable to make forward progress!");
+                               goto stop;
+                       }
 
                        if (ca->allocator_invalidating_data)
                                ret = bch2_journal_flush_seq(&c->journal, journal_seq);
@@ -927,22 +963,29 @@ static int bch2_allocator_thread(void *arg)
                         * journal error - buckets haven't actually been
                         * invalidated, can't discard them:
                         */
-                       if (ret)
-                               return 0;
+                       if (ret) {
+                               bch_err(ca, "journal error: %i", ret);
+                               goto stop;
+                       }
                }
 
+               pr_debug("free_inc now empty");
+
                /* Reset front/back so we can easily sort fifo entries later: */
                ca->free_inc.front = ca->free_inc.back  = 0;
                ca->allocator_journal_seq_flush         = 0;
                ca->allocator_invalidating_data         = false;
 
                down_read(&c->gc_lock);
-               if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
-                       up_read(&c->gc_lock);
-                       return 0;
-               }
-
                while (1) {
+                       size_t prev = fifo_used(&ca->free_inc);
+
+                       if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
+                               up_read(&c->gc_lock);
+                               bch_err(ca, "gc failure");
+                               goto stop;
+                       }
+
                        /*
                         * Find some buckets that we can invalidate, either
                         * they're completely unused, or only contain clean data
@@ -950,7 +993,14 @@ static int bch2_allocator_thread(void *arg)
                         * another cache tier
                         */
 
+                       pr_debug("scanning for reclaimable buckets");
+
                        find_reclaimable_buckets(c, ca);
+
+                       pr_debug("found %zu buckets (free_inc %zu/%zu)",
+                                fifo_used(&ca->free_inc) - prev,
+                                fifo_used(&ca->free_inc), ca->free_inc.size);
+
                        trace_alloc_batch(ca, fifo_used(&ca->free_inc),
                                          ca->free_inc.size);
 
@@ -977,15 +1027,20 @@ static int bch2_allocator_thread(void *arg)
                        ca->allocator_blocked = true;
                        closure_wake_up(&c->freelist_wait);
 
-                       if (wait_buckets_available(c, ca)) {
+                       ret = wait_buckets_available(c, ca);
+                       if (ret) {
                                up_read(&c->gc_lock);
-                               return 0;
+                               goto stop;
                        }
                }
 
                ca->allocator_blocked = false;
                up_read(&c->gc_lock);
 
+               pr_debug("free_inc now %zu/%zu",
+                        fifo_used(&ca->free_inc),
+                        ca->free_inc.size);
+
                sort_free_inc(c, ca);
 
                /*
@@ -993,6 +1048,10 @@ static int bch2_allocator_thread(void *arg)
                 * write out the new bucket gens:
                 */
        }
+
+stop:
+       pr_debug("alloc thread stopping (ret %i)", ret);
+       return 0;
 }
 
 /* Allocation */
@@ -1046,8 +1105,8 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
        return ob;
 }
 
-/* _only_ for allocating the journal and btree roots on a brand new fs: */
-int bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
+/* _only_ for allocating the journal on a new device: */
+long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
 {
        struct bucket_array *buckets;
        ssize_t b;
@@ -1056,14 +1115,8 @@ int bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca)
        buckets = bucket_array(ca);
 
        for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++)
-               if (is_available_bucket(buckets->b[b].mark)) {
-                       bch2_mark_alloc_bucket(c, ca, b, true,
-                                       gc_pos_alloc(c, NULL),
-                                       BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
-                                       BCH_BUCKET_MARK_GC_LOCK_HELD);
-                       set_bit(b, ca->buckets_dirty);
+               if (is_available_bucket(buckets->b[b].mark))
                        goto success;
-               }
        b = -1;
 success:
        rcu_read_unlock();
@@ -1135,9 +1188,8 @@ int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
                break;
        }
 
-       if (unlikely(test_bit(BCH_FS_BRAND_NEW_FS, &c->flags)) &&
-           (bucket = bch2_bucket_alloc_startup(c, ca)) >= 0)
-               goto out;
+       if (cl)
+               closure_wait(&c->freelist_wait, cl);
 
        spin_unlock(&c->freelist_lock);
 
@@ -1218,7 +1270,7 @@ void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
                *v = *v < scale ? 0 : *v - scale;
 }
 
-static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
+static enum bucket_alloc_ret bch2_bucket_alloc_set(struct bch_fs *c,
                                        struct write_point *wp,
                                        unsigned nr_replicas,
                                        enum alloc_reserve reserve,
@@ -1284,52 +1336,22 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
                        break;
                }
        }
+       rcu_read_unlock();
 
        EBUG_ON(reserve == RESERVE_MOVINGGC &&
                ret != ALLOC_SUCCESS &&
                ret != OPEN_BUCKETS_EMPTY);
-       rcu_read_unlock();
-       return ret;
-}
-
-static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp,
-                               unsigned nr_replicas,
-                               enum alloc_reserve reserve,
-                               struct bch_devs_mask *devs,
-                               struct closure *cl)
-{
-       bool waiting = false;
-
-       while (1) {
-               switch (__bch2_bucket_alloc_set(c, wp, nr_replicas,
-                                               reserve, devs, cl)) {
-               case ALLOC_SUCCESS:
-                       if (waiting)
-                               closure_wake_up(&c->freelist_wait);
-
-                       return 0;
-
-               case NO_DEVICES:
-                       if (waiting)
-                               closure_wake_up(&c->freelist_wait);
-                       return -EROFS;
-
-               case FREELIST_EMPTY:
-                       if (!cl)
-                               return -ENOSPC;
 
-                       if (waiting)
-                               return -EAGAIN;
-
-                       /* Retry allocation after adding ourself to waitlist: */
-                       closure_wait(&c->freelist_wait, cl);
-                       waiting = true;
-                       break;
-               case OPEN_BUCKETS_EMPTY:
-                       return cl ? -EAGAIN : -ENOSPC;
-               default:
-                       BUG();
-               }
+       switch (ret) {
+       case ALLOC_SUCCESS:
+               return 0;
+       case NO_DEVICES:
+               return -EROFS;
+       case FREELIST_EMPTY:
+       case OPEN_BUCKETS_EMPTY:
+               return cl ? -EAGAIN : -ENOSPC;
+       default:
+               BUG();
        }
 }
 
@@ -1530,11 +1552,12 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
        nr_ptrs_have = wp->first_ptr;
 
        /* does writepoint have ptrs we don't want to use? */
-       writepoint_for_each_ptr(wp, ob, i)
-               if (!dev_idx_in_target(c, ob->ptr.dev, target)) {
-                       swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
-                       wp->first_ptr++;
-               }
+       if (target)
+               writepoint_for_each_ptr(wp, ob, i)
+                       if (!dev_idx_in_target(c, ob->ptr.dev, target)) {
+                               swap(wp->ptrs[i], wp->ptrs[wp->first_ptr]);
+                               wp->first_ptr++;
+                       }
 
        if (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) {
                ret = open_bucket_add_buckets(c, target, wp, devs_have,
@@ -1551,7 +1574,7 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
                                              nr_replicas, reserve, cl);
        }
 
-       if (ret)
+       if (ret && ret != -EROFS)
                goto err;
 alloc_done:
        /* check for more than one cache: */
@@ -1584,6 +1607,13 @@ alloc_done:
                nr_ptrs_effective += ca->mi.durability;
        }
 
+       if (ret == -EROFS &&
+           nr_ptrs_effective >= nr_replicas_required)
+               ret = 0;
+
+       if (ret)
+               goto err;
+
        if (nr_ptrs_effective > nr_replicas) {
                writepoint_for_each_ptr(wp, ob, i) {
                        ca = bch_dev_bkey_exists(c, ob->ptr.dev);
@@ -1749,14 +1779,14 @@ void bch2_recalc_capacity(struct bch_fs *c)
 
        if (c->capacity) {
                bch2_io_timer_add(&c->io_clock[READ],
-                                &c->prio_clock[READ].rescale);
+                                &c->bucket_clock[READ].rescale);
                bch2_io_timer_add(&c->io_clock[WRITE],
-                                &c->prio_clock[WRITE].rescale);
+                                &c->bucket_clock[WRITE].rescale);
        } else {
                bch2_io_timer_del(&c->io_clock[READ],
-                                &c->prio_clock[READ].rescale);
+                                &c->bucket_clock[READ].rescale);
                bch2_io_timer_del(&c->io_clock[WRITE],
-                                &c->prio_clock[WRITE].rescale);
+                                &c->bucket_clock[WRITE].rescale);
        }
 
        /* Wake up case someone was waiting for buckets */
@@ -1889,7 +1919,8 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
        if (ca->alloc_thread)
                return 0;
 
-       p = kthread_create(bch2_allocator_thread, ca, "bcache_allocator");
+       p = kthread_create(bch2_allocator_thread, ca,
+                          "bch_alloc[%s]", ca->name);
        if (IS_ERR(p))
                return PTR_ERR(p);
 
@@ -1923,7 +1954,7 @@ static void allocator_start_issue_discards(struct bch_fs *c)
 static int __bch2_fs_allocator_start(struct bch_fs *c)
 {
        struct bch_dev *ca;
-       size_t bu, i, devs_have_enough = 0;
+       size_t bu, i;
        unsigned dev_iter;
        u64 journal_seq = 0;
        bool invalidating_data = false;
@@ -1964,16 +1995,21 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
 
        /* did we find enough buckets? */
        for_each_rw_member(ca, c, dev_iter)
-               devs_have_enough += (fifo_used(&ca->free_inc) >=
-                                    ca->free[RESERVE_BTREE].size);
+               if (fifo_used(&ca->free_inc) < ca->free[RESERVE_BTREE].size) {
+                       percpu_ref_put(&ca->io_ref);
+                       goto not_enough;
+               }
 
-       if (devs_have_enough >= c->opts.metadata_replicas)
-               return 0;
+       return 0;
+not_enough:
+       pr_debug("did not find enough empty buckets; issuing discards");
 
        /* clear out free_inc - find_reclaimable_buckets() assumes it's empty */
        for_each_rw_member(ca, c, dev_iter)
                discard_invalidated_buckets(c, ca);
 
+       pr_debug("scanning for reclaimable buckets");
+
        for_each_rw_member(ca, c, dev_iter) {
                BUG_ON(!fifo_empty(&ca->free_inc));
                ca->free_inc.front = ca->free_inc.back  = 0;
@@ -1988,6 +2024,8 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
                                break;
        }
 
+       pr_debug("done scanning for reclaimable buckets");
+
        /*
         * We're moving buckets to freelists _before_ they've been marked as
         * invalidated on disk - we have to so that we can allocate new btree
@@ -1997,10 +2035,13 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
         * have cached data in them, which is live until they're marked as
         * invalidated on disk:
         */
-       if (invalidating_data)
+       if (invalidating_data) {
+               pr_debug("invalidating existing data");
                set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
-       else
+       } else {
+               pr_debug("issuing discards");
                allocator_start_issue_discards(c);
+       }
 
        /*
         * XXX: it's possible for this to deadlock waiting on journal reclaim,
@@ -2017,13 +2058,15 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
        }
 
        if (invalidating_data) {
+               pr_debug("flushing journal");
+
                ret = bch2_journal_flush_seq(&c->journal, journal_seq);
                if (ret)
                        return ret;
-       }
 
-       if (invalidating_data)
+               pr_debug("issuing discards");
                allocator_start_issue_discards(c);
+       }
 
        for_each_rw_member(ca, c, dev_iter)
                while (ca->nr_invalidated) {
@@ -2038,19 +2081,43 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
                struct bucket_table *tbl;
                struct rhash_head *pos;
                struct btree *b;
+               bool flush_updates;
+               size_t nr_pending_updates;
 
                clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
 again:
+               pr_debug("flushing dirty btree nodes");
+               cond_resched();
+
+               flush_updates = false;
+               nr_pending_updates = bch2_btree_interior_updates_nr_pending(c);
+
+
                rcu_read_lock();
                for_each_cached_btree(b, c, tbl, i, pos)
                        if (btree_node_dirty(b) && (!b->written || b->level)) {
-                               rcu_read_unlock();
-                               six_lock_read(&b->lock);
-                               bch2_btree_node_write(c, b, SIX_LOCK_read);
-                               six_unlock_read(&b->lock);
-                               goto again;
+                               if (btree_node_may_write(b)) {
+                                       rcu_read_unlock();
+                                       six_lock_read(&b->lock);
+                                       bch2_btree_node_write(c, b, SIX_LOCK_read);
+                                       six_unlock_read(&b->lock);
+                                       goto again;
+                               } else {
+                                       flush_updates = true;
+                               }
                        }
                rcu_read_unlock();
+
+               /*
+                * This is ugly, but it's needed to flush btree node writes
+                * without spinning...
+                */
+               if (flush_updates) {
+                       closure_wait_event(&c->btree_interior_update_wait,
+                               bch2_btree_interior_updates_nr_pending(c) <
+                               nr_pending_updates);
+                       goto again;
+               }
        }
 
        return 0;
@@ -2087,8 +2154,8 @@ void bch2_fs_allocator_init(struct bch_fs *c)
 
        mutex_init(&c->write_points_hash_lock);
        spin_lock_init(&c->freelist_lock);
-       bch2_prio_timer_init(c, READ);
-       bch2_prio_timer_init(c, WRITE);
+       bch2_bucket_clock_init(c, READ);
+       bch2_bucket_clock_init(c, WRITE);
 
        /* open bucket 0 is a sentinal NULL: */
        spin_lock_init(&c->open_buckets[0].lock);
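Note: the renamed code above leans on bucket_last_io(), which this diff calls
but never shows. Presumably it is the hand-to-bucket distance the old code
computed inline, something like (a sketch, assuming the obvious definition in
buckets.h):

	static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
	{
		/* IO-clock ticks since bucket @g was last read/written: */
		return c->bucket_clock[rw].hand - g->io_time[rw];
	}

With u16 arithmetic this wraps cleanly, which is why bch2_inc_clock_hand()
rescales the io_times once clock->max_last_io approaches U16_MAX.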
index f914dbd56c2cfb6ac53526dadaca569ee8bd8107..372cc047e92749a0f3c2414e07313ee350f48f09 100644 (file)
@@ -9,6 +9,14 @@ struct bch_dev;
 struct bch_fs;
 struct bch_devs_List;
 
+const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+
+#define bch2_bkey_alloc_ops (struct bkey_ops) {                \
+       .key_invalid    = bch2_alloc_invalid,           \
+       .val_to_text    = bch2_alloc_to_text,           \
+}
+
 struct dev_alloc_list {
        unsigned        nr;
        u8              devs[BCH_SB_MEMBERS_MAX];
@@ -30,6 +38,8 @@ enum bucket_alloc_ret {
        NO_DEVICES              = -3,   /* -EROFS */
 };
 
+long bch2_bucket_alloc_new_fs(struct bch_dev *);
+
 int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool,
                      struct closure *);
 
@@ -127,6 +137,4 @@ int bch2_alloc_write(struct bch_fs *);
 int bch2_fs_allocator_start(struct bch_fs *);
 void bch2_fs_allocator_init(struct bch_fs *);
 
-extern const struct bkey_ops bch2_bkey_alloc_ops;
-
 #endif /* _BCACHEFS_ALLOC_H */
index f3bd470110257b36027c7bb3cf0bcada94c11989..8a71a37637dee128da0607ff55c787dedcf60e95 100644 (file)
@@ -8,7 +8,7 @@
 #include "fifo.h"
 
 /* There's two of these clocks, one for reads and one for writes: */
-struct prio_clock {
+struct bucket_clock {
        /*
         * "now" in (read/write) IO time - incremented whenever we do X amount
         * of reads or writes.
@@ -23,7 +23,7 @@ struct prio_clock {
         * consistent.
         */
        u16                     hand;
-       u16                     min_prio;
+       u16                     max_last_io;
 
        int                     rw;
 
@@ -80,6 +80,7 @@ struct write_point_specifier {
 
 struct alloc_heap_entry {
        size_t                  bucket;
+       size_t                  nr;
        unsigned long           key;
 };
 
index 369d078c3d2e79199549eb00b15e120f20e9d7a3..bc10324f4e4a553cf7507cf106046c0773ace9fb 100644 (file)
@@ -384,7 +384,7 @@ struct bch_dev {
        alloc_fifo              free[RESERVE_NR];
        alloc_fifo              free_inc;
        spinlock_t              freelist_lock;
-       unsigned                nr_invalidated;
+       size_t                  nr_invalidated;
 
        u8                      open_buckets_partial[OPEN_BUCKETS_COUNT];
        unsigned                open_buckets_partial_nr;
@@ -392,7 +392,7 @@ struct bch_dev {
        size_t                  fifo_last_bucket;
 
        /* last calculated minimum prio */
-       u16                     min_prio[2];
+       u16                     max_last_bucket_io[2];
 
        atomic_long_t           saturated_count;
        size_t                  inc_gen_needs_gc;
@@ -431,11 +431,11 @@ struct bch_dev {
  */
 enum {
        /* startup: */
-       BCH_FS_BRAND_NEW_FS,
        BCH_FS_ALLOC_READ_DONE,
        BCH_FS_ALLOCATOR_STARTED,
        BCH_FS_INITIAL_GC_DONE,
        BCH_FS_FSCK_DONE,
+       BCH_FS_STARTED,
 
        /* shutdown: */
        BCH_FS_EMERGENCY_RO,
@@ -519,8 +519,7 @@ struct bch_fs {
                u64             features;
        }                       sb;
 
-       struct bch_sb           *disk_sb;
-       unsigned                disk_sb_order;
+       struct bch_sb_handle    disk_sb;
 
        unsigned short          block_bits;     /* ilog2(block_size) */
 
@@ -595,7 +594,7 @@ struct bch_fs {
         * those together consistently we keep track of the smallest nonzero
         * priority of any bucket.
         */
-       struct prio_clock       prio_clock[2];
+       struct bucket_clock     bucket_clock[2];
 
        struct io_clock         io_clock[2];
 
index d89f7781acd9681b91f741ca4150c593dcaa6c1a..eed6fb852b9007de6899797092a24515885425f6 100644 (file)
@@ -955,8 +955,9 @@ struct bch_disk_group {
        __le64                  flags[2];
 };
 
-LE64_BITMASK(BCH_GROUP_DELETED,                struct bch_disk_group, flags[0], 0, 1)
-LE64_BITMASK(BCH_GROUP_DATA_ALLOWED,   struct bch_disk_group, flags[0], 1, 6)
+LE64_BITMASK(BCH_GROUP_DELETED,                struct bch_disk_group, flags[0], 0,  1)
+LE64_BITMASK(BCH_GROUP_DATA_ALLOWED,   struct bch_disk_group, flags[0], 1,  6)
+LE64_BITMASK(BCH_GROUP_PARENT,         struct bch_disk_group, flags[0], 6, 24)
 
 struct bch_sb_field_disk_groups {
        struct bch_sb_field     field;
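Note: the new BCH_GROUP_PARENT field (bits 6..23 of flags[0]) gives each disk
group an 18-bit parent pointer, allowing nested groups, which is what
bch2_disk_path_find() and bch2_disk_path_find_or_create() walk. For reference,
the getter half that LE64_BITMASK() generates looks roughly like this (a
sketch of the usual macro expansion; a matching SET_BCH_GROUP_PARENT() setter
is generated as well):

	static inline __u64 BCH_GROUP_PARENT(const struct bch_disk_group *g)
	{
		return (__le64_to_cpu(g->flags[0]) >> 6) &
			~(~0ULL << (24 - 6));	/* mask to the 18-bit field */
	}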
index 84cdf66221cfd9e41b3a992595213abbb8e34267..e4f62f905f114b6e460428d1b8e151f8c8c2d33a 100644 (file)
 #include "quota.h"
 #include "xattr.h"
 
-const struct bkey_ops *bch2_bkey_ops[] = {
-       [BKEY_TYPE_EXTENTS]     = &bch2_bkey_extent_ops,
-       [BKEY_TYPE_INODES]      = &bch2_bkey_inode_ops,
-       [BKEY_TYPE_DIRENTS]     = &bch2_bkey_dirent_ops,
-       [BKEY_TYPE_XATTRS]      = &bch2_bkey_xattr_ops,
-       [BKEY_TYPE_ALLOC]       = &bch2_bkey_alloc_ops,
-       [BKEY_TYPE_QUOTAS]      = &bch2_bkey_quota_ops,
-       [BKEY_TYPE_BTREE]       = &bch2_bkey_btree_ops,
+const struct bkey_ops bch2_bkey_ops[] = {
+       [BKEY_TYPE_EXTENTS]     = bch2_bkey_extent_ops,
+       [BKEY_TYPE_INODES]      = bch2_bkey_inode_ops,
+       [BKEY_TYPE_DIRENTS]     = bch2_bkey_dirent_ops,
+       [BKEY_TYPE_XATTRS]      = bch2_bkey_xattr_ops,
+       [BKEY_TYPE_ALLOC]       = bch2_bkey_alloc_ops,
+       [BKEY_TYPE_QUOTAS]      = bch2_bkey_quota_ops,
+       [BKEY_TYPE_BTREE]       = bch2_bkey_btree_ops,
 };
 
 const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
                                  struct bkey_s_c k)
 {
-       const struct bkey_ops *ops = bch2_bkey_ops[type];
+       const struct bkey_ops *ops = &bch2_bkey_ops[type];
 
        switch (k.k->type) {
        case KEY_TYPE_DELETED:
@@ -51,7 +51,7 @@ const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
 const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
                              struct bkey_s_c k)
 {
-       const struct bkey_ops *ops = bch2_bkey_ops[type];
+       const struct bkey_ops *ops = &bch2_bkey_ops[type];
 
        if (k.k->u64s < BKEY_U64s)
                return "u64s too small";
@@ -100,7 +100,7 @@ const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
 void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
 {
        enum bkey_type type = btree_node_type(b);
-       const struct bkey_ops *ops = bch2_bkey_ops[type];
+       const struct bkey_ops *ops = &bch2_bkey_ops[type];
        const char *invalid;
 
        BUG_ON(!k.k->u64s);
@@ -141,7 +141,7 @@ int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
 int bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
                     char *buf, size_t size, struct bkey_s_c k)
 {
-       const struct bkey_ops *ops = bch2_bkey_ops[type];
+       const struct bkey_ops *ops = &bch2_bkey_ops[type];
        char *out = buf, *end = buf + size;
 
        switch (k.k->type) {
@@ -182,7 +182,7 @@ void bch2_bkey_swab(enum bkey_type type,
                   const struct bkey_format *f,
                   struct bkey_packed *k)
 {
-       const struct bkey_ops *ops = bch2_bkey_ops[type];
+       const struct bkey_ops *ops = &bch2_bkey_ops[type];
 
        bch2_bkey_swab_key(f, k);
 
index 59db3037e6dde1fdafdc3991d2b3a53921ad08d4..9e2c90d54e4242681ef499c5b9dbc35646b8ac8a 100644 (file)
@@ -81,6 +81,6 @@ int bch2_bkey_val_to_text(struct bch_fs *, enum bkey_type,
 void bch2_bkey_swab(enum bkey_type, const struct bkey_format *,
                    struct bkey_packed *);
 
-extern const struct bkey_ops *bch2_bkey_ops[];
+extern const struct bkey_ops bch2_bkey_ops[];
 
 #endif /* _BCACHEFS_BKEY_METHODS_H */
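Note: bch2_bkey_ops changes from an array of pointers to an array of structs,
so lookups index the table directly instead of chasing a pointer, and the
whole table can live in .rodata. A small helper showing the resulting access
pattern (bkey_type_ops() here is illustrative; the diff open-codes
&bch2_bkey_ops[type] at each call site):

	static inline const struct bkey_ops *bkey_type_ops(enum bkey_type type)
	{
		return &bch2_bkey_ops[type];
	}

This also requires the per-type ops, such as bch2_bkey_alloc_ops, to be usable
as initializers by value, hence the compound-literal macro in alloc.h above.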
index f2e9c10e4efecf1532721ce449eff3782199b14a..ad51f29c9a38450f0aac4d917e63978e39b8aaee 100644 (file)
@@ -18,6 +18,7 @@
 #include "journal.h"
 #include "keylist.h"
 #include "move.h"
+#include "replicas.h"
 #include "super-io.h"
 
 #include <linux/slab.h>
@@ -317,7 +318,8 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
        unsigned i;
        u64 b;
 
-       lockdep_assert_held(&c->sb_lock);
+       if (c)
+               lockdep_assert_held(&c->sb_lock);
 
        for (i = 0; i < layout->nr_superblocks; i++) {
                u64 offset = le64_to_cpu(layout->sb_offset[i]);
@@ -331,7 +333,8 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
                                      BCH_DATA_SB, flags);
        }
 
-       spin_lock(&c->journal.lock);
+       if (c)
+               spin_lock(&c->journal.lock);
 
        for (i = 0; i < ca->journal.nr; i++) {
                b = ca->journal.buckets[i];
@@ -340,7 +343,8 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
                                          gc_phase(GC_PHASE_SB), flags);
        }
 
-       spin_unlock(&c->journal.lock);
+       if (c)
+               spin_unlock(&c->journal.lock);
 }
 
 static void bch2_mark_superblocks(struct bch_fs *c)
@@ -1034,8 +1038,8 @@ static int __bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
        int ret;
 
        mutex_lock(&c->sb_lock);
-       if (!bch2_sb_get_replicas(c->disk_sb)) {
-               if (BCH_SB_INITIALIZED(c->disk_sb))
+       if (!bch2_sb_get_replicas(c->disk_sb.sb)) {
+               if (BCH_SB_INITIALIZED(c->disk_sb.sb))
                        bch_info(c, "building replicas info");
                set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
        }
index cc5bcbb28a50c2508bfe922d11feb02c571de64a..465aadbad5efbe358fbe6209ae305b96e8746368 100644 (file)
@@ -1290,16 +1290,19 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
 
 struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
 {
+       iter->pos = btree_type_successor(iter->btree_id, iter->k.p);
+
        if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
-               struct bkey_s_c k;
+               /*
+                * XXX: when we just need to relock we should be able to avoid
+                * calling traverse, but we need to kill BTREE_ITER_NEED_PEEK
+                * for that to work
+                */
+               btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
 
-               k = bch2_btree_iter_peek_slot(iter);
-               if (btree_iter_err(k))
-                       return k;
+               return bch2_btree_iter_peek_slot(iter);
        }
 
-       iter->pos = btree_type_successor(iter->btree_id, iter->k.p);
-
        if (!bkey_deleted(&iter->k))
                __btree_iter_advance(&iter->l[0]);
 
@@ -1318,6 +1321,8 @@ void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c,
 
        iter->c                         = c;
        iter->pos                       = pos;
+       bkey_init(&iter->k);
+       iter->k.p                       = pos;
        iter->flags                     = flags;
        iter->uptodate                  = BTREE_ITER_NEED_TRAVERSE;
        iter->btree_id                  = btree_id;
@@ -1330,6 +1335,10 @@ void __bch2_btree_iter_init(struct btree_iter *iter, struct bch_fs *c,
        iter->l[iter->level].b          = BTREE_ITER_NOT_END;
        iter->next                      = iter;
 
+       if (unlikely((flags & BTREE_ITER_IS_EXTENTS) &&
+                    !bkey_cmp(pos, POS_MAX)))
+               iter->uptodate = BTREE_ITER_END;
+
        prefetch(c->btree_roots[btree_id].b);
 }
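In the bch2_btree_iter_next_slot() rework above, iter->pos is advanced to the successor before the uptodate check, so the slow path's re-traverse resumes at the next slot instead of re-returning the current one. The same ordering on a toy cursor (not the bcachefs iterator):

#include <stdbool.h>

struct cursor { unsigned pos; bool valid; };

static unsigned reseek(struct cursor *c)
{
	c->valid = true;
	return c->pos;			/* a real re-seek would look up c->pos */
}

static unsigned next_slot(struct cursor *c)
{
	c->pos++;			/* advance the logical position first */

	if (!c->valid)
		return reseek(c);	/* slow path resumes at the new pos */

	return c->pos;			/* fast path: already positioned */
}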
 
index 318b04242d52f7f4864ed98dc248469ece2d1d35..95191ba2bc79145274c96cf42bc0105da05c1cfc 100644 (file)
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -231,6 +231,20 @@ static inline int btree_iter_cmp(const struct btree_iter *l,
        return __btree_iter_cmp(l->btree_id, l->pos, r);
 }
 
+/*
+ * Unlocks before scheduling
+ * Note: does not revalidate iterator
+ */
+static inline void bch2_btree_iter_cond_resched(struct btree_iter *iter)
+{
+       if (need_resched()) {
+               bch2_btree_iter_unlock(iter);
+               schedule();
+       } else if (race_fault()) {
+               bch2_btree_iter_unlock(iter);
+       }
+}
+
 #define __for_each_btree_node(_iter, _c, _btree_id, _start,            \
                              _locks_want, _depth, _flags, _b)          \
        for (__bch2_btree_iter_init((_iter), (_c), (_btree_id), _start, \
@@ -253,6 +267,8 @@ static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter,
 static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter,
                                                     unsigned flags)
 {
+       bch2_btree_iter_cond_resched(iter);
+
        return flags & BTREE_ITER_SLOTS
                ? bch2_btree_iter_next_slot(iter)
                : bch2_btree_iter_next(iter);
@@ -275,18 +291,4 @@ static inline int btree_iter_err(struct bkey_s_c k)
        return PTR_ERR_OR_ZERO(k.k);
 }
 
-/*
- * Unlocks before scheduling
- * Note: does not revalidate iterator
- */
-static inline void bch2_btree_iter_cond_resched(struct btree_iter *iter)
-{
-       if (need_resched()) {
-               bch2_btree_iter_unlock(iter);
-               schedule();
-       } else if (race_fault()) {
-               bch2_btree_iter_unlock(iter);
-       }
-}
-
 #endif /* _BCACHEFS_BTREE_ITER_H */
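The two hunks above move bch2_btree_iter_cond_resched() ahead of __bch2_btree_iter_next(), which now calls it on every step: long scans drop their btree locks before yielding so waiting threads can take them, and race_fault() is a fault-injection hook that exercises the unlock path even without need_resched(). A userspace analogue of the unlock-before-yield pattern (toy types; pthread and sched_yield() standing in for btree locks and the scheduler):

#include <pthread.h>
#include <sched.h>
#include <stdbool.h>

struct toy_iter { pthread_mutex_t *lock; bool locked; };

static void toy_iter_unlock(struct toy_iter *it)
{
	if (it->locked) {
		pthread_mutex_unlock(it->lock);
		it->locked = false;
	}
}

/* Yield point for long scans: drop the lock before yielding so other
 * threads can take it; the caller must re-lock and re-validate before
 * touching shared state again. */
static void toy_iter_cond_resched(struct toy_iter *it)
{
	toy_iter_unlock(it);
	sched_yield();
}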
index e86c6bcea229e417ed9e35812437f50dc13df88e..8854305d315e8c34c859a9354f6429b32df56dc3 100644 (file)
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -299,7 +299,7 @@ static inline enum bkey_type btree_node_type(struct btree *b)
 
 static inline const struct bkey_ops *btree_node_ops(struct btree *b)
 {
-       return bch2_bkey_ops[btree_node_type(b)];
+       return &bch2_bkey_ops[btree_node_type(b)];
 }
 
 static inline bool btree_node_has_ptrs(struct btree *b)
index f42239dab71cd00d11d8250ab5c839bdb119892d..63696920ba9a1c40c7b86befc6c067e1fa2513f1 100644 (file)
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -13,6 +13,7 @@
 #include "extents.h"
 #include "journal.h"
 #include "keylist.h"
+#include "replicas.h"
 #include "super-io.h"
 
 #include <linux/random.h>
@@ -2116,3 +2117,16 @@ ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf)
 
        return out - buf;
 }
+
+size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
+{
+       size_t ret = 0;
+       struct list_head *i;
+
+       mutex_lock(&c->btree_interior_update_lock);
+       list_for_each(i, &c->btree_interior_update_list)
+               ret++;
+       mutex_unlock(&c->btree_interior_update_lock);
+
+       return ret;
+}
index 0b58ccc904a442b68bd9924f6042b04984d0b618..3e66d69eda1bee57ebc9fdebc28e4c34f373ad3e 100644 (file)
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -343,4 +343,6 @@ static inline bool journal_res_insert_fits(struct btree_insert *trans,
 
 ssize_t bch2_btree_updates_print(struct bch_fs *, char *);
 
+size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *);
+
 #endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */
index 007aa5ef40910e89c6b764500e5d9e3055d0631b..53b39de52c6bb3e052aff5f3823d6d8f387407b1 100644 (file)
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -443,8 +443,20 @@ split:
         * potentially blocks the allocator:
         */
        ret = bch2_btree_split_leaf(c, split, trans->flags);
+
+       /*
+        * This can happen when we've inserted part of an extent: with an
+        * update spanning multiple keys, we don't want to redo the entire
+        * update - that's just too confusing:
+        */
+       if (!ret &&
+           (trans->flags & BTREE_INSERT_ATOMIC) &&
+           trans->did_work)
+               ret = -EINTR;
+
        if (ret)
                goto err;
+
        /*
         * if the split didn't have to drop locks the insert will still be
         * atomic (in the BTREE_INSERT_ATOMIC sense, what the caller peeked()
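When an atomic multi-key update has already made progress (trans->did_work) and then needs a leaf split, the hunk above returns -EINTR so the caller restarts with freshly computed state rather than continuing a half-applied update. A hypothetical caller sketch (do_atomic_update() is a stand-in, not a bcachefs function):

#include <errno.h>

/* do_atomic_update() is a hypothetical stand-in for a multi-key atomic
 * btree update that may return -EINTR after partial progress. */
static int do_atomic_update(void) { return 0; }

int update_with_retry(void)
{
	int ret;

	do {
		ret = do_atomic_update();
	} while (ret == -EINTR);	/* restart with freshly computed state */

	return ret;
}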
index 864de940f1e73227bdf8714bf2cbaa14d2741e03..1f944cb8a3e5df9d6725cebb7fd3aa722bb639d2 100644 (file)
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -309,7 +309,7 @@ static bool bucket_became_unavailable(struct bch_fs *c,
 {
        return is_available_bucket(old) &&
               !is_available_bucket(new) &&
-              c && c->gc_pos.phase == GC_PHASE_DONE;
+              (!c || c->gc_pos.phase == GC_PHASE_DONE);
 }
 
 void bch2_fs_usage_apply(struct bch_fs *c,
@@ -351,12 +351,16 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
 {
        struct bch_dev_usage *dev_usage;
 
-       lockdep_assert_held(&c->usage_lock);
+       if (c)
+               lockdep_assert_held(&c->usage_lock);
 
-       bch2_fs_inconsistent_on(old.data_type && new.data_type &&
-                       old.data_type != new.data_type, c,
+       if (old.data_type && new.data_type &&
+           old.data_type != new.data_type) {
+               BUG_ON(!c);
+               bch2_fs_inconsistent(c,
                        "different types of data in same bucket: %u, %u",
                        old.data_type, new.data_type);
+       }
 
        dev_usage = this_cpu_ptr(ca->usage_percpu);
 
@@ -466,21 +470,29 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
 
        BUG_ON(!type);
 
-       lg_local_lock(&c->usage_lock);
-       g = bucket(ca, b);
+       if (likely(c)) {
+               lg_local_lock(&c->usage_lock);
 
-       if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
-           gc_will_visit(c, pos)) {
-               lg_local_unlock(&c->usage_lock);
-               return;
+               if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
+                   gc_will_visit(c, pos)) {
+                       lg_local_unlock(&c->usage_lock);
+                       return;
+               }
        }
 
+       preempt_disable();
+
+       g = bucket(ca, b);
        old = bucket_data_cmpxchg(c, ca, g, new, ({
                saturated_add(ca, new.dirty_sectors, sectors,
                              GC_MAX_SECTORS_USED);
                new.data_type           = type;
        }));
-       lg_local_unlock(&c->usage_lock);
+
+       preempt_enable();
+
+       if (likely(c))
+               lg_local_unlock(&c->usage_lock);
 
        BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
               bucket_became_unavailable(c, old, new));
@@ -859,9 +871,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 
        bch2_copygc_stop(ca);
 
-       down_write(&c->gc_lock);
-       down_write(&ca->bucket_lock);
-       lg_global_lock(&c->usage_lock);
+       if (resize) {
+               down_write(&c->gc_lock);
+               down_write(&ca->bucket_lock);
+               lg_global_lock(&c->usage_lock);
+       }
 
        old_buckets = bucket_array(ca);
 
@@ -885,7 +899,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
        swap(ca->oldest_gens, oldest_gens);
        swap(ca->buckets_dirty, buckets_dirty);
 
-       lg_global_unlock(&c->usage_lock);
+       if (resize)
+               lg_global_unlock(&c->usage_lock);
 
        spin_lock(&c->freelist_lock);
        for (i = 0; i < RESERVE_NR; i++) {
@@ -904,8 +919,10 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 
        nbuckets = ca->mi.nbuckets;
 
-       up_write(&ca->bucket_lock);
-       up_write(&c->gc_lock);
+       if (resize) {
+               up_write(&ca->bucket_lock);
+               up_write(&c->gc_lock);
+       }
 
        if (start_copygc &&
            bch2_copygc_start(c, ca))
index fda7fd704ae8680d411cf4b71a550f002e7ac517..399a853c80bbcb8c1f13c610c0e57b835c203eed 100644 (file)
--- a/libbcachefs/buckets.h
+++ b/libbcachefs/buckets.h
@@ -31,6 +31,7 @@
 static inline struct bucket_array *bucket_array(struct bch_dev *ca)
 {
        return rcu_dereference_check(ca->buckets,
+                                    !ca->fs ||
                                     lockdep_is_held(&ca->fs->usage_lock) ||
                                     lockdep_is_held(&ca->fs->gc_lock) ||
                                     lockdep_is_held(&ca->bucket_lock));
@@ -47,7 +48,12 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
 static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca,
                                         size_t b, int rw)
 {
-       bucket(ca, b)->prio[rw] = c->prio_clock[rw].hand;
+       bucket(ca, b)->io_time[rw] = c->bucket_clock[rw].hand;
+}
+
+static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
+{
+       return c->bucket_clock[rw].hand - g->io_time[rw];
 }
 
 /*
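bucket_last_io() above returns a bucket's age as the difference of two u16 clock hands; the subtraction is taken mod 2^16, so it stays correct across wraparound as long as true ages remain below 65536 ticks. A small standalone check:

#include <assert.h>
#include <stdint.h>

/* The subtraction is taken mod 2^16, so it survives wraparound as long
 * as the true age is below 65536 ticks. */
static uint16_t toy_bucket_age(uint16_t hand, uint16_t io_time)
{
	return hand - io_time;
}

int main(void)
{
	assert(toy_bucket_age(100, 40) == 60);
	assert(toy_bucket_age(10, 65530) == 16);  /* hand wrapped past io_time */
	return 0;
}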
index a0256e13618c3aec2dc0dd60a921e6a4d592b898..28bd2c59647796015ac31771e0a7d2b49ddd522c 100644 (file)
--- a/libbcachefs/buckets_types.h
+++ b/libbcachefs/buckets_types.h
@@ -31,12 +31,12 @@ struct bucket_mark {
 };
 
 struct bucket {
-       u16                             prio[2];
-
        union {
                struct bucket_mark      _mark;
                const struct bucket_mark mark;
        };
+
+       u16                             io_time[2];
 };
 
 struct bucket_array {
@@ -85,8 +85,9 @@ struct disk_reservation {
 };
 
 struct copygc_heap_entry {
+       u8                      gen;
+       u32                     sectors;
        u64                     offset;
-       struct bucket_mark      mark;
 };
 
 typedef HEAP(struct copygc_heap_entry) copygc_heap;
index ab6dc665186e7bc42c200e6b66e24584d47bf769..8403bae64038102bff65ba38ea77f645d98975f9 100644 (file)
--- a/libbcachefs/chardev.c
+++ b/libbcachefs/chardev.c
@@ -372,6 +372,9 @@ static long bch2_ioctl_usage(struct bch_fs *c,
        unsigned i, j;
        int ret;
 
+       if (!test_bit(BCH_FS_STARTED, &c->flags))
+               return -EINVAL;
+
        if (copy_from_user(&arg, user_arg, sizeof(arg)))
                return -EFAULT;
 
@@ -460,7 +463,7 @@ static long bch2_ioctl_read_super(struct bch_fs *c,
 
                sb = ca->disk_sb.sb;
        } else {
-               sb = c->disk_sb;
+               sb = c->disk_sb.sb;
        }
 
        if (vstruct_bytes(sb) > arg.size) {
@@ -535,13 +538,22 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
-       /* ioctls that do require admin cap: */
        switch (cmd) {
        case BCH_IOCTL_START:
                BCH_IOCTL(start, struct bch_ioctl_start);
        case BCH_IOCTL_STOP:
                return bch2_ioctl_stop(c);
+       case BCH_IOCTL_READ_SUPER:
+               BCH_IOCTL(read_super, struct bch_ioctl_read_super);
+       case BCH_IOCTL_DISK_GET_IDX:
+               BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
+       }
 
+       if (!test_bit(BCH_FS_STARTED, &c->flags))
+               return -EINVAL;
+
+       /* ioctls that do require admin cap: */
+       switch (cmd) {
        case BCH_IOCTL_DISK_ADD:
                BCH_IOCTL(disk_add, struct bch_ioctl_disk);
        case BCH_IOCTL_DISK_REMOVE:
@@ -554,10 +566,6 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
                BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state);
        case BCH_IOCTL_DATA:
                BCH_IOCTL(data, struct bch_ioctl_data);
-       case BCH_IOCTL_READ_SUPER:
-               BCH_IOCTL(read_super, struct bch_ioctl_read_super);
-       case BCH_IOCTL_DISK_GET_IDX:
-               BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
        case BCH_IOCTL_DISK_RESIZE:
                BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
 
index 56bd99fd8b7144cf2f091499c7e266ae4923cb49..6d8543eb65008e7a67b7f91455d73759654c4b8c 100644 (file)
--- a/libbcachefs/checksum.c
+++ b/libbcachefs/checksum.c
@@ -569,7 +569,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c,
        if (!bch2_key_is_encrypted(&sb_key))
                goto out;
 
-       ret = bch2_request_key(c->disk_sb, &user_key);
+       ret = bch2_request_key(c->disk_sb.sb, &user_key);
        if (ret) {
                bch_err(c, "error requesting encryption key: %i", ret);
                goto err;
@@ -623,7 +623,7 @@ int bch2_disable_encryption(struct bch_fs *c)
 
        mutex_lock(&c->sb_lock);
 
-       crypt = bch2_sb_get_crypt(c->disk_sb);
+       crypt = bch2_sb_get_crypt(c->disk_sb.sb);
        if (!crypt)
                goto out;
 
@@ -639,7 +639,7 @@ int bch2_disable_encryption(struct bch_fs *c)
        crypt->key.magic        = BCH_KEY_MAGIC;
        crypt->key.key          = key;
 
-       SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb, 0);
+       SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0);
        bch2_write_super(c);
 out:
        mutex_unlock(&c->sb_lock);
@@ -657,7 +657,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
        mutex_lock(&c->sb_lock);
 
        /* Do we already have an encryption key? */
-       if (bch2_sb_get_crypt(c->disk_sb))
+       if (bch2_sb_get_crypt(c->disk_sb.sb))
                goto err;
 
        ret = bch2_alloc_ciphers(c);
@@ -668,7 +668,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
        get_random_bytes(&key.key, sizeof(key.key));
 
        if (keyed) {
-               ret = bch2_request_key(c->disk_sb, &user_key);
+               ret = bch2_request_key(c->disk_sb.sb, &user_key);
                if (ret) {
                        bch_err(c, "error requesting encryption key: %i", ret);
                        goto err;
@@ -685,7 +685,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
        if (ret)
                goto err;
 
-       crypt = bch2_fs_sb_resize_crypt(c, sizeof(*crypt) / sizeof(u64));
+       crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64));
        if (!crypt) {
                ret = -ENOMEM; /* XXX this technically could be -ENOSPC */
                goto err;
@@ -694,7 +694,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
        crypt->key = key;
 
        /* write superblock */
-       SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb, 1);
+       SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1);
        bch2_write_super(c);
 err:
        mutex_unlock(&c->sb_lock);
@@ -728,7 +728,7 @@ int bch2_fs_encryption_init(struct bch_fs *c)
                goto out;
        }
 
-       crypt = bch2_sb_get_crypt(c->disk_sb);
+       crypt = bch2_sb_get_crypt(c->disk_sb.sb);
        if (!crypt)
                goto out;
 
index 7862294bc03b55cb778483e396642c30f7e41d45..2690cc4baeead200b0fb1fb9bdadcd9ce8d2a703 100644 (file)
--- a/libbcachefs/checksum.h
+++ b/libbcachefs/checksum.h
@@ -117,6 +117,7 @@ static const unsigned bch_crc_bytes[] = {
        [BCH_CSUM_CHACHA20_POLY1305_128]        = 16,
 };
 
+/* returns true if not equal */
 static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
 {
        /*
index bfd4b3030e357fab16fe4aaee88a2cdf80f34929..df404b6dd3fe7c24760da5f815653059ebda66b6 100644 (file)
--- a/libbcachefs/clock_types.h
+++ b/libbcachefs/clock_types.h
@@ -3,7 +3,7 @@
 
 #include "util.h"
 
-#define NR_IO_TIMERS           8
+#define NR_IO_TIMERS           (BCH_SB_MEMBERS_MAX * 3)
 
 /*
  * Clocks/timers in units of sectors of IO:
index 18c945985636282e85cb2c68e89dfb07f6167af3..1af62621da1b070d71af7ce3a6b09ec6eff48b8b 100644 (file)
--- a/libbcachefs/compress.c
+++ b/libbcachefs/compress.c
@@ -500,7 +500,7 @@ int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f)
                return ret;
        }
 
-       c->disk_sb->features[0] |= cpu_to_le64(f);
+       c->disk_sb.sb->features[0] |= cpu_to_le64(f);
        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
 
index 00e0de167b322e67c5dd1db0d1edb59e2b902a43..7190990dbfa149e86afee651caba6236f9a56626 100644 (file)
--- a/libbcachefs/debug.c
+++ b/libbcachefs/debug.c
@@ -212,17 +212,20 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
        if (!i->size)
                return i->ret;
 
-       for_each_btree_key(&iter, i->c, i->id, i->from,
-                          BTREE_ITER_PREFETCH, k) {
-               i->from = iter.pos;
+       bch2_btree_iter_init(&iter, i->c, i->id, i->from, BTREE_ITER_PREFETCH);
+       k = bch2_btree_iter_peek(&iter);
 
+       while (k.k && !(err = btree_iter_err(k))) {
                bch2_bkey_val_to_text(i->c, bkey_type(0, i->id),
-                                    i->buf, sizeof(i->buf), k);
+                                     i->buf, sizeof(i->buf), k);
                i->bytes = strlen(i->buf);
                BUG_ON(i->bytes >= PAGE_SIZE);
                i->buf[i->bytes] = '\n';
                i->bytes++;
 
+               k = bch2_btree_iter_next(&iter);
+               i->from = iter.pos;
+
                err = flush_buf(i);
                if (err)
                        break;
@@ -230,7 +233,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
                if (!i->size)
                        break;
        }
-       err = bch2_btree_iter_unlock(&iter) ?: err;
+       bch2_btree_iter_unlock(&iter);
 
        return err < 0 ? err : i->ret;
 }
index 6bdece3a7637b6c0b2eb8ff5e1185abcd991a052..df9913f8967b8641cf3b1a4e0afee0541430f4f5 100644 (file)
--- a/libbcachefs/dirent.c
+++ b/libbcachefs/dirent.c
@@ -79,8 +79,7 @@ const struct bch_hash_desc bch2_dirent_hash_desc = {
        .cmp_bkey       = dirent_cmp_bkey,
 };
 
-static const char *bch2_dirent_invalid(const struct bch_fs *c,
-                                      struct bkey_s_c k)
+const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
        struct bkey_s_c_dirent d;
        unsigned len;
@@ -116,8 +115,8 @@ static const char *bch2_dirent_invalid(const struct bch_fs *c,
        }
 }
 
-static void bch2_dirent_to_text(struct bch_fs *c, char *buf,
-                               size_t size, struct bkey_s_c k)
+void bch2_dirent_to_text(struct bch_fs *c, char *buf,
+                        size_t size, struct bkey_s_c k)
 {
        struct bkey_s_c_dirent d;
        size_t n = 0;
@@ -136,11 +135,6 @@ static void bch2_dirent_to_text(struct bch_fs *c, char *buf,
        }
 }
 
-const struct bkey_ops bch2_bkey_dirent_ops = {
-       .key_invalid    = bch2_dirent_invalid,
-       .val_to_text    = bch2_dirent_to_text,
-};
-
 static struct bkey_i_dirent *dirent_create_key(u8 type,
                                const struct qstr *name, u64 dst)
 {
index 98405b5be2b81750dafa9683923ac32de2092ecf..5d066af18f9533d6548c241b589f23c0bbc5537d 100644 (file)
--- a/libbcachefs/dirent.h
+++ b/libbcachefs/dirent.h
@@ -4,7 +4,14 @@
 #include "str_hash.h"
 
 extern const struct bch_hash_desc bch2_dirent_hash_desc;
-extern const struct bkey_ops bch2_bkey_dirent_ops;
+
+const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_dirent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+
+#define bch2_bkey_dirent_ops (struct bkey_ops) {       \
+       .key_invalid    = bch2_dirent_invalid,          \
+       .val_to_text    = bch2_dirent_to_text,          \
+}
 
 struct qstr;
 struct file;
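Replacing the extern ops object with a macro that expands to a compound literal lets each key type's ops be pasted straight into the bch2_bkey_ops[] initializer (extents.h and inode.h below get the same treatment). A toy sketch of the pattern; note that using a constant compound literal as a static initializer is a GNU C extension, fine for kernel-style code but not strict ISO C99:

#include <stdio.h>

struct ops { const char *(*key_invalid)(int); };

static const char *dirent_invalid(int k) { return k ? NULL : "empty name"; }

#define toy_dirent_ops ((struct ops) {		\
	.key_invalid	= dirent_invalid,	\
})

static const struct ops toy_bkey_ops[] = {
	[1] = toy_dirent_ops,	/* index = key type, as in bch2_bkey_ops[] */
};

int main(void)
{
	printf("%s\n", toy_bkey_ops[1].key_invalid(0));	/* "empty name" */
	return 0;
}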
diff --git a/libbcachefs/disk_groups.c b/libbcachefs/disk_groups.c
new file mode 100644 (file)
index 0000000..c129a33
--- /dev/null
+++ b/libbcachefs/disk_groups.c
@@ -0,0 +1,462 @@
+#include "bcachefs.h"
+#include "disk_groups.h"
+#include "super-io.h"
+
+#include <linux/sort.h>
+
+static int group_cmp(const void *_l, const void *_r)
+{
+       const struct bch_disk_group *l = _l;
+       const struct bch_disk_group *r = _r;
+
+       return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) -
+               (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?:
+               ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) -
+                (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?:
+               strncmp(l->label, r->label, sizeof(l->label));
+}
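group_cmp() uses the (a > b) - (a < b) idiom, which yields -1/0/1 without the overflow risk of returning a - b, and chains the fields with the GNU "x ?: y" extension so each later field only breaks ties (deleted flag, then parent, then label). The idiom in isolation:

#include <stdio.h>

/* Yields -1/0/1 and cannot overflow, unlike returning a - b. */
static int cmp_u64(unsigned long long l, unsigned long long r)
{
	return (l > r) - (l < r);
}

int main(void)
{
	printf("%d %d %d\n", cmp_u64(1, 2), cmp_u64(2, 2), cmp_u64(3, 2));
	/* -1 0 1; in group_cmp() these results are chained with the GNU
	 * "x ?: y" extension, so later fields only break ties */
	return 0;
}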
+
+const char *bch2_sb_disk_groups_validate(struct bch_sb *sb,
+                                        struct bch_sb_field *f)
+{
+       struct bch_sb_field_disk_groups *groups =
+               field_to_type(f, disk_groups);
+       struct bch_disk_group *g, *sorted = NULL;
+       struct bch_sb_field_members *mi;
+       struct bch_member *m;
+       unsigned i, nr_groups, len;
+       const char *err = NULL;
+
+       mi              = bch2_sb_get_members(sb);
+       groups          = bch2_sb_get_disk_groups(sb);
+       nr_groups       = disk_groups_nr(groups);
+
+       for (m = mi->members;
+            m < mi->members + sb->nr_devices;
+            m++) {
+               unsigned g;
+
+               if (!BCH_MEMBER_GROUP(m))
+                       continue;
+
+               g = BCH_MEMBER_GROUP(m) - 1;
+
+               if (g >= nr_groups ||
+                   BCH_GROUP_DELETED(&groups->entries[g]))
+                       return "disk has invalid group";
+       }
+
+       if (!nr_groups)
+               return NULL;
+
+       for (g = groups->entries;
+            g < groups->entries + nr_groups;
+            g++) {
+               if (BCH_GROUP_DELETED(g))
+                       continue;
+
+               len = strnlen(g->label, sizeof(g->label));
+               if (!len) {
+                       err = "group with empty label";
+                       goto err;
+               }
+       }
+
+       sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL);
+       if (!sorted)
+               return "cannot allocate memory";
+
+       memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted));
+       sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL);
+
+       for (i = 0; i + 1 < nr_groups; i++)
+               if (!BCH_GROUP_DELETED(sorted + i) &&
+                   !group_cmp(sorted + i, sorted + i + 1)) {
+                       err = "duplicate groups";
+                       goto err;
+               }
+
+       err = NULL;
+err:
+       kfree(sorted);
+       return err;
+}
+
+static size_t bch2_sb_disk_groups_to_text(char *buf, size_t size,
+                                       struct bch_sb *sb,
+                                       struct bch_sb_field *f)
+{
+       char *out = buf, *end = buf + size;
+       struct bch_sb_field_disk_groups *groups =
+               field_to_type(f, disk_groups);
+       struct bch_disk_group *g;
+       unsigned nr_groups = disk_groups_nr(groups);
+
+       for (g = groups->entries;
+            g < groups->entries + nr_groups;
+            g++) {
+               if (g != groups->entries)
+                       out += scnprintf(out, end - out, " ");
+
+               if (BCH_GROUP_DELETED(g))
+                       out += scnprintf(out, end - out, "[deleted]");
+               else
+                       out += scnprintf(out, end - out,
+                                        "[parent %llu name %s]",
+                                        BCH_GROUP_PARENT(g),
+                                        g->label);
+       }
+
+       return out - buf;
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = {
+       .validate       = bch2_sb_disk_groups_validate,
+       .to_text        = bch2_sb_disk_groups_to_text
+};
+
+int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
+{
+       struct bch_sb_field_members *mi;
+       struct bch_sb_field_disk_groups *groups;
+       struct bch_disk_groups_cpu *cpu_g, *old_g;
+       unsigned i, g, nr_groups;
+
+       lockdep_assert_held(&c->sb_lock);
+
+       mi              = bch2_sb_get_members(c->disk_sb.sb);
+       groups          = bch2_sb_get_disk_groups(c->disk_sb.sb);
+       nr_groups       = disk_groups_nr(groups);
+
+       if (!groups)
+               return 0;
+
+       cpu_g = kzalloc(sizeof(*cpu_g) +
+                       sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL);
+       if (!cpu_g)
+               return -ENOMEM;
+
+       cpu_g->nr = nr_groups;
+
+       for (i = 0; i < nr_groups; i++) {
+               struct bch_disk_group *src      = &groups->entries[i];
+               struct bch_disk_group_cpu *dst  = &cpu_g->entries[i];
+
+               dst->deleted    = BCH_GROUP_DELETED(src);
+               dst->parent     = BCH_GROUP_PARENT(src);
+       }
+
+       for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
+               struct bch_member *m = mi->members + i;
+               struct bch_disk_group_cpu *dst =
+                       &cpu_g->entries[BCH_MEMBER_GROUP(m)];
+
+               if (!bch2_member_exists(m))
+                       continue;
+
+               g = BCH_MEMBER_GROUP(m);
+               while (g) {
+                       dst = &cpu_g->entries[g - 1];
+                       __set_bit(i, dst->devs.d);
+                       g = dst->parent;
+               }
+       }
+
+       old_g = c->disk_groups;
+       rcu_assign_pointer(c->disk_groups, cpu_g);
+       if (old_g)
+               kfree_rcu(old_g, rcu);
+
+       return 0;
+}
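bch2_sb_disk_groups_to_cpu() follows the classic RCU replace pattern: build the new table completely, publish it with rcu_assign_pointer(), and hand the old one to kfree_rcu() so it is freed only after all current readers (rcu_dereference() users such as bch2_target_to_mask() below) are done. A userspace analogue of just the publish step, using C11 atomics; the deferred free is exactly the part RCU adds on top:

#include <stdatomic.h>
#include <stdlib.h>

struct toy_groups { unsigned nr; };

static _Atomic(struct toy_groups *) cur_groups;

static int replace_groups(unsigned nr)
{
	struct toy_groups *new = calloc(1, sizeof(*new));

	if (!new)
		return -1;
	new->nr = nr;		/* fully initialize before publishing */

	/* publish with release semantics: readers that acquire the
	 * pointer see initialized contents, never a half-built table */
	struct toy_groups *old =
		atomic_exchange_explicit(&cur_groups, new, memory_order_acq_rel);

	(void)old;	/* with RCU: kfree_rcu(old) once readers are done */
	return 0;
}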
+
+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
+{
+       struct target t = target_decode(target);
+
+       switch (t.type) {
+       case TARGET_DEV: {
+               struct bch_dev *ca = t.dev < c->sb.nr_devices
+                       ? rcu_dereference(c->devs[t.dev])
+                       : NULL;
+               return ca ? &ca->self : NULL;
+       }
+       case TARGET_GROUP: {
+               struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
+
+               return t.group < g->nr && !g->entries[t.group].deleted
+                       ? &g->entries[t.group].devs
+                       : NULL;
+       }
+       default:
+               BUG();
+       }
+}
+
+static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
+                                 unsigned parent,
+                                 const char *name, unsigned namelen)
+{
+       unsigned i, nr_groups = disk_groups_nr(groups);
+
+       if (!namelen || namelen > BCH_SB_LABEL_SIZE)
+               return -EINVAL;
+
+       for (i = 0; i < nr_groups; i++) {
+               struct bch_disk_group *g = groups->entries + i;
+
+               if (BCH_GROUP_DELETED(g))
+                       continue;
+
+               if (BCH_GROUP_PARENT(g) == parent &&
+                   strnlen(g->label, sizeof(g->label)) == namelen &&
+                   !memcmp(name, g->label, namelen))
+                       return i;
+       }
+
+       return -1;
+}
+
+static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent,
+                                const char *name, unsigned namelen)
+{
+       struct bch_sb_field_disk_groups *groups =
+               bch2_sb_get_disk_groups(sb->sb);
+       unsigned i, nr_groups = disk_groups_nr(groups);
+       struct bch_disk_group *g;
+
+       if (!namelen || namelen > BCH_SB_LABEL_SIZE)
+               return -EINVAL;
+
+       for (i = 0;
+            i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]);
+            i++)
+               ;
+
+       if (i == nr_groups) {
+               unsigned u64s =
+                       (sizeof(struct bch_sb_field_disk_groups) +
+                        sizeof(struct bch_disk_group) * (nr_groups + 1)) /
+                       sizeof(u64);
+
+               groups = bch2_sb_resize_disk_groups(sb, u64s);
+               if (!groups)
+                       return -ENOSPC;
+
+               nr_groups = disk_groups_nr(groups);
+       }
+
+       BUG_ON(i >= nr_groups);
+
+       g = &groups->entries[i];
+
+       memcpy(g->label, name, namelen);
+       if (namelen < sizeof(g->label))
+               g->label[namelen] = '\0';
+       SET_BCH_GROUP_DELETED(g, 0);
+       SET_BCH_GROUP_PARENT(g, parent);
+       SET_BCH_GROUP_DATA_ALLOWED(g, ~0);
+
+       return i;
+}
+
+int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name)
+{
+       struct bch_sb_field_disk_groups *groups =
+               bch2_sb_get_disk_groups(sb->sb);
+       int v = -1;
+
+       do {
+               const char *next = strchrnul(name, '.');
+               unsigned len = next - name;
+
+               if (*next == '.')
+                       next++;
+
+               v = __bch2_disk_group_find(groups, v + 1, name, len);
+               name = next;
+       } while (*name && v >= 0);
+
+       return v;
+}
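bch2_disk_path_find() resolves dotted labels such as "ssd.fast" one component at a time, each lookup scoped to the previously found group (v + 1, since 0 encodes "no parent"); strchrnul() returns the terminating NUL when there is no further '.', so the last component needs no special case. The tokenizing loop in isolation (strchrnul() is a GNU extension, hence _GNU_SOURCE):

#define _GNU_SOURCE		/* for strchrnul() */
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *name = "ssd.fast";

	do {
		const char *next = strchrnul(name, '.');
		unsigned len = next - name;

		if (*next == '.')
			next++;

		printf("component: %.*s\n", (int) len, name);
		name = next;
	} while (*name);

	return 0;		/* prints "ssd" then "fast" */
}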
+
+int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name)
+{
+       struct bch_sb_field_disk_groups *groups;
+       unsigned parent = 0;
+       int v = -1;
+
+       do {
+               const char *next = strchrnul(name, '.');
+               unsigned len = next - name;
+
+               if (*next == '.')
+                       next++;
+
+               groups = bch2_sb_get_disk_groups(sb->sb);
+
+               v = __bch2_disk_group_find(groups, parent, name, len);
+               if (v < 0)
+                       v = __bch2_disk_group_add(sb, parent, name, len);
+               if (v < 0)
+                       return v;
+
+               parent = v + 1;
+               name = next;
+       } while (*name && v >= 0);
+
+       return v;
+}
+
+int bch2_disk_path_print(struct bch_sb_handle *sb,
+                        char *buf, size_t len, unsigned v)
+{
+       char *out = buf, *end = out + len;
+       struct bch_sb_field_disk_groups *groups =
+               bch2_sb_get_disk_groups(sb->sb);
+       struct bch_disk_group *g;
+       unsigned nr = 0;
+       u16 path[32];
+
+       while (1) {
+               if (nr == ARRAY_SIZE(path))
+                       goto inval;
+
+               if (v >= disk_groups_nr(groups))
+                       goto inval;
+
+               g = groups->entries + v;
+
+               if (BCH_GROUP_DELETED(g))
+                       goto inval;
+
+               path[nr++] = v;
+
+               if (!BCH_GROUP_PARENT(g))
+                       break;
+
+               v = BCH_GROUP_PARENT(g) - 1;
+       }
+
+       while (nr) {
+               unsigned b = 0;
+
+               v = path[--nr];
+               g = groups->entries + v;
+
+               if (end != out)
+                       b = min_t(size_t, end - out,
+                                 strnlen(g->label, sizeof(g->label)));
+               memcpy(out, g->label, b);
+               if (b < end - out)
+                       out[b] = '\0';
+               out += b;
+
+               if (nr)
+                       out += scnprintf(out, end - out, ".");
+       }
+
+       return out - buf;
+inval:
+       return scnprintf(buf, len, "invalid group %u", v);
+}
+
+int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name)
+{
+       struct bch_member *mi;
+       int v = -1;
+
+       mutex_lock(&c->sb_lock);
+
+       if (!strlen(name) || !strcmp(name, "none"))
+               goto write_sb;
+
+       v = bch2_disk_path_find_or_create(&c->disk_sb, name);
+       if (v < 0) {
+               mutex_unlock(&c->sb_lock);
+               return v;
+       }
+
+write_sb:
+       mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
+       SET_BCH_MEMBER_GROUP(mi, v + 1);
+
+       bch2_write_super(c);
+       mutex_unlock(&c->sb_lock);
+
+       return 0;
+}
+
+int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v)
+{
+       struct bch_dev *ca;
+       int g;
+
+       if (!strlen(buf) || !strcmp(buf, "none")) {
+               *v = 0;
+               return 0;
+       }
+
+       /* Is it a device? */
+       ca = bch2_dev_lookup(c, buf);
+       if (!IS_ERR(ca)) {
+               *v = dev_to_target(ca->dev_idx);
+               percpu_ref_put(&ca->ref);
+               return 0;
+       }
+
+       mutex_lock(&c->sb_lock);
+       g = bch2_disk_path_find(&c->disk_sb, buf);
+       mutex_unlock(&c->sb_lock);
+
+       if (g >= 0) {
+               *v = group_to_target(g);
+               return 0;
+       }
+
+       return -EINVAL;
+}
+
+int bch2_opt_target_print(struct bch_fs *c, char *buf, size_t len, u64 v)
+{
+       struct target t = target_decode(v);
+       int ret;
+
+       switch (t.type) {
+       case TARGET_NULL:
+               return scnprintf(buf, len, "none");
+       case TARGET_DEV: {
+               struct bch_dev *ca;
+
+               rcu_read_lock();
+               ca = t.dev < c->sb.nr_devices
+                       ? rcu_dereference(c->devs[t.dev])
+                       : NULL;
+
+               if (ca && percpu_ref_tryget(&ca->io_ref)) {
+                       char b[BDEVNAME_SIZE];
+
+                       ret = scnprintf(buf, len, "/dev/%s",
+                                       bdevname(ca->disk_sb.bdev, b));
+                       percpu_ref_put(&ca->io_ref);
+               } else if (ca) {
+                       ret = scnprintf(buf, len, "offline device %u", t.dev);
+               } else {
+                       ret = scnprintf(buf, len, "invalid device %u", t.dev);
+               }
+
+               rcu_read_unlock();
+               break;
+       }
+       case TARGET_GROUP:
+               mutex_lock(&c->sb_lock);
+               ret = bch2_disk_path_print(&c->disk_sb, buf, len, t.group);
+               mutex_unlock(&c->sb_lock);
+               break;
+       default:
+               BUG();
+       }
+
+       return ret;
+}
diff --git a/libbcachefs/disk_groups.h b/libbcachefs/disk_groups.h
new file mode 100644 (file)
index 0000000..9da9805
--- /dev/null
+++ b/libbcachefs/disk_groups.h
@@ -0,0 +1,99 @@
+#ifndef _BCACHEFS_DISK_GROUPS_H
+#define _BCACHEFS_DISK_GROUPS_H
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups;
+
+static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
+{
+       return groups
+               ? (vstruct_end(&groups->field) -
+                  (void *) &groups->entries[0]) / sizeof(struct bch_disk_group)
+               : 0;
+}
+
+struct target {
+       enum {
+               TARGET_NULL,
+               TARGET_DEV,
+               TARGET_GROUP,
+       }                       type;
+       union {
+               unsigned        dev;
+               unsigned        group;
+       };
+};
+
+#define TARGET_DEV_START       1
+#define TARGET_GROUP_START     (256 + TARGET_DEV_START)
+
+static inline u16 dev_to_target(unsigned dev)
+{
+       return TARGET_DEV_START + dev;
+}
+
+static inline u16 group_to_target(unsigned group)
+{
+       return TARGET_GROUP_START + group;
+}
+
+static inline struct target target_decode(unsigned target)
+{
+       if (target >= TARGET_GROUP_START)
+               return (struct target) {
+                       .type   = TARGET_GROUP,
+                       .group  = target - TARGET_GROUP_START
+               };
+
+       if (target >= TARGET_DEV_START)
+               return (struct target) {
+                       .type   = TARGET_DEV,
+                       .dev    = target - TARGET_DEV_START
+               };
+
+       return (struct target) { .type = TARGET_NULL };
+}
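The target encoding packs all three kinds into one small integer: 0 means "none", values 1 through 256 are device indices 0-255, and 257 upward are group indices. A round-trip check using the same constants:

#include <assert.h>

#define TARGET_DEV_START	1
#define TARGET_GROUP_START	(256 + TARGET_DEV_START)

int main(void)
{
	/* device 3 -> 4; group 0 -> 257; 0 stays "none" */
	assert(TARGET_DEV_START + 3 == 4);
	assert(TARGET_GROUP_START + 0 == 257);

	/* decode: 4 is >= TARGET_DEV_START and < TARGET_GROUP_START */
	assert(4 - TARGET_DEV_START == 3);
	return 0;
}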
+
+static inline bool dev_in_target(struct bch_dev *ca, unsigned target)
+{
+       struct target t = target_decode(target);
+
+       switch (t.type) {
+       case TARGET_NULL:
+               return false;
+       case TARGET_DEV:
+               return ca->dev_idx == t.dev;
+       case TARGET_GROUP:
+               return ca->mi.group && ca->mi.group - 1 == t.group;
+       default:
+               BUG();
+       }
+}
+
+static inline bool dev_idx_in_target(struct bch_fs *c, unsigned dev, unsigned target)
+{
+       bool ret;
+
+       rcu_read_lock();
+       ret = dev_in_target(rcu_dereference(c->devs[dev]), target);
+       rcu_read_unlock();
+
+       return ret;
+}
+
+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
+
+int bch2_disk_path_find(struct bch_sb_handle *, const char *);
+int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *);
+int bch2_disk_path_print(struct bch_sb_handle *, char *, size_t, unsigned);
+
+int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *);
+int bch2_opt_target_print(struct bch_fs *, char *, size_t, u64);
+
+int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
+
+int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
+
+const char *bch2_sb_disk_groups_validate(struct bch_sb *,
+                                        struct bch_sb_field *);
+
+#endif /* _BCACHEFS_DISK_GROUPS_H */
index f73e7562e723b0ec9159b8ce9cadbff75c831b4e..c5d1e7cb539b30bfaa9449ab5e91328b5a7bfe11 100644 (file)
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
 #include "checksum.h"
 #include "debug.h"
 #include "dirent.h"
+#include "disk_groups.h"
 #include "error.h"
 #include "extents.h"
 #include "inode.h"
 #include "journal.h"
+#include "replicas.h"
 #include "super.h"
 #include "super-io.h"
 #include "util.h"
@@ -25,9 +27,6 @@
 
 #include <trace/events/bcachefs.h>
 
-static enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *,
-                                          struct bkey_i *, struct bkey_i *);
-
 static void sort_key_next(struct btree_node_iter_large *iter,
                          struct btree *b,
                          struct btree_node_iter_set *i)
@@ -160,9 +159,13 @@ bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned targ
 {
        const struct bch_extent_ptr *ptr;
 
-       extent_for_each_ptr(e, ptr)
-               if (dev_in_target(c->devs[ptr->dev], target))
+       extent_for_each_ptr(e, ptr) {
+               struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+               if (dev_in_target(ca, target) &&
+                   (!ptr->cached || !ptr_stale(ca, ptr)))
                        return ptr;
+       }
 
        return NULL;
 }
@@ -356,11 +359,25 @@ restart_narrow_pointers:
        return true;
 }
 
+/* returns true if not equal */
+static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
+                                        struct bch_extent_crc_unpacked r)
+{
+       return (l.csum_type             != r.csum_type ||
+               l.compression_type      != r.compression_type ||
+               l.compressed_size       != r.compressed_size ||
+               l.uncompressed_size     != r.uncompressed_size ||
+               l.offset                != r.offset ||
+               l.live_size             != r.live_size ||
+               l.nonce                 != r.nonce ||
+               bch2_crc_cmp(l.csum, r.csum));
+}
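bch2_crc_unpacked_cmp() replaces the memcmp() that a later hunk drops: memcmp() also compares padding bytes, which are indeterminate in stack-allocated structs, so two logically equal unpacked CRCs could still compare unequal (the new prev_u = { 0 } initialization only zeroes one operand, once). A standalone demonstration of the pitfall with a toy struct:

#include <stdio.h>
#include <string.h>

struct toy_crc {
	char type;	/* followed by padding up to the alignment of csum */
	long csum;
};

int main(void)
{
	struct toy_crc a, b;

	memset(&a, 0xaa, sizeof(a));
	memset(&b, 0xbb, sizeof(b));
	a.type = b.type = 1;
	a.csum = b.csum = 42;

	/* equal field by field, but memcmp() also sees the padding */
	printf("memcmp differs: %d\n", memcmp(&a, &b, sizeof(a)) != 0);
	printf("fields differ:  %d\n", a.type != b.type || a.csum != b.csum);
	return 0;	/* prints 1 then 0 */
}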
+
 void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
 {
        union bch_extent_entry *entry = e.v->start;
        union bch_extent_crc *crc, *prev = NULL;
-       struct bch_extent_crc_unpacked u, prev_u;
+       struct bch_extent_crc_unpacked u, prev_u = { 0 };
 
        while (entry != extent_entry_last(e)) {
                union bch_extent_entry *next = extent_entry_next(entry);
@@ -382,7 +399,7 @@ void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
                        goto drop;
                }
 
-               if (prev && !memcmp(&u, &prev_u, sizeof(u))) {
+               if (prev && !bch2_crc_unpacked_cmp(u, prev_u)) {
                        /* identical to previous crc entry: */
                        goto drop;
                }
@@ -439,13 +456,12 @@ static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
                bch2_extent_drop_redundant_crcs(e);
 }
 
-static bool bch2_ptr_normalize(struct bch_fs *c, struct btree *bk,
-                             struct bkey_s k)
+bool bch2_ptr_normalize(struct bch_fs *c, struct btree *b, struct bkey_s k)
 {
        return bch2_extent_normalize(c, k);
 }
 
-static void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
+void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
 {
        switch (k->type) {
        case BCH_EXTENT:
@@ -628,8 +644,7 @@ use:
 
 /* Btree ptrs */
 
-static const char *bch2_btree_ptr_invalid(const struct bch_fs *c,
-                                        struct bkey_s_c k)
+const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
        if (bkey_extent_is_cached(k.k))
                return "cached";
@@ -671,8 +686,8 @@ static const char *bch2_btree_ptr_invalid(const struct bch_fs *c,
        }
 }
 
-static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
-                                struct bkey_s_c k)
+void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
+                              struct bkey_s_c k)
 {
        struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
        const struct bch_extent_ptr *ptr;
@@ -727,8 +742,8 @@ err:
                      mark.gen, (unsigned) mark.counter);
 }
 
-static void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
-                                 size_t size, struct bkey_s_c k)
+void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
+                           size_t size, struct bkey_s_c k)
 {
        char *out = buf, *end = buf + size;
        const char *invalid;
@@ -756,13 +771,6 @@ bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b,
        return pick;
 }
 
-const struct bkey_ops bch2_bkey_btree_ops = {
-       .key_invalid    = bch2_btree_ptr_invalid,
-       .key_debugcheck = btree_ptr_debugcheck,
-       .val_to_text    = bch2_btree_ptr_to_text,
-       .swab           = bch2_ptr_swab,
-};
-
 /* Extents */
 
 static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
@@ -1436,7 +1444,7 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
 }
 
 static enum btree_insert_ret
-bch2_delete_fixup_extent(struct extent_insert_state *s)
+__bch2_delete_fixup_extent(struct extent_insert_state *s)
 {
        struct bch_fs *c = s->trans->c;
        struct btree_iter *iter = s->insert->iter;
@@ -1450,8 +1458,7 @@ bch2_delete_fixup_extent(struct extent_insert_state *s)
 
        EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
 
-       s->whiteout     = *insert;
-       s->do_journal   = false;
+       s->whiteout = *insert;
 
        while (bkey_cmp(s->committed, insert->k.p) < 0 &&
               (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
@@ -1474,12 +1481,12 @@ bch2_delete_fixup_extent(struct extent_insert_state *s)
                overlap = bch2_extent_overlap(&insert->k, k.k);
 
                ret = extent_insert_check_split_compressed(s, k.s_c, overlap);
-               if (ret != BTREE_INSERT_OK)
-                       goto stop;
+               if (ret)
+                       break;
 
                ret = extent_insert_advance_pos(s, k.s_c);
                if (ret)
-                       goto stop;
+                       break;
 
                s->do_journal = true;
 
@@ -1520,25 +1527,65 @@ next:
                bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
        }
 
-       if (ret == BTREE_INSERT_OK &&
-           bkey_cmp(s->committed, insert->k.p) < 0)
-               ret = extent_insert_advance_pos(s, bkey_s_c_null);
-stop:
-       extent_insert_committed(s);
+       return ret;
+}
 
-       bch2_fs_usage_apply(c, &s->stats, s->trans->disk_res,
-                          gc_pos_btree_node(b));
+static enum btree_insert_ret
+__bch2_insert_fixup_extent(struct extent_insert_state *s)
+{
+       struct btree_iter *iter = s->insert->iter;
+       struct btree_iter_level *l = &iter->l[0];
+       struct btree *b = l->b;
+       struct btree_node_iter *node_iter = &l->iter;
+       struct bkey_packed *_k;
+       struct bkey unpacked;
+       struct bkey_i *insert = s->insert->k;
+       enum btree_insert_ret ret = BTREE_INSERT_OK;
 
-       EBUG_ON(bkey_cmp(iter->pos, s->committed));
-       EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) !=
-               !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF));
+       while (bkey_cmp(s->committed, insert->k.p) < 0 &&
+              (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
+              (_k = bch2_btree_node_iter_peek_all(node_iter, b))) {
+               struct bset_tree *t = bch2_bkey_to_bset(b, _k);
+               struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
+               enum bch_extent_overlap overlap;
 
-       bch2_cut_front(iter->pos, insert);
+               EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
+               EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
 
-       if (insert->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF))
-               ret = BTREE_INSERT_NEED_TRAVERSE;
+               if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
+                       break;
+
+               overlap = bch2_extent_overlap(&insert->k, k.k);
+
+               ret = extent_insert_check_split_compressed(s, k.s_c, overlap);
+               if (ret)
+                       break;
+
+               if (!k.k->size)
+                       goto squash;
+
+               /*
+                * Only call advance pos & call hook for nonzero size extents:
+                */
+               ret = extent_insert_advance_pos(s, k.s_c);
+               if (ret)
+                       break;
 
-       EBUG_ON(insert->k.size && ret == BTREE_INSERT_OK);
+               if (k.k->size &&
+                   (k.k->needs_whiteout || bset_written(b, bset(b, t))))
+                       insert->k.needs_whiteout = true;
+
+               if (overlap == BCH_EXTENT_OVERLAP_ALL &&
+                   bkey_whiteout(k.k) &&
+                   k.k->needs_whiteout) {
+                       unreserve_whiteout(b, t, _k);
+                       _k->needs_whiteout = false;
+               }
+squash:
+               ret = extent_squash(s, insert, t, _k, k, overlap);
+               if (ret != BTREE_INSERT_OK)
+                       break;
+       }
 
        return ret;
 }
@@ -1590,9 +1637,6 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
        struct btree_iter *iter = insert->iter;
        struct btree_iter_level *l = &iter->l[0];
        struct btree *b = l->b;
-       struct btree_node_iter *node_iter = &l->iter;
-       struct bkey_packed *_k;
-       struct bkey unpacked;
        enum btree_insert_ret ret = BTREE_INSERT_OK;
 
        struct extent_insert_state s = {
@@ -1605,9 +1649,6 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
        EBUG_ON(iter->level);
        EBUG_ON(bkey_deleted(&insert->k->k) || !insert->k->k.size);
 
-       if (s.deleting)
-               return bch2_delete_fixup_extent(&s);
-
        /*
         * As we process overlapping extents, we advance @iter->pos both to
         * signal to our caller (btree_insert_key()) how much of @insert->k has
@@ -1616,67 +1657,32 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
         */
        EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
 
-       if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+       if (!s.deleting &&
+           !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
                bch2_add_sectors(&s, bkey_i_to_s_c(insert->k),
                                bkey_start_offset(&insert->k->k),
                                insert->k->k.size);
 
-       while (bkey_cmp(s.committed, insert->k->k.p) < 0 &&
-              (ret = extent_insert_should_stop(&s)) == BTREE_INSERT_OK &&
-              (_k = bch2_btree_node_iter_peek_all(node_iter, b))) {
-               struct bset_tree *t = bch2_bkey_to_bset(b, _k);
-               struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
-               enum bch_extent_overlap overlap;
-
-               EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
-               EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
-
-               if (bkey_cmp(bkey_start_pos(k.k), insert->k->k.p) >= 0)
-                       break;
-
-               overlap = bch2_extent_overlap(&insert->k->k, k.k);
-
-               ret = extent_insert_check_split_compressed(&s, k.s_c, overlap);
-               if (ret != BTREE_INSERT_OK)
-                       goto stop;
-
-               if (!k.k->size)
-                       goto squash;
-
-               /*
-                * Only call advance pos & call hook for nonzero size extents:
-                */
-               ret = extent_insert_advance_pos(&s, k.s_c);
-               if (ret != BTREE_INSERT_OK)
-                       goto stop;
-
-               if (k.k->size &&
-                   (k.k->needs_whiteout || bset_written(b, bset(b, t))))
-                       insert->k->k.needs_whiteout = true;
-
-               if (overlap == BCH_EXTENT_OVERLAP_ALL &&
-                   bkey_whiteout(k.k) &&
-                   k.k->needs_whiteout) {
-                       unreserve_whiteout(b, t, _k);
-                       _k->needs_whiteout = false;
-               }
-squash:
-               ret = extent_squash(&s, insert->k, t, _k, k, overlap);
-               if (ret != BTREE_INSERT_OK)
-                       goto stop;
-       }
+       ret = !s.deleting
+               ? __bch2_insert_fixup_extent(&s)
+               : __bch2_delete_fixup_extent(&s);
 
        if (ret == BTREE_INSERT_OK &&
            bkey_cmp(s.committed, insert->k->k.p) < 0)
                ret = extent_insert_advance_pos(&s, bkey_s_c_null);
-stop:
+
        extent_insert_committed(&s);
+
+       if (s.deleting)
+               bch2_cut_front(iter->pos, insert->k);
+
        /*
         * Subtract any remaining sectors from @insert, if we bailed out early
         * and didn't fully insert @insert:
         */
-       if (insert->k->k.size &&
-           !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+       if (!s.deleting &&
+           !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
+           insert->k->k.size)
                bch2_subtract_sectors(&s, bkey_i_to_s_c(insert->k),
                                     bkey_start_offset(&insert->k->k),
                                     insert->k->k.size);
@@ -1692,13 +1698,13 @@ stop:
        if (insert->k->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF))
                ret = BTREE_INSERT_NEED_TRAVERSE;
 
-       EBUG_ON(insert->k->k.size && ret == BTREE_INSERT_OK);
+       WARN_ONCE((ret == BTREE_INSERT_OK) != (insert->k->k.size == 0),
+                 "ret %u insert->k.size %u", ret, insert->k->k.size);
 
        return ret;
 }
 
-static const char *bch2_extent_invalid(const struct bch_fs *c,
-                                      struct bkey_s_c k)
+const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
        if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX)
                return "value too big";
@@ -1865,8 +1871,7 @@ bad_ptr:
        return;
 }
 
-static void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b,
-                                  struct bkey_s_c k)
+void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
 {
        switch (k.k->type) {
        case BCH_EXTENT:
@@ -1880,8 +1885,8 @@ static void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b,
        }
 }
 
-static void bch2_extent_to_text(struct bch_fs *c, char *buf,
-                               size_t size, struct bkey_s_c k)
+void bch2_extent_to_text(struct bch_fs *c, char *buf,
+                        size_t size, struct bkey_s_c k)
 {
        char *out = buf, *end = buf + size;
        const char *invalid;
@@ -1963,7 +1968,7 @@ void bch2_extent_crc_append(struct bkey_i_extent *e,
        extent_for_each_crc(extent_i_to_s(e), crc, i)
                ;
 
-       if (!memcmp(&crc, &new, sizeof(crc)))
+       if (!bch2_crc_unpacked_cmp(crc, new))
                return;
 
        bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new);
@@ -2089,9 +2094,8 @@ void bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
        }
 }
 
-static enum merge_result bch2_extent_merge(struct bch_fs *c,
-                                          struct btree *bk,
-                                          struct bkey_i *l, struct bkey_i *r)
+enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b,
+                                   struct bkey_i *l, struct bkey_i *r)
 {
        struct bkey_s_extent el, er;
        union bch_extent_entry *en_l, *en_r;
@@ -2410,13 +2414,3 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
 
        return ret;
 }
-
-const struct bkey_ops bch2_bkey_extent_ops = {
-       .key_invalid    = bch2_extent_invalid,
-       .key_debugcheck = bch2_extent_debugcheck,
-       .val_to_text    = bch2_extent_to_text,
-       .swab           = bch2_ptr_swab,
-       .key_normalize  = bch2_ptr_normalize,
-       .key_merge      = bch2_extent_merge,
-       .is_extents     = true,
-};
index 376e51c9381651cd0e44c91c77df546bfe685c81..8dc15484f4853e04a14f38eedfa191af2ef27123 100644 (file)
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -15,6 +15,36 @@ struct extent_insert_hook;
 struct bch_devs_mask;
 union bch_extent_crc;
 
+const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_btree_ptr_debugcheck(struct bch_fs *, struct btree *,
+                              struct bkey_s_c);
+void bch2_btree_ptr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *);
+
+#define bch2_bkey_btree_ops (struct bkey_ops) {                        \
+       .key_invalid    = bch2_btree_ptr_invalid,               \
+       .key_debugcheck = bch2_btree_ptr_debugcheck,            \
+       .val_to_text    = bch2_btree_ptr_to_text,               \
+       .swab           = bch2_ptr_swab,                        \
+}
+
+const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_extent_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c);
+void bch2_extent_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+bool bch2_ptr_normalize(struct bch_fs *, struct btree *, struct bkey_s);
+enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *,
+                                   struct bkey_i *, struct bkey_i *);
+
+#define bch2_bkey_extent_ops (struct bkey_ops) {               \
+       .key_invalid    = bch2_extent_invalid,                  \
+       .key_debugcheck = bch2_extent_debugcheck,               \
+       .val_to_text    = bch2_extent_to_text,                  \
+       .swab           = bch2_ptr_swab,                        \
+       .key_normalize  = bch2_ptr_normalize,                   \
+       .key_merge      = bch2_extent_merge,                    \
+       .is_extents     = true,                                 \
+}
+
 struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *,
                                                  struct btree *,
                                                  struct btree_node_iter_large *);
@@ -23,9 +53,6 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
                                                     struct btree *,
                                                     struct btree_node_iter_large *);
 
-extern const struct bkey_ops bch2_bkey_btree_ops;
-extern const struct bkey_ops bch2_bkey_extent_ops;
-
 struct extent_pick_ptr
 bch2_btree_pick_ptr(struct bch_fs *, const struct btree *,
                    struct bch_devs_mask *avoid);
index cb90738c5c26b00d86d6f3038cec243ae569e97a..d1473f2a315a2a77f8a7209bc63203baefffa51b 100644 (file)
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -468,7 +468,10 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
                }
 
                BUG_ON(bkey_cmp(extent_iter.pos, bkey_start_pos(&k->k)));
-               BUG_ON(!ret != !k->k.size);
+
+               if (WARN_ONCE(!ret != !k->k.size,
+                             "ret %i k->size %u", ret, k->k.size))
+                       ret = k->k.size ? -EINTR : 0;
 err:
                if (ret == -EINTR)
                        continue;
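The fs-io.c hunk turns a BUG_ON() into WARN_ONCE() plus best-effort recovery: the broken invariant is reported loudly once, then the write path picks the least-bad continuation instead of panicking. A userspace sketch in the same spirit (GNU C statement expression, like the kernel's WARN_ONCE(), but not the kernel macro itself):

#include <stdio.h>

/* GNU C statement expression; a sketch, not the kernel macro. */
#define warn_once_sketch(cond, fmt, ...) ({				\
	static int __warned;						\
	int __c = (cond);						\
	if (__c && !__warned) {						\
		__warned = 1;						\
		fprintf(stderr, fmt "\n", ##__VA_ARGS__);		\
	}								\
	__c;								\
})

int recover(int ret, unsigned size)
{
	/* assert the invariant loudly once, then pick a sane fallback */
	if (warn_once_sketch(!ret != !size, "ret %i size %u", ret, size))
		ret = size ? -1 : 0;	/* -1 standing in for -EINTR */
	return ret;
}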
index 797aa2a981e3605f3310c32946b504c380dc6179..3ae5ac975dfb0aec9d62e6865daf6ad3e786eb77 100644 (file)
--- a/libbcachefs/inode.c
+++ b/libbcachefs/inode.c
@@ -175,8 +175,7 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode,
        return 0;
 }
 
-static const char *bch2_inode_invalid(const struct bch_fs *c,
-                                     struct bkey_s_c k)
+const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
        if (k.k->p.offset)
                return "nonzero offset";
@@ -224,8 +223,8 @@ static const char *bch2_inode_invalid(const struct bch_fs *c,
        }
 }
 
-static void bch2_inode_to_text(struct bch_fs *c, char *buf,
-                              size_t size, struct bkey_s_c k)
+void bch2_inode_to_text(struct bch_fs *c, char *buf,
+                       size_t size, struct bkey_s_c k)
 {
        char *out = buf, *end = out + size;
        struct bkey_s_c_inode inode;
@@ -247,11 +246,6 @@ static void bch2_inode_to_text(struct bch_fs *c, char *buf,
        }
 }
 
-const struct bkey_ops bch2_bkey_inode_ops = {
-       .key_invalid    = bch2_inode_invalid,
-       .val_to_text    = bch2_inode_to_text,
-};
-
 void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u,
                     uid_t uid, gid_t gid, umode_t mode, dev_t rdev,
                     struct bch_inode_unpacked *parent)
index 5c7aeadcb1a6c2799e65a045a07a4888e0d05590..26461063f774d35581f0c78ad795ab73d44ccef1 100644 (file)
@@ -5,7 +5,13 @@
 
 #include <linux/math64.h>
 
-extern const struct bkey_ops bch2_bkey_inode_ops;
+const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_inode_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+
+#define bch2_bkey_inode_ops (struct bkey_ops) {                \
+       .key_invalid    = bch2_inode_invalid,           \
+       .val_to_text    = bch2_inode_to_text,           \
+}
 
 struct bch_inode_unpacked {
        u64                     bi_inum;
index 7ee9c3928039b98bdb9b2d73df0c7aa24c4daf9f..27e45081313fc94600b44b8805050c887df75f10 100644 (file)
@@ -20,6 +20,7 @@
 #include "journal.h"
 #include "keylist.h"
 #include "move.h"
+#include "replicas.h"
 #include "super.h"
 #include "super-io.h"
 #include "tier.h"
@@ -196,8 +197,6 @@ static void bch2_write_done(struct closure *cl)
 {
        struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
 
-       BUG_ON(!(op->flags & BCH_WRITE_DONE));
-
        if (!op->error && (op->flags & BCH_WRITE_FLUSH))
                op->error = bch2_journal_error(&op->c->journal);
 
@@ -205,7 +204,6 @@ static void bch2_write_done(struct closure *cl)
                bch2_disk_reservation_put(op->c, &op->res);
        percpu_ref_put(&op->c->writes);
        bch2_keylist_free(&op->insert_keys, op->inline_keys);
-       op->flags &= ~(BCH_WRITE_DONE|BCH_WRITE_LOOPED);
 
        closure_return(cl);
 }
@@ -232,9 +230,8 @@ int bch2_write_index_default(struct bch_write_op *op)
 /**
  * bch_write_index - after a write, update index to point to new data
  */
-static void bch2_write_index(struct closure *cl)
+static void __bch2_write_index(struct bch_write_op *op)
 {
-       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
        struct bch_fs *c = op->c;
        struct keylist *keys = &op->insert_keys;
        struct bkey_s_extent e;
@@ -242,8 +239,6 @@ static void bch2_write_index(struct closure *cl)
        struct bkey_i *src, *dst = keys->keys, *n, *k;
        int ret;
 
-       op->flags |= BCH_WRITE_LOOPED;
-
        for (src = keys->keys; src != keys->top; src = n) {
                n = bkey_next(src);
                bkey_copy(dst, src);
@@ -292,9 +287,19 @@ static void bch2_write_index(struct closure *cl)
        }
 out:
        bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets);
+       return;
+err:
+       keys->top = keys->keys;
+       op->error = ret;
+       goto out;
+}
 
-       if (!(op->flags & BCH_WRITE_DONE))
-               continue_at(cl, __bch2_write, op->io_wq);
+static void bch2_write_index(struct closure *cl)
+{
+       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+       struct bch_fs *c = op->c;
+
+       __bch2_write_index(op);
 
        if (!op->error && (op->flags & BCH_WRITE_FLUSH)) {
                bch2_journal_flush_seq_async(&c->journal,
@@ -304,12 +309,6 @@ out:
        } else {
                continue_at_nobarrier(cl, bch2_write_done, NULL);
        }
-       return;
-err:
-       keys->top = keys->keys;
-       op->error = ret;
-       op->flags |= BCH_WRITE_DONE;
-       goto out;
 }
 
 static void bch2_write_endio(struct bio *bio)
@@ -730,18 +729,18 @@ static void __bch2_write(struct closure *cl)
        struct bch_fs *c = op->c;
        struct write_point *wp;
        int ret;
-
+again:
        do {
                /* +1 for possible cache device: */
                if (op->open_buckets_nr + op->nr_replicas + 1 >
                    ARRAY_SIZE(op->open_buckets))
-                       continue_at(cl, bch2_write_index, index_update_wq(op));
+                       goto flush_io;
 
                if (bch2_keylist_realloc(&op->insert_keys,
                                        op->inline_keys,
                                        ARRAY_SIZE(op->inline_keys),
                                        BKEY_EXTENT_U64s_MAX))
-                       continue_at(cl, bch2_write_index, index_update_wq(op));
+                       goto flush_io;
 
                wp = bch2_alloc_sectors_start(c,
                        op->target,
@@ -760,33 +759,7 @@ static void __bch2_write(struct closure *cl)
                                goto err;
                        }
 
-                       /*
-                        * If we already have some keys, must insert them first
-                        * before allocating another open bucket. We only hit
-                        * this case if open_bucket_nr > 1.
-                        */
-                       if (!bch2_keylist_empty(&op->insert_keys))
-                               continue_at(cl, bch2_write_index,
-                                           index_update_wq(op));
-
-                       /*
-                        * If we've looped, we're running out of a workqueue -
-                        * not the bch2_write() caller's context - and we don't
-                        * want to block the workqueue:
-                        */
-                       if (op->flags & BCH_WRITE_LOOPED)
-                               continue_at(cl, __bch2_write, op->io_wq);
-
-                       /*
-                        * Otherwise, we do want to block the caller on alloc
-                        * failure instead of letting it queue up more and more
-                        * writes:
-                        * XXX: this technically needs a try_to_freeze() -
-                        * except that that's not safe because caller may have
-                        * issued other IO... hmm..
-                        */
-                       closure_sync(cl);
-                       continue;
+                       goto flush_io;
                }
 
                ret = bch2_write_extent(op, wp);
@@ -802,28 +775,24 @@ static void __bch2_write(struct closure *cl)
                        goto err;
        } while (ret);
 
-       op->flags |= BCH_WRITE_DONE;
        continue_at(cl, bch2_write_index, index_update_wq(op));
 err:
-       /*
-        * Right now we can only error here if we went RO - the
-        * allocation failed, but we already checked for -ENOSPC when we
-        * got our reservation.
-        *
-        * XXX capacity might have changed, but we don't check for that
-        * yet:
-        */
        op->error = ret;
-       op->flags |= BCH_WRITE_DONE;
 
-       /*
-        * No reason not to insert keys for whatever data was successfully
-        * written (especially for a cmpxchg operation that's moving data
-        * around)
-        */
        continue_at(cl, !bch2_keylist_empty(&op->insert_keys)
                    ? bch2_write_index
                    : bch2_write_done, index_update_wq(op));
+flush_io:
+       closure_sync(cl);
+
+       if (!bch2_keylist_empty(&op->insert_keys)) {
+               __bch2_write_index(op);
+
+               if (op->error)
+                       continue_at_nobarrier(cl, bch2_write_done, NULL);
+       }
+
+       goto again;
 }
 
 /**
@@ -969,7 +938,7 @@ static bool should_promote(struct bch_fs *c, struct bkey_s_c_extent e,
        if (percpu_ref_is_dying(&c->writes))
                return false;
 
-       return bch2_extent_has_target(c, e, target);
+       return bch2_extent_has_target(c, e, target) == NULL;
 }
 
 /* Read */
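
The io.c rewrite above removes the BCH_WRITE_DONE/BCH_WRITE_LOOPED state machine: rather than re-queueing the closure between __bch2_write() and bch2_write_index(), the allocation loop now waits in place (closure_sync()), flushes accumulated keys via __bch2_write_index(), and jumps back to the again label. A schematic of the new control flow in plain C, with hypothetical helpers standing in for the closure and allocator machinery:

	#include <stdio.h>

	#define DEMO_EAGAIN 11	/* stand-in for -EAGAIN / "must flush first" */

	static int try_write_some(int *work)	/* bch2_write_extent() analogue */
	{
		if (*work > 4)
			return -DEMO_EAGAIN;	/* allocation failed, flush and retry */
		return (*work)-- > 0;		/* >0: more to do, 0: done */
	}

	static void flush_index_updates(void)	/* __bch2_write_index() analogue */
	{
		puts("flushing queued index updates");
	}

	static int write_loop(int work)
	{
		int ret;
	again:
		do {
			ret = try_write_some(&work);
			if (ret == -DEMO_EAGAIN)
				goto flush_io;
		} while (ret > 0);

		return 0;		/* continue_at(bch2_write_index) analogue */
	flush_io:
		/* closure_sync() would wait for in-flight IO here */
		flush_index_updates();
		work = 3;		/* pretend the flush freed resources */
		goto again;
	}

	int main(void)
	{
		return write_loop(8);
	}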
index bf0b17e1deb9626d14ef0c18a6868ec59be96804..a0c795abe9bdb9b31f343eb230dc2064f3eb2fb7 100644 (file)
@@ -36,8 +36,6 @@ enum bch_write_flags {
 
        /* Internal: */
        BCH_WRITE_JOURNAL_SEQ_PTR       = (1 << 9),
-       BCH_WRITE_DONE                  = (1 << 10),
-       BCH_WRITE_LOOPED                = (1 << 11),
 };
 
 static inline u64 *op_journal_seq(struct bch_write_op *op)
index e50007673e4acd5a0d587bec05e9a6a4c0e6e538..b525a85c27e46b25f0a5153543a0a28b7e57ca68 100644 (file)
@@ -19,6 +19,7 @@
 #include "io.h"
 #include "keylist.h"
 #include "journal.h"
+#include "replicas.h"
 #include "super-io.h"
 #include "vstructs.h"
 
@@ -1582,40 +1583,19 @@ err:
        return ret;
 }
 
-/*
- * Allocate more journal space at runtime - not currently making use if it, but
- * the code works:
- */
-static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
-                                      unsigned nr)
+static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
+                                        bool new_fs, struct closure *cl)
 {
-       struct journal *j = &c->journal;
+       struct bch_fs *c = ca->fs;
        struct journal_device *ja = &ca->journal;
        struct bch_sb_field_journal *journal_buckets;
-       struct disk_reservation disk_res = { 0, 0 };
-       struct closure cl;
        u64 *new_bucket_seq = NULL, *new_buckets = NULL;
        int ret = 0;
 
-       closure_init_stack(&cl);
-
        /* don't handle reducing nr of buckets yet: */
        if (nr <= ja->nr)
                return 0;
 
-       /*
-        * note: journal buckets aren't really counted as _sectors_ used yet, so
-        * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
-        * when space used goes up without a reservation - but we do need the
-        * reservation to ensure we'll actually be able to allocate:
-        */
-
-       if (bch2_disk_reservation_get(c, &disk_res,
-                       bucket_to_sector(ca, nr - ja->nr), 1, 0))
-               return -ENOSPC;
-
-       mutex_lock(&c->sb_lock);
-
        ret = -ENOMEM;
        new_buckets     = kzalloc(nr * sizeof(u64), GFP_KERNEL);
        new_bucket_seq  = kzalloc(nr * sizeof(u64), GFP_KERNEL);
@@ -1627,29 +1607,41 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
        if (!journal_buckets)
                goto err;
 
-       spin_lock(&j->lock);
+       if (c)
+               spin_lock(&c->journal.lock);
+
        memcpy(new_buckets,     ja->buckets,    ja->nr * sizeof(u64));
        memcpy(new_bucket_seq,  ja->bucket_seq, ja->nr * sizeof(u64));
        swap(new_buckets,       ja->buckets);
        swap(new_bucket_seq,    ja->bucket_seq);
-       spin_unlock(&j->lock);
+
+       if (c)
+               spin_unlock(&c->journal.lock);
 
        while (ja->nr < nr) {
-               struct open_bucket *ob;
-               size_t bucket;
-               int ob_idx;
+               struct open_bucket *ob = NULL;
+               long bucket;
 
-               ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, &cl);
-               if (ob_idx < 0) {
-                       if (!closure_wait(&c->freelist_wait, &cl))
-                               closure_sync(&cl);
-                       continue;
+               if (new_fs) {
+                       bucket = bch2_bucket_alloc_new_fs(ca);
+                       if (bucket < 0) {
+                               ret = -ENOSPC;
+                               goto err;
+                       }
+               } else {
+                       int ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, cl);
+                       if (ob_idx < 0) {
+                               ret = cl ? -EAGAIN : -ENOSPC;
+                               goto err;
+                       }
+
+                       ob = c->open_buckets + ob_idx;
+                       bucket = sector_to_bucket(ca, ob->ptr.offset);
                }
 
-               ob = c->open_buckets + ob_idx;
-               bucket = sector_to_bucket(ca, ob->ptr.offset);
+               if (c)
+                       spin_lock(&c->journal.lock);
 
-               spin_lock(&j->lock);
                __array_insert_item(ja->buckets,                ja->nr, ja->last_idx);
                __array_insert_item(ja->bucket_seq,             ja->nr, ja->last_idx);
                __array_insert_item(journal_buckets->buckets,   ja->nr, ja->last_idx);
@@ -1664,34 +1656,77 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
                        ja->last_idx++;
                }
                ja->nr++;
-               spin_unlock(&j->lock);
 
-               bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
-                                         ca->mi.bucket_size,
-                                         gc_phase(GC_PHASE_SB), 0);
+               if (c)
+                       spin_unlock(&c->journal.lock);
 
-               bch2_open_bucket_put(c, ob);
+               bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
+                               ca->mi.bucket_size,
+                               gc_phase(GC_PHASE_SB),
+                               new_fs
+                               ? BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE
+                               : 0);
+
+               if (!new_fs)
+                       bch2_open_bucket_put(c, ob);
        }
 
-       bch2_write_super(c);
-
        ret = 0;
 err:
-       mutex_unlock(&c->sb_lock);
-
        kfree(new_bucket_seq);
        kfree(new_buckets);
-       bch2_disk_reservation_put(c, &disk_res);
 
-       if (!ret)
-               bch2_dev_allocator_add(c, ca);
+       return ret;
+}
+
+/*
+ * Allocate more journal space at runtime - not currently making use of it, but
+ * the code works:
+ */
+int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
+                               unsigned nr)
+{
+       struct journal_device *ja = &ca->journal;
+       struct closure cl;
+       unsigned current_nr;
+       int ret;
+
+       closure_init_stack(&cl);
+
+       do {
+               struct disk_reservation disk_res = { 0, 0 };
+
+               closure_sync(&cl);
+
+               mutex_lock(&c->sb_lock);
+               current_nr = ja->nr;
+
+               /*
+                * note: journal buckets aren't really counted as _sectors_ used yet, so
+                * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
+                * when space used goes up without a reservation - but we do need the
+                * reservation to ensure we'll actually be able to allocate:
+                */
+
+               if (bch2_disk_reservation_get(c, &disk_res,
+                               bucket_to_sector(ca, nr - ja->nr), 1, 0)) {
+                       mutex_unlock(&c->sb_lock);
+                       return -ENOSPC;
+               }
+
+               ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl);
+
+               bch2_disk_reservation_put(c, &disk_res);
 
-       closure_sync(&cl);
+               if (ja->nr != current_nr)
+                       bch2_write_super(c);
+               mutex_unlock(&c->sb_lock);
+       } while (ret == -EAGAIN);
 
        return ret;
 }
 
-int bch2_dev_journal_alloc(struct bch_fs *c, struct bch_dev *ca)
+int bch2_dev_journal_alloc(struct bch_dev *ca)
 {
        unsigned nr;
 
@@ -1707,7 +1742,7 @@ int bch2_dev_journal_alloc(struct bch_fs *c, struct bch_dev *ca)
                     min(1 << 10,
                         (1 << 20) / ca->mi.bucket_size));
 
-       return bch2_set_nr_journal_buckets(c, ca, nr);
+       return __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
 }
 
 /* Journalling */
@@ -2320,8 +2355,8 @@ static void journal_write(struct closure *cl)
 
        journal_write_compact(jset);
 
-       jset->read_clock        = cpu_to_le16(c->prio_clock[READ].hand);
-       jset->write_clock       = cpu_to_le16(c->prio_clock[WRITE].hand);
+       jset->read_clock        = cpu_to_le16(c->bucket_clock[READ].hand);
+       jset->write_clock       = cpu_to_le16(c->bucket_clock[WRITE].hand);
        jset->magic             = cpu_to_le64(jset_magic(c));
        jset->version           = cpu_to_le32(BCACHE_JSET_VERSION);
 
index 46ae8f0d256dd408213d50f2d3d823efa167ac18..cf5cc9ba008eebc5f375058804296ecda63f5d6f 100644 (file)
@@ -400,7 +400,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
 ssize_t bch2_journal_print_debug(struct journal *, char *);
 ssize_t bch2_journal_print_pins(struct journal *, char *);
 
-int bch2_dev_journal_alloc(struct bch_fs *, struct bch_dev *);
+int bch2_dev_journal_alloc(struct bch_dev *);
 
 void bch2_dev_journal_stop(struct journal *, struct bch_dev *);
 void bch2_fs_journal_stop(struct journal *);
index 1bc0e714222045db3f7825dd0fe01259bbfe8949..ea519102a22825f03a2923e2dbd5792852b3aa38 100644 (file)
@@ -11,6 +11,7 @@
 #include "keylist.h"
 #include "migrate.h"
 #include "move.h"
+#include "replicas.h"
 #include "super-io.h"
 
 static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
index 07d2e2c881dbd8f4be030623c9d33b5224a4c77d..87e6e80cb77ae104ea6f58e4fc5162398a30f1db 100644 (file)
@@ -6,6 +6,7 @@
 #include "inode.h"
 #include "io.h"
 #include "move.h"
+#include "replicas.h"
 #include "super-io.h"
 #include "keylist.h"
 
index 3b4a5292ef6a4007a0f0e71d248ad4a0157d7996..28dabca74565e3046ad396f03ffbf70ff48e4d4c 100644 (file)
@@ -9,6 +9,7 @@
 #include "btree_update.h"
 #include "buckets.h"
 #include "clock.h"
+#include "disk_groups.h"
 #include "extents.h"
 #include "eytzinger.h"
 #include "io.h"
@@ -51,7 +52,7 @@ static inline int sectors_used_cmp(copygc_heap *heap,
                                   struct copygc_heap_entry l,
                                   struct copygc_heap_entry r)
 {
-       return bucket_sectors_used(l.mark) - bucket_sectors_used(r.mark);
+       return (l.sectors > r.sectors) - (l.sectors < r.sectors);
 }
 
 static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
@@ -78,7 +79,7 @@ static bool __copygc_pred(struct bch_dev *ca,
 
                return (i >= 0 &&
                        ptr->offset < h->data[i].offset + ca->mi.bucket_size &&
-                       ptr->gen == h->data[i].mark.gen);
+                       ptr->gen == h->data[i].gen);
        }
 
        return false;
@@ -154,8 +155,9 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
                        continue;
 
                e = (struct copygc_heap_entry) {
-                       .offset = bucket_to_sector(ca, b),
-                       .mark   = m
+                       .gen            = m.gen,
+                       .sectors        = bucket_sectors_used(m),
+                       .offset         = bucket_to_sector(ca, b),
                };
                heap_add_or_replace(h, e, -sectors_used_cmp);
        }
@@ -163,11 +165,11 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
        up_read(&c->gc_lock);
 
        for (i = h->data; i < h->data + h->used; i++)
-               sectors_to_move += bucket_sectors_used(i->mark);
+               sectors_to_move += i->sectors;
 
        while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) {
                BUG_ON(!heap_pop(h, e, -sectors_used_cmp));
-               sectors_to_move -= bucket_sectors_used(e.mark);
+               sectors_to_move -= e.sectors;
        }
 
        buckets_to_move = h->used;
@@ -191,7 +193,7 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
                size_t b = sector_to_bucket(ca, i->offset);
                struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
 
-               if (i->mark.gen == m.gen && bucket_sectors_used(m)) {
+               if (i->gen == m.gen && bucket_sectors_used(m)) {
                        sectors_not_moved += bucket_sectors_used(m);
                        buckets_not_moved++;
                }
@@ -284,7 +286,8 @@ int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca)
        if (bch2_fs_init_fault("copygc_start"))
                return -ENOMEM;
 
-       t = kthread_create(bch2_copygc_thread, ca, "bch_copygc");
+       t = kthread_create(bch2_copygc_thread, ca,
+                          "bch_copygc[%s]", ca->name);
        if (IS_ERR(t))
                return PTR_ERR(t);
 
index 326b8ad9caf20d5dce19ed60a8307c538aa99a49..8db8096e5ed4c06af9380ea32802626a224cb65f 100644 (file)
@@ -2,6 +2,7 @@
 #include <linux/kernel.h>
 
 #include "bcachefs.h"
+#include "disk_groups.h"
 #include "opts.h"
 #include "super-io.h"
 #include "util.h"
index d28f1333e69365b60fd028a95d1b0157f70f97dc..bb03d83a53e4901561979df644309d1694dcdc88 100644 (file)
@@ -4,7 +4,22 @@
 #include "quota.h"
 #include "super-io.h"
 
-static const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k)
+static const char *bch2_sb_validate_quota(struct bch_sb *sb,
+                                         struct bch_sb_field *f)
+{
+       struct bch_sb_field_quota *q = field_to_type(f, quota);
+
+       if (vstruct_bytes(&q->field) != sizeof(*q))
+               return "invalid field quota: wrong size";
+
+       return NULL;
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_quota = {
+       .validate       = bch2_sb_validate_quota,
+};
+
+const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
        struct bkey_s_c_quota dq;
 
@@ -30,8 +45,8 @@ static const char * const bch2_quota_counters[] = {
        "inodes",
 };
 
-static void bch2_quota_to_text(struct bch_fs *c, char *buf,
-                              size_t size, struct bkey_s_c k)
+void bch2_quota_to_text(struct bch_fs *c, char *buf,
+                       size_t size, struct bkey_s_c k)
 {
        char *out = buf, *end = buf + size;
        struct bkey_s_c_quota dq;
@@ -50,11 +65,6 @@ static void bch2_quota_to_text(struct bch_fs *c, char *buf,
        }
 }
 
-const struct bkey_ops bch2_bkey_quota_ops = {
-       .key_invalid    = bch2_quota_invalid,
-       .val_to_text    = bch2_quota_to_text,
-};
-
 #ifdef CONFIG_BCACHEFS_QUOTA
 
 #include <linux/cred.h>
@@ -399,7 +409,7 @@ static void bch2_sb_quota_read(struct bch_fs *c)
        struct bch_sb_field_quota *sb_quota;
        unsigned i, j;
 
-       sb_quota = bch2_sb_get_quota(c->disk_sb);
+       sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
        if (!sb_quota)
                return;
 
@@ -476,13 +486,13 @@ static int bch2_quota_enable(struct super_block   *sb, unsigned uflags)
 
        mutex_lock(&c->sb_lock);
        if (uflags & FS_QUOTA_UDQ_ENFD)
-               SET_BCH_SB_USRQUOTA(c->disk_sb, true);
+               SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true);
 
        if (uflags & FS_QUOTA_GDQ_ENFD)
-               SET_BCH_SB_GRPQUOTA(c->disk_sb, true);
+               SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true);
 
        if (uflags & FS_QUOTA_PDQ_ENFD)
-               SET_BCH_SB_PRJQUOTA(c->disk_sb, true);
+               SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true);
 
        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
@@ -499,13 +509,13 @@ static int bch2_quota_disable(struct super_block *sb, unsigned uflags)
 
        mutex_lock(&c->sb_lock);
        if (uflags & FS_QUOTA_UDQ_ENFD)
-               SET_BCH_SB_USRQUOTA(c->disk_sb, false);
+               SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false);
 
        if (uflags & FS_QUOTA_GDQ_ENFD)
-               SET_BCH_SB_GRPQUOTA(c->disk_sb, false);
+               SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false);
 
        if (uflags & FS_QUOTA_PDQ_ENFD)
-               SET_BCH_SB_PRJQUOTA(c->disk_sb, false);
+               SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false);
 
        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
@@ -616,9 +626,10 @@ static int bch2_quota_set_info(struct super_block *sb, int type,
        q = &c->quotas[type];
 
        mutex_lock(&c->sb_lock);
-       sb_quota = bch2_sb_get_quota(c->disk_sb);
+       sb_quota = bch2_sb_get_quota(c->disk_sb.sb);
        if (!sb_quota) {
-               sb_quota = bch2_fs_sb_resize_quota(c, sizeof(*sb_quota) / sizeof(u64));
+               sb_quota = bch2_sb_resize_quota(&c->disk_sb,
+                                       sizeof(*sb_quota) / sizeof(u64));
                if (!sb_quota)
                        return -ENOSPC;
        }
index 509b7f0e069d0f33423f72f70fa5c7b05603b0cf..0b24f22cf4fbeac6224f6859c073f85bd9225bbf 100644 (file)
@@ -1,9 +1,18 @@
 #ifndef _BCACHEFS_QUOTA_H
 #define _BCACHEFS_QUOTA_H
 
+#include "inode.h"
 #include "quota_types.h"
 
-extern const struct bkey_ops bch2_bkey_quota_ops;
+extern const struct bch_sb_field_ops bch_sb_field_ops_quota;
+
+const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_quota_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+
+#define bch2_bkey_quota_ops (struct bkey_ops) {                \
+       .key_invalid    = bch2_quota_invalid,           \
+       .val_to_text    = bch2_quota_to_text,           \
+}
 
 enum quota_acct_mode {
        BCH_QUOTA_PREALLOC,
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
new file mode 100644 (file)
index 0000000..6c52d1d
--- /dev/null
@@ -0,0 +1,698 @@
+
+#include "bcachefs.h"
+#include "replicas.h"
+#include "super-io.h"
+
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
+                                           struct bch_replicas_cpu *);
+
+/* Replicas tracking - in memory: */
+
+#define for_each_cpu_replicas_entry(_r, _i)                            \
+       for (_i = (_r)->entries;                                        \
+            (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
+            _i = (void *) (_i) + (_r)->entry_size)
+
+static inline struct bch_replicas_cpu_entry *
+cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
+{
+       return (void *) r->entries + r->entry_size * i;
+}
+
+static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
+{
+       eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
+}
+
+static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
+                                    unsigned dev)
+{
+       return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
+}
+
+static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
+                                   unsigned dev)
+{
+       e->devs[dev >> 3] |= 1 << (dev & 7);
+}
+
+static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
+{
+       return (r->entry_size -
+               offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
+}
+
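A note on the bitmap helpers above: device indices are packed eight per byte, so dev >> 3 selects the byte in devs[] and 1 << (dev & 7) the bit within it; device 11, for instance, is bit 3 of devs[1]. replicas_dev_slots() is the inverse sizing calculation: an entry whose devs[] tail is two bytes long has 2 * 8 = 16 device slots.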
+int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r,
+                             char *buf, size_t size)
+{
+       char *out = buf, *end = out + size;
+       struct bch_replicas_cpu_entry *e;
+       bool first = true;
+       unsigned i;
+
+       for_each_cpu_replicas_entry(r, e) {
+               bool first_e = true;
+
+               if (!first)
+                       out += scnprintf(out, end - out, " ");
+               first = false;
+
+               out += scnprintf(out, end - out, "%u: [", e->data_type);
+
+               for (i = 0; i < replicas_dev_slots(r); i++)
+                       if (replicas_test_dev(e, i)) {
+                               if (!first_e)
+                                       out += scnprintf(out, end - out, " ");
+                               first_e = false;
+                               out += scnprintf(out, end - out, "%u", i);
+                       }
+               out += scnprintf(out, end - out, "]");
+       }
+
+       return out - buf;
+}
+
+static inline unsigned bkey_to_replicas(struct bkey_s_c_extent e,
+                                       enum bch_data_type data_type,
+                                       struct bch_replicas_cpu_entry *r,
+                                       unsigned *max_dev)
+{
+       const struct bch_extent_ptr *ptr;
+       unsigned nr = 0;
+
+       BUG_ON(!data_type ||
+              data_type == BCH_DATA_SB ||
+              data_type >= BCH_DATA_NR);
+
+       memset(r, 0, sizeof(*r));
+       r->data_type = data_type;
+
+       *max_dev = 0;
+
+       extent_for_each_ptr(e, ptr)
+               if (!ptr->cached) {
+                       *max_dev = max_t(unsigned, *max_dev, ptr->dev);
+                       replicas_set_dev(r, ptr->dev);
+                       nr++;
+               }
+       return nr;
+}
+
+static inline void devlist_to_replicas(struct bch_devs_list devs,
+                                      enum bch_data_type data_type,
+                                      struct bch_replicas_cpu_entry *r,
+                                      unsigned *max_dev)
+{
+       unsigned i;
+
+       BUG_ON(!data_type ||
+              data_type == BCH_DATA_SB ||
+              data_type >= BCH_DATA_NR);
+
+       memset(r, 0, sizeof(*r));
+       r->data_type = data_type;
+
+       *max_dev = 0;
+
+       for (i = 0; i < devs.nr; i++) {
+               *max_dev = max_t(unsigned, *max_dev, devs.devs[i]);
+               replicas_set_dev(r, devs.devs[i]);
+       }
+}
+
+static struct bch_replicas_cpu *
+cpu_replicas_add_entry(struct bch_replicas_cpu *old,
+                      struct bch_replicas_cpu_entry new_entry,
+                      unsigned max_dev)
+{
+       struct bch_replicas_cpu *new;
+       unsigned i, nr, entry_size;
+
+       entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
+               DIV_ROUND_UP(max_dev + 1, 8);
+       entry_size = max(entry_size, old->entry_size);
+       nr = old->nr + 1;
+
+       new = kzalloc(sizeof(struct bch_replicas_cpu) +
+                     nr * entry_size, GFP_NOIO);
+       if (!new)
+               return NULL;
+
+       new->nr         = nr;
+       new->entry_size = entry_size;
+
+       for (i = 0; i < old->nr; i++)
+               memcpy(cpu_replicas_entry(new, i),
+                      cpu_replicas_entry(old, i),
+                      min(new->entry_size, old->entry_size));
+
+       memcpy(cpu_replicas_entry(new, old->nr),
+              &new_entry,
+              new->entry_size);
+
+       bch2_cpu_replicas_sort(new);
+       return new;
+}
+
+static bool replicas_has_entry(struct bch_replicas_cpu *r,
+                               struct bch_replicas_cpu_entry search,
+                               unsigned max_dev)
+{
+       return max_dev < replicas_dev_slots(r) &&
+               eytzinger0_find(r->entries, r->nr,
+                               r->entry_size,
+                               memcmp, &search) < r->nr;
+}
+
+noinline
+static int bch2_mark_replicas_slowpath(struct bch_fs *c,
+                               struct bch_replicas_cpu_entry new_entry,
+                               unsigned max_dev)
+{
+       struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL;
+       int ret = -ENOMEM;
+
+       mutex_lock(&c->sb_lock);
+
+       old_gc = rcu_dereference_protected(c->replicas_gc,
+                                          lockdep_is_held(&c->sb_lock));
+       if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) {
+               new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev);
+               if (!new_gc)
+                       goto err;
+       }
+
+       old_r = rcu_dereference_protected(c->replicas,
+                                         lockdep_is_held(&c->sb_lock));
+       if (!replicas_has_entry(old_r, new_entry, max_dev)) {
+               new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev);
+               if (!new_r)
+                       goto err;
+
+               ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
+               if (ret)
+                       goto err;
+       }
+
+       /* allocations done, now commit: */
+
+       if (new_r)
+               bch2_write_super(c);
+
+       /* don't update in-memory replicas until changes are persistent */
+
+       if (new_gc) {
+               rcu_assign_pointer(c->replicas_gc, new_gc);
+               kfree_rcu(old_gc, rcu);
+       }
+
+       if (new_r) {
+               rcu_assign_pointer(c->replicas, new_r);
+               kfree_rcu(old_r, rcu);
+       }
+
+       mutex_unlock(&c->sb_lock);
+       return 0;
+err:
+       mutex_unlock(&c->sb_lock);
+       if (new_gc)
+               kfree(new_gc);
+       if (new_r)
+               kfree(new_r);
+       return ret;
+}
+
+int bch2_mark_replicas(struct bch_fs *c,
+                      enum bch_data_type data_type,
+                      struct bch_devs_list devs)
+{
+       struct bch_replicas_cpu_entry search;
+       struct bch_replicas_cpu *r, *gc_r;
+       unsigned max_dev;
+       bool marked;
+
+       if (!devs.nr)
+               return 0;
+
+       BUG_ON(devs.nr >= BCH_REPLICAS_MAX);
+
+       devlist_to_replicas(devs, data_type, &search, &max_dev);
+
+       rcu_read_lock();
+       r = rcu_dereference(c->replicas);
+       gc_r = rcu_dereference(c->replicas_gc);
+       marked = replicas_has_entry(r, search, max_dev) &&
+               (!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev));
+       rcu_read_unlock();
+
+       return likely(marked) ? 0
+               : bch2_mark_replicas_slowpath(c, search, max_dev);
+}
+
+int bch2_mark_bkey_replicas(struct bch_fs *c,
+                           enum bch_data_type data_type,
+                           struct bkey_s_c k)
+{
+       struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+       unsigned i;
+       int ret;
+
+       for (i = 0; i < cached.nr; i++)
+               if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
+                                             bch2_dev_list_single(cached.devs[i]))))
+                       return ret;
+
+       return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k));
+}
+
+int bch2_replicas_gc_end(struct bch_fs *c, int err)
+{
+       struct bch_replicas_cpu *new_r, *old_r;
+       int ret = 0;
+
+       lockdep_assert_held(&c->replicas_gc_lock);
+
+       mutex_lock(&c->sb_lock);
+
+       new_r = rcu_dereference_protected(c->replicas_gc,
+                                         lockdep_is_held(&c->sb_lock));
+
+       if (err) {
+               rcu_assign_pointer(c->replicas_gc, NULL);
+               kfree_rcu(new_r, rcu);
+               goto err;
+       }
+
+       if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
+               ret = -ENOSPC;
+               goto err;
+       }
+
+       old_r = rcu_dereference_protected(c->replicas,
+                                         lockdep_is_held(&c->sb_lock));
+
+       rcu_assign_pointer(c->replicas, new_r);
+       rcu_assign_pointer(c->replicas_gc, NULL);
+       kfree_rcu(old_r, rcu);
+
+       bch2_write_super(c);
+err:
+       mutex_unlock(&c->sb_lock);
+       return ret;
+}
+
+int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
+{
+       struct bch_replicas_cpu *dst, *src;
+       struct bch_replicas_cpu_entry *e;
+
+       lockdep_assert_held(&c->replicas_gc_lock);
+
+       mutex_lock(&c->sb_lock);
+       BUG_ON(c->replicas_gc);
+
+       src = rcu_dereference_protected(c->replicas,
+                                       lockdep_is_held(&c->sb_lock));
+
+       dst = kzalloc(sizeof(struct bch_replicas_cpu) +
+                     src->nr * src->entry_size, GFP_NOIO);
+       if (!dst) {
+               mutex_unlock(&c->sb_lock);
+               return -ENOMEM;
+       }
+
+       dst->nr         = 0;
+       dst->entry_size = src->entry_size;
+
+       for_each_cpu_replicas_entry(src, e)
+               if (!((1 << e->data_type) & typemask))
+                       memcpy(cpu_replicas_entry(dst, dst->nr++),
+                              e, dst->entry_size);
+
+       bch2_cpu_replicas_sort(dst);
+
+       rcu_assign_pointer(c->replicas_gc, dst);
+       mutex_unlock(&c->sb_lock);
+
+       return 0;
+}
+
+/* Replicas tracking - superblock: */
+
+static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
+                                       unsigned *nr,
+                                       unsigned *bytes,
+                                       unsigned *max_dev)
+{
+       struct bch_replicas_entry *i;
+       unsigned j;
+
+       *nr     = 0;
+       *bytes  = sizeof(*r);
+       *max_dev = 0;
+
+       if (!r)
+               return;
+
+       for_each_replicas_entry(r, i) {
+               for (j = 0; j < i->nr; j++)
+                       *max_dev = max_t(unsigned, *max_dev, i->devs[j]);
+               (*nr)++;
+       }
+
+       *bytes = (void *) i - (void *) r;
+}
+
+static struct bch_replicas_cpu *
+__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
+{
+       struct bch_replicas_cpu *cpu_r;
+       unsigned i, nr, bytes, max_dev, entry_size;
+
+       bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
+
+       entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
+               DIV_ROUND_UP(max_dev + 1, 8);
+
+       cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) +
+                       nr * entry_size, GFP_NOIO);
+       if (!cpu_r)
+               return NULL;
+
+       cpu_r->nr               = nr;
+       cpu_r->entry_size       = entry_size;
+
+       if (nr) {
+               struct bch_replicas_cpu_entry *dst =
+                       cpu_replicas_entry(cpu_r, 0);
+               struct bch_replicas_entry *src = sb_r->entries;
+
+               while (dst < cpu_replicas_entry(cpu_r, nr)) {
+                       dst->data_type = src->data_type;
+                       for (i = 0; i < src->nr; i++)
+                               replicas_set_dev(dst, src->devs[i]);
+
+                       src     = replicas_entry_next(src);
+                       dst     = (void *) dst + entry_size;
+               }
+       }
+
+       bch2_cpu_replicas_sort(cpu_r);
+       return cpu_r;
+}
+
+int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
+{
+       struct bch_sb_field_replicas *sb_r;
+       struct bch_replicas_cpu *cpu_r, *old_r;
+
+       sb_r    = bch2_sb_get_replicas(c->disk_sb.sb);
+       cpu_r   = __bch2_sb_replicas_to_cpu_replicas(sb_r);
+       if (!cpu_r)
+               return -ENOMEM;
+
+       old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
+       rcu_assign_pointer(c->replicas, cpu_r);
+       if (old_r)
+               kfree_rcu(old_r, rcu);
+
+       return 0;
+}
+
+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
+                                           struct bch_replicas_cpu *r)
+{
+       struct bch_sb_field_replicas *sb_r;
+       struct bch_replicas_entry *sb_e;
+       struct bch_replicas_cpu_entry *e;
+       size_t i, bytes;
+
+       bytes = sizeof(struct bch_sb_field_replicas);
+
+       for_each_cpu_replicas_entry(r, e) {
+               bytes += sizeof(struct bch_replicas_entry);
+               for (i = 0; i < r->entry_size - 1; i++)
+                       bytes += hweight8(e->devs[i]);
+       }
+
+       sb_r = bch2_sb_resize_replicas(&c->disk_sb,
+                       DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
+       if (!sb_r)
+               return -ENOSPC;
+
+       memset(&sb_r->entries, 0,
+              vstruct_end(&sb_r->field) -
+              (void *) &sb_r->entries);
+
+       sb_e = sb_r->entries;
+       for_each_cpu_replicas_entry(r, e) {
+               sb_e->data_type = e->data_type;
+
+               for (i = 0; i < replicas_dev_slots(r); i++)
+                       if (replicas_test_dev(e, i))
+                               sb_e->devs[sb_e->nr++] = i;
+
+               sb_e = replicas_entry_next(sb_e);
+
+               BUG_ON((void *) sb_e > vstruct_end(&sb_r->field));
+       }
+
+       return 0;
+}
+
+static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f)
+{
+       struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
+       struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
+       struct bch_replicas_cpu *cpu_r = NULL;
+       struct bch_replicas_entry *e;
+       const char *err;
+       unsigned i;
+
+       for_each_replicas_entry(sb_r, e) {
+               err = "invalid replicas entry: invalid data type";
+               if (e->data_type >= BCH_DATA_NR)
+                       goto err;
+
+               err = "invalid replicas entry: no devices";
+               if (!e->nr)
+                       goto err;
+
+               err = "invalid replicas entry: too many devices";
+               if (e->nr >= BCH_REPLICAS_MAX)
+                       goto err;
+
+               err = "invalid replicas entry: invalid device";
+               for (i = 0; i < e->nr; i++)
+                       if (!bch2_dev_exists(sb, mi, e->devs[i]))
+                               goto err;
+       }
+
+       err = "cannot allocate memory";
+       cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
+       if (!cpu_r)
+               goto err;
+
+       sort_cmp_size(cpu_r->entries,
+                     cpu_r->nr,
+                     cpu_r->entry_size,
+                     memcmp, NULL);
+
+       for (i = 0; i + 1 < cpu_r->nr; i++) {
+               struct bch_replicas_cpu_entry *l =
+                       cpu_replicas_entry(cpu_r, i);
+               struct bch_replicas_cpu_entry *r =
+                       cpu_replicas_entry(cpu_r, i + 1);
+
+               BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
+
+               err = "duplicate replicas entry";
+               if (!memcmp(l, r, cpu_r->entry_size))
+                       goto err;
+       }
+
+       err = NULL;
+err:
+       kfree(cpu_r);
+       return err;
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_replicas = {
+       .validate       = bch2_sb_validate_replicas,
+};
+
+int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t size)
+{
+       char *out = buf, *end = out + size;
+       struct bch_replicas_entry *e;
+       bool first = true;
+       unsigned i;
+
+       if (!r) {
+               out += scnprintf(out, end - out, "(no replicas section found)");
+               return out - buf;
+       }
+
+       for_each_replicas_entry(r, e) {
+               if (!first)
+                       out += scnprintf(out, end - out, " ");
+               first = false;
+
+               out += scnprintf(out, end - out, "%u: [", e->data_type);
+
+               for (i = 0; i < e->nr; i++)
+                       out += scnprintf(out, end - out,
+                                        i ? " %u" : "%u", e->devs[i]);
+               out += scnprintf(out, end - out, "]");
+       }
+
+       return out - buf;
+}
+
+/* Query replicas: */
+
+bool bch2_replicas_marked(struct bch_fs *c,
+                         enum bch_data_type data_type,
+                         struct bch_devs_list devs)
+{
+       struct bch_replicas_cpu_entry search;
+       unsigned max_dev;
+       bool ret;
+
+       if (!devs.nr)
+               return true;
+
+       devlist_to_replicas(devs, data_type, &search, &max_dev);
+
+       rcu_read_lock();
+       ret = replicas_has_entry(rcu_dereference(c->replicas),
+                                search, max_dev);
+       rcu_read_unlock();
+
+       return ret;
+}
+
+bool bch2_bkey_replicas_marked(struct bch_fs *c,
+                              enum bch_data_type data_type,
+                              struct bkey_s_c k)
+{
+       struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+       unsigned i;
+
+       for (i = 0; i < cached.nr; i++)
+               if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
+                                         bch2_dev_list_single(cached.devs[i])))
+                       return false;
+
+       return bch2_replicas_marked(c, data_type, bch2_bkey_dirty_devs(k));
+}
+
+struct replicas_status __bch2_replicas_status(struct bch_fs *c,
+                                             struct bch_devs_mask online_devs)
+{
+       struct bch_sb_field_members *mi;
+       struct bch_replicas_cpu_entry *e;
+       struct bch_replicas_cpu *r;
+       unsigned i, dev, dev_slots, nr_online, nr_offline;
+       struct replicas_status ret;
+
+       memset(&ret, 0, sizeof(ret));
+
+       for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
+               ret.replicas[i].nr_online = UINT_MAX;
+
+       mi = bch2_sb_get_members(c->disk_sb.sb);
+       rcu_read_lock();
+
+       r = rcu_dereference(c->replicas);
+       dev_slots = replicas_dev_slots(r);
+
+       for_each_cpu_replicas_entry(r, e) {
+               if (e->data_type >= ARRAY_SIZE(ret.replicas))
+                       panic("e %p data_type %u\n", e, e->data_type);
+
+               nr_online = nr_offline = 0;
+
+               for (dev = 0; dev < dev_slots; dev++) {
+                       if (!replicas_test_dev(e, dev))
+                               continue;
+
+                       BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, dev));
+
+                       if (test_bit(dev, online_devs.d))
+                               nr_online++;
+                       else
+                               nr_offline++;
+               }
+
+               ret.replicas[e->data_type].nr_online =
+                       min(ret.replicas[e->data_type].nr_online,
+                           nr_online);
+
+               ret.replicas[e->data_type].nr_offline =
+                       max(ret.replicas[e->data_type].nr_offline,
+                           nr_offline);
+       }
+
+       rcu_read_unlock();
+
+       return ret;
+}
+
+struct replicas_status bch2_replicas_status(struct bch_fs *c)
+{
+       return __bch2_replicas_status(c, bch2_online_devs(c));
+}
+
+static bool have_enough_devs(struct replicas_status s,
+                            enum bch_data_type type,
+                            bool force_if_degraded,
+                            bool force_if_lost)
+{
+       return (!s.replicas[type].nr_offline || force_if_degraded) &&
+               (s.replicas[type].nr_online || force_if_lost);
+}
+
+bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
+{
+       return (have_enough_devs(s, BCH_DATA_JOURNAL,
+                                flags & BCH_FORCE_IF_METADATA_DEGRADED,
+                                flags & BCH_FORCE_IF_METADATA_LOST) &&
+               have_enough_devs(s, BCH_DATA_BTREE,
+                                flags & BCH_FORCE_IF_METADATA_DEGRADED,
+                                flags & BCH_FORCE_IF_METADATA_LOST) &&
+               have_enough_devs(s, BCH_DATA_USER,
+                                flags & BCH_FORCE_IF_DATA_DEGRADED,
+                                flags & BCH_FORCE_IF_DATA_LOST));
+}
+
+unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
+{
+       struct replicas_status s = bch2_replicas_status(c);
+
+       return meta
+               ? min(s.replicas[BCH_DATA_JOURNAL].nr_online,
+                     s.replicas[BCH_DATA_BTREE].nr_online)
+               : s.replicas[BCH_DATA_USER].nr_online;
+}
+
+unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
+{
+       struct bch_replicas_cpu_entry *e;
+       struct bch_replicas_cpu *r;
+       unsigned ret = 0;
+
+       rcu_read_lock();
+       r = rcu_dereference(c->replicas);
+
+       if (ca->dev_idx >= replicas_dev_slots(r))
+               goto out;
+
+       for_each_cpu_replicas_entry(r, e)
+               if (replicas_test_dev(e, ca->dev_idx))
+                       ret |= 1 << e->data_type;
+out:
+       rcu_read_unlock();
+
+       return ret;
+}
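
bch2_mark_replicas() above follows the classic RCU read-mostly pattern: the fast path checks the current table under rcu_read_lock(), and the slowpath builds an enlarged copy under sb_lock, persists it to the superblock, then publishes it with rcu_assign_pointer() and reclaims the old copy with kfree_rcu(). A reduced userspace sketch of the publish step, using C11 atomics to model the pointer hand-off (and simply leaking the old table where a grace period would reclaim it):

	#include <stdatomic.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	struct table {
		unsigned nr;
		int entries[];
	};

	static _Atomic(struct table *) current_table;

	/* Reader fast path: one acquire load, no locks (rcu_dereference()
	 * analogue; pairs with the release store in table_add()). */
	static int table_has(int v)
	{
		struct table *t = atomic_load_explicit(&current_table,
						       memory_order_acquire);

		for (unsigned i = 0; t && i < t->nr; i++)
			if (t->entries[i] == v)
				return 1;
		return 0;
	}

	/* Slowpath: copy, extend, publish.  The real code takes sb_lock
	 * here and writes the superblock before the rcu_assign_pointer(). */
	static int table_add(int v)
	{
		struct table *old = atomic_load_explicit(&current_table,
							 memory_order_acquire);
		unsigned nr = old ? old->nr : 0;
		struct table *new = malloc(sizeof(*new) + (nr + 1) * sizeof(int));

		if (!new)
			return -1;
		if (old)
			memcpy(new->entries, old->entries, nr * sizeof(int));
		new->entries[nr] = v;
		new->nr = nr + 1;
		atomic_store_explicit(&current_table, new, memory_order_release);
		/* old is leaked here; kfree_rcu() would free it after a grace period */
		return 0;
	}

	int main(void)
	{
		table_add(3);
		printf("%d %d\n", table_has(3), table_has(4));
		return 0;
	}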
diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h
new file mode 100644 (file)
index 0000000..49f114b
--- /dev/null
@@ -0,0 +1,51 @@
+#ifndef _BCACHEFS_REPLICAS_H
+#define _BCACHEFS_REPLICAS_H
+
+bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type,
+                         struct bch_devs_list);
+bool bch2_bkey_replicas_marked(struct bch_fs *, enum bch_data_type,
+                              struct bkey_s_c);
+int bch2_mark_replicas(struct bch_fs *, enum bch_data_type,
+                      struct bch_devs_list);
+int bch2_mark_bkey_replicas(struct bch_fs *, enum bch_data_type,
+                           struct bkey_s_c);
+
+int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *, char *, size_t);
+int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *, char *, size_t);
+
+struct replicas_status {
+       struct {
+               unsigned        nr_online;
+               unsigned        nr_offline;
+       }                       replicas[BCH_DATA_NR];
+};
+
+struct replicas_status __bch2_replicas_status(struct bch_fs *,
+                                             struct bch_devs_mask);
+struct replicas_status bch2_replicas_status(struct bch_fs *);
+bool bch2_have_enough_devs(struct replicas_status, unsigned);
+
+unsigned bch2_replicas_online(struct bch_fs *, bool);
+unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
+
+int bch2_replicas_gc_end(struct bch_fs *, int);
+int bch2_replicas_gc_start(struct bch_fs *, unsigned);
+
+/* iterate over superblock replicas - used by userspace tools: */
+
+static inline struct bch_replicas_entry *
+replicas_entry_next(struct bch_replicas_entry *i)
+{
+       return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
+}
+
+#define for_each_replicas_entry(_r, _i)                                        \
+       for (_i = (_r)->entries;                                        \
+            (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
+            (_i) = replicas_entry_next(_i))
+
+int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_replicas;
+
+#endif /* _BCACHEFS_REPLICAS_H */
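
replicas_entry_next() above works because bch_replicas_entry is a variable-length record: the next entry begins i->nr bytes past the devs[] flexible array. A standalone sketch of walking such a packed list, with simplified stand-in types (the real iterator also bounds-checks against vstruct_end()):

	#include <stddef.h>
	#include <stdio.h>

	/* Simplified stand-in for struct bch_replicas_entry: */
	struct entry {
		unsigned char data_type;
		unsigned char nr;
		unsigned char devs[];	/* nr device indices follow inline */
	};

	static struct entry *entry_next(struct entry *i)
	{
		return (struct entry *)
			((unsigned char *) i + offsetof(struct entry, devs) + i->nr);
	}

	int main(void)
	{
		/* two packed entries: type 1 on devs {0,2}, type 2 on dev {1} */
		unsigned char buf[] = { 1, 2, 0, 2,  2, 1, 1,  0 /* terminator */ };
		struct entry *e;

		for (e = (struct entry *) buf; e->data_type; e = entry_next(e)) {
			printf("type %u:", e->data_type);
			for (unsigned i = 0; i < e->nr; i++)
				printf(" %u", e->devs[i]);
			printf("\n");
		}
		return 0;
	}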
index 69101f3a68a5e8b1fd2574118c789498e3938d51..a2b981a3c9c50db224a59326595bba7d453b0fdb 100644 (file)
@@ -1,8 +1,11 @@
 
 #include "bcachefs.h"
 #include "checksum.h"
+#include "disk_groups.h"
 #include "error.h"
 #include "io.h"
+#include "replicas.h"
+#include "quota.h"
 #include "super-io.h"
 #include "super.h"
 #include "vstructs.h"
 #include <linux/backing-dev.h>
 #include <linux/sort.h>
 
-static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
-static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
-                                           struct bch_replicas_cpu *);
-static int bch2_sb_disk_groups_to_cpu(struct bch_fs *);
-
-/* superblock fields (optional/variable size sections: */
-
 const char * const bch2_sb_fields[] = {
 #define x(name, nr)    #name,
        BCH_SB_FIELDS()
@@ -24,34 +20,8 @@ const char * const bch2_sb_fields[] = {
        NULL
 };
 
-#define x(f, nr)                                       \
-static const char *bch2_sb_validate_##f(struct bch_sb *, struct bch_sb_field *);
-       BCH_SB_FIELDS()
-#undef x
-
-struct bch_sb_field_ops {
-       const char *    (*validate)(struct bch_sb *, struct bch_sb_field *);
-};
-
-static const struct bch_sb_field_ops bch2_sb_field_ops[] = {
-#define x(f, nr)                                       \
-       [BCH_SB_FIELD_##f] = {                          \
-               .validate = bch2_sb_validate_##f,       \
-       },
-       BCH_SB_FIELDS()
-#undef x
-};
-
-static const char *bch2_sb_field_validate(struct bch_sb *sb,
-                                         struct bch_sb_field *f)
-
-{
-       unsigned type = le32_to_cpu(f->type);
-
-       return type < BCH_SB_FIELD_NR
-               ? bch2_sb_field_ops[type].validate(sb, f)
-               : NULL;
-}
+static const char *bch2_sb_field_validate(struct bch_sb *,
+                                         struct bch_sb_field *);
 
 struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb,
                                      enum bch_sb_field_type type)
@@ -66,14 +36,18 @@ struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb,
        return NULL;
 }
 
-static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb *sb,
-                                                 struct bch_sb_field *f,
-                                                 unsigned u64s)
+static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb,
+                                                  struct bch_sb_field *f,
+                                                  unsigned u64s)
 {
        unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+       unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s;
+
+       BUG_ON(get_order(__vstruct_bytes(struct bch_sb, sb_u64s)) >
+              sb->page_order);
 
        if (!f) {
-               f = vstruct_last(sb);
+               f = vstruct_last(sb->sb);
                memset(f, 0, sizeof(u64) * u64s);
                f->u64s = cpu_to_le32(u64s);
                f->type = 0;
@@ -84,13 +58,13 @@ static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb *sb,
                f->u64s = cpu_to_le32(u64s);
                dst = vstruct_end(f);
 
-               memmove(dst, src, vstruct_end(sb) - src);
+               memmove(dst, src, vstruct_end(sb->sb) - src);
 
                if (dst > src)
                        memset(src, 0, dst - src);
        }
 
-       le32_add_cpu(&sb->u64s, u64s - old_u64s);
+       sb->sb->u64s = cpu_to_le32(sb_u64s);
 
        return f;
 }
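
__bch2_sb_field_resize() above now takes the superblock handle so it can assert that the grown superblock still fits the allocated pages; the resize itself shifts everything after the field with one memmove() and zeroes the gap when the field grows. A standalone model of resizing a record in the middle of a packed buffer:

	#include <stdio.h>
	#include <string.h>

	/* Resize the record at `off` from old_len to new_len by shifting the
	 * buffer tail, as __bch2_sb_field_resize() does; returns the new
	 * used size.  Caller guarantees the buffer is big enough. */
	static unsigned resize_record(unsigned char *buf, unsigned used,
				      unsigned off, unsigned old_len,
				      unsigned new_len)
	{
		unsigned char *src = buf + off + old_len;
		unsigned char *dst = buf + off + new_len;

		memmove(dst, src, used - (off + old_len));
		if (dst > src)		/* grew: zero the freshly exposed bytes */
			memset(src, 0, dst - src);
		return used + new_len - old_len;
	}

	int main(void)
	{
		unsigned char buf[16] = "AABBBCC";
		unsigned used = 7;

		used = resize_record(buf, used, 2, 3, 1);	/* shrink "BBB" to "B" */
		printf("%.*s (used %u)\n", (int) used, buf, used);	/* AABCC (used 5) */
		return 0;
	}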
@@ -108,26 +82,42 @@ void bch2_free_super(struct bch_sb_handle *sb)
        memset(sb, 0, sizeof(*sb));
 }
 
-static int __bch2_super_realloc(struct bch_sb_handle *sb, unsigned order)
+int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
 {
+       size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s);
+       unsigned order = get_order(new_bytes);
        struct bch_sb *new_sb;
        struct bio *bio;
 
+       if (sb->have_layout) {
+               u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
+
+               if (new_bytes > max_bytes) {
+                       char buf[BDEVNAME_SIZE];
+
+                       pr_err("%s: superblock too big: want %zu but have %llu",
+                              bdevname(sb->bdev, buf), new_bytes, max_bytes);
+                       return -ENOSPC;
+               }
+       }
+
        if (sb->page_order >= order && sb->sb)
                return 0;
 
        if (dynamic_fault("bcachefs:add:super_realloc"))
                return -ENOMEM;
 
-       bio = bio_kmalloc(GFP_KERNEL, 1 << order);
-       if (!bio)
-               return -ENOMEM;
+       if (sb->have_bio) {
+               bio = bio_kmalloc(GFP_KERNEL, 1 << order);
+               if (!bio)
+                       return -ENOMEM;
 
-       if (sb->bio)
-               bio_put(sb->bio);
-       sb->bio = bio;
+               if (sb->bio)
+                       bio_put(sb->bio);
+               sb->bio = bio;
+       }
 
-       new_sb = (void *) __get_free_pages(GFP_KERNEL, order);
+       new_sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
        if (!new_sb)
                return -ENOMEM;
 
@@ -142,45 +132,6 @@ static int __bch2_super_realloc(struct bch_sb_handle *sb, unsigned order)
        return 0;
 }
 
-static int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
-{
-       u64 new_bytes = __vstruct_bytes(struct bch_sb, u64s);
-       u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
-
-       if (new_bytes > max_bytes) {
-               char buf[BDEVNAME_SIZE];
-
-               pr_err("%s: superblock too big: want %llu but have %llu",
-                      bdevname(sb->bdev, buf), new_bytes, max_bytes);
-               return -ENOSPC;
-       }
-
-       return __bch2_super_realloc(sb, get_order(new_bytes));
-}
-
-static int bch2_fs_sb_realloc(struct bch_fs *c, unsigned u64s)
-{
-       u64 bytes = __vstruct_bytes(struct bch_sb, u64s);
-       struct bch_sb *sb;
-       unsigned order = get_order(bytes);
-
-       if (c->disk_sb && order <= c->disk_sb_order)
-               return 0;
-
-       sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
-       if (!sb)
-               return -ENOMEM;
-
-       if (c->disk_sb)
-               memcpy(sb, c->disk_sb, PAGE_SIZE << c->disk_sb_order);
-
-       free_pages((unsigned long) c->disk_sb, c->disk_sb_order);
-
-       c->disk_sb = sb;
-       c->disk_sb_order = order;
-       return 0;
-}
-
 struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
                                          enum bch_sb_field_type type,
                                          unsigned u64s)
@@ -192,38 +143,26 @@ struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
        if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
                return NULL;
 
-       f = __bch2_sb_field_resize(sb->sb, f, u64s);
-       f->type = cpu_to_le32(type);
-       return f;
-}
-
-struct bch_sb_field *bch2_fs_sb_field_resize(struct bch_fs *c,
-                                           enum bch_sb_field_type type,
-                                           unsigned u64s)
-{
-       struct bch_sb_field *f = bch2_sb_field_get(c->disk_sb, type);
-       ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
-       ssize_t d = -old_u64s + u64s;
-       struct bch_dev *ca;
-       unsigned i;
-
-       lockdep_assert_held(&c->sb_lock);
+       if (sb->fs_sb) {
+               struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb);
+               struct bch_dev *ca;
+               unsigned i;
 
-       if (bch2_fs_sb_realloc(c, le32_to_cpu(c->disk_sb->u64s) + d))
-               return NULL;
+               lockdep_assert_held(&c->sb_lock);
 
-       /* XXX: we're not checking that offline devices have enough space */
+               /* XXX: we're not checking that offline devices have enough space */
 
-       for_each_online_member(ca, c, i) {
-               struct bch_sb_handle *sb = &ca->disk_sb;
+               for_each_online_member(ca, c, i) {
+                       struct bch_sb_handle *sb = &ca->disk_sb;
 
-               if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
-                       percpu_ref_put(&ca->ref);
-                       return NULL;
+                       if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
+                               percpu_ref_put(&ca->ref);
+                               return NULL;
+                       }
                }
        }
 
-       f = __bch2_sb_field_resize(c->disk_sb, f, u64s);
+       f = __bch2_sb_field_resize(sb, f, u64s);
        f->type = cpu_to_le32(type);
        return f;
 }
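
The new fs_sb flag is what lets one function serve both callers: when the handle is the filesystem's own, container_of() recovers the owning struct bch_fs from the embedded member. The pointer math, with toy types standing in for the real ones:

    #include <assert.h>
    #include <stddef.h>

    struct handle { int fs_sb; };
    struct fs     { int id; struct handle disk_sb; };

    int main(void)
    {
            struct fs c = { .id = 42, .disk_sb = { .fs_sb = 1 } };
            struct handle *sb = &c.disk_sb;

            /* container_of(): subtract the member's offset from its address */
            struct fs *back = (struct fs *) ((char *) sb - offsetof(struct fs, disk_sb));

            assert(back->id == 42);
            return 0;
    }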
@@ -384,7 +323,7 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
 
 static void bch2_sb_update(struct bch_fs *c)
 {
-       struct bch_sb *src = c->disk_sb;
+       struct bch_sb *src = c->disk_sb.sb;
        struct bch_sb_field_members *mi = bch2_sb_get_members(src);
        struct bch_dev *ca;
        unsigned i;
@@ -407,9 +346,10 @@ static void bch2_sb_update(struct bch_fs *c)
 }
 
 /* doesn't copy member info */
-static void __copy_super(struct bch_sb *dst, struct bch_sb *src)
+static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
 {
        struct bch_sb_field *src_f, *dst_f;
+       struct bch_sb *dst = dst_handle->sb;
 
        dst->version            = src->version;
        dst->seq                = src->seq;
@@ -433,8 +373,8 @@ static void __copy_super(struct bch_sb *dst, struct bch_sb *src)
                        continue;
 
                dst_f = bch2_sb_field_get(dst, le32_to_cpu(src_f->type));
-               dst_f = __bch2_sb_field_resize(dst, dst_f,
-                               le32_to_cpu(src_f->u64s));
+               dst_f = __bch2_sb_field_resize(dst_handle, dst_f,
+                                              le32_to_cpu(src_f->u64s));
 
                memcpy(dst_f, src_f, vstruct_bytes(src_f));
        }
@@ -451,11 +391,12 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
 
        lockdep_assert_held(&c->sb_lock);
 
-       ret = bch2_fs_sb_realloc(c, le32_to_cpu(src->u64s) - journal_u64s);
+       ret = bch2_sb_realloc(&c->disk_sb,
+                             le32_to_cpu(src->u64s) - journal_u64s);
        if (ret)
                return ret;
 
-       __copy_super(c->disk_sb, src);
+       __copy_super(&c->disk_sb, src);
 
        ret = bch2_sb_replicas_to_cpu_replicas(c);
        if (ret)
@@ -471,7 +412,7 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
 
 int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
 {
-       struct bch_sb *src = c->disk_sb, *dst = ca->disk_sb.sb;
+       struct bch_sb *src = c->disk_sb.sb, *dst = ca->disk_sb.sb;
        struct bch_sb_field_journal *journal_buckets =
                bch2_sb_get_journal(dst);
        unsigned journal_u64s = journal_buckets
@@ -484,7 +425,7 @@ int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
        if (ret)
                return ret;
 
-       __copy_super(dst, src);
+       __copy_super(&ca->disk_sb, src);
        return 0;
 }
 
@@ -494,7 +435,6 @@ static const char *read_one_super(struct bch_sb_handle *sb, u64 offset)
 {
        struct bch_csum csum;
        size_t bytes;
-       unsigned order;
 reread:
        bio_reset(sb->bio);
        bio_set_dev(sb->bio, sb->bdev);
@@ -518,9 +458,8 @@ reread:
        if (bytes > 512 << sb->sb->layout.sb_max_size_bits)
                return "Bad superblock: too big";
 
-       order = get_order(bytes);
-       if (order > sb->page_order) {
-               if (__bch2_super_realloc(sb, order))
+       if (get_order(bytes) > sb->page_order) {
+               if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s)))
                        return "cannot allocate memory";
                goto reread;
        }
@@ -550,7 +489,8 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
        pr_verbose_init(*opts, "");
 
        memset(sb, 0, sizeof(*sb));
-       sb->mode = FMODE_READ;
+       sb->mode        = FMODE_READ;
+       sb->have_bio    = true;
 
        if (!opt_get(*opts, noexcl))
                sb->mode |= FMODE_EXCL;
@@ -575,7 +515,7 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
        }
 
        err = "cannot allocate memory";
-       ret = __bch2_super_realloc(sb, 0);
+       ret = bch2_sb_realloc(sb, 0);
        if (ret)
                goto err;
 
@@ -644,6 +584,7 @@ got_super:
                bdev_get_queue(sb->bdev)->backing_dev_info->capabilities
                        |= BDI_CAP_STABLE_WRITES;
        ret = 0;
+       sb->have_layout = true;
 out:
        pr_verbose_init(*opts, "ret %i", ret);
        return ret;
@@ -711,7 +652,7 @@ void bch2_write_super(struct bch_fs *c)
        closure_init_stack(cl);
        memset(&sb_written, 0, sizeof(sb_written));
 
-       le64_add_cpu(&c->disk_sb->seq, 1);
+       le64_add_cpu(&c->disk_sb.sb->seq, 1);
 
        for_each_online_member(ca, c, i)
                bch2_sb_from_fs(c, ca);
@@ -837,6 +778,10 @@ err:
        return err;
 }
 
+static const struct bch_sb_field_ops bch_sb_field_ops_journal = {
+       .validate       = bch2_sb_validate_journal,
+};
+
 /* BCH_SB_FIELD_members: */
 
 static const char *bch2_sb_validate_members(struct bch_sb *sb,
@@ -880,6 +825,10 @@ static const char *bch2_sb_validate_members(struct bch_sb *sb,
        return NULL;
 }
 
+static const struct bch_sb_field_ops bch_sb_field_ops_members = {
+       .validate       = bch2_sb_validate_members,
+};
+
 /* BCH_SB_FIELD_crypt: */
 
 static const char *bch2_sb_validate_crypt(struct bch_sb *sb,
@@ -896,980 +845,42 @@ static const char *bch2_sb_validate_crypt(struct bch_sb *sb,
        return NULL;
 }
 
-/* BCH_SB_FIELD_replicas: */
-
-/* Replicas tracking - in memory: */
-
-#define for_each_cpu_replicas_entry(_r, _i)                            \
-       for (_i = (_r)->entries;                                        \
-            (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
-            _i = (void *) (_i) + (_r)->entry_size)
-
-static inline struct bch_replicas_cpu_entry *
-cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
-{
-       return (void *) r->entries + r->entry_size * i;
-}
-
-static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
-{
-       eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
-}
-
-static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
-                                    unsigned dev)
-{
-       return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
-}
-
-static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
-                                   unsigned dev)
-{
-       e->devs[dev >> 3] |= 1 << (dev & 7);
-}
-
-static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
-{
-       return (r->entry_size -
-               offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
-}
-
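
The helpers above pack one bit per device: dev >> 3 selects the byte, dev & 7 the bit within it, so entry_size bytes of devs[] cover entry_size * 8 device slots. A self-contained sketch of the same math:

    #include <assert.h>
    #include <string.h>

    static unsigned char devs[16];          /* 16 bytes -> 128 device slots */

    static void set_dev(unsigned dev)  { devs[dev >> 3] |= 1 << (dev & 7); }
    static int  test_dev(unsigned dev) { return (devs[dev >> 3] >> (dev & 7)) & 1; }

    int main(void)
    {
            memset(devs, 0, sizeof(devs));
            set_dev(0);
            set_dev(9);
            set_dev(127);
            assert(test_dev(9) && !test_dev(10) && test_dev(127));
            return 0;
    }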
-int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *r,
-                             char *buf, size_t size)
-{
-       char *out = buf, *end = out + size;
-       struct bch_replicas_cpu_entry *e;
-       bool first = true;
-       unsigned i;
-
-       for_each_cpu_replicas_entry(r, e) {
-               bool first_e = true;
-
-               if (!first)
-                       out += scnprintf(out, end - out, " ");
-               first = false;
-
-               out += scnprintf(out, end - out, "%u: [", e->data_type);
-
-               for (i = 0; i < replicas_dev_slots(r); i++)
-                       if (replicas_test_dev(e, i)) {
-                               if (!first_e)
-                                       out += scnprintf(out, end - out, " ");
-                               first_e = false;
-                               out += scnprintf(out, end - out, "%u", i);
-                       }
-               out += scnprintf(out, end - out, "]");
-       }
-
-       return out - buf;
-}
-
-static inline unsigned bkey_to_replicas(struct bkey_s_c_extent e,
-                                       enum bch_data_type data_type,
-                                       struct bch_replicas_cpu_entry *r,
-                                       unsigned *max_dev)
-{
-       const struct bch_extent_ptr *ptr;
-       unsigned nr = 0;
-
-       BUG_ON(!data_type ||
-              data_type == BCH_DATA_SB ||
-              data_type >= BCH_DATA_NR);
-
-       memset(r, 0, sizeof(*r));
-       r->data_type = data_type;
-
-       *max_dev = 0;
-
-       extent_for_each_ptr(e, ptr)
-               if (!ptr->cached) {
-                       *max_dev = max_t(unsigned, *max_dev, ptr->dev);
-                       replicas_set_dev(r, ptr->dev);
-                       nr++;
-               }
-       return nr;
-}
-
-static inline void devlist_to_replicas(struct bch_devs_list devs,
-                                      enum bch_data_type data_type,
-                                      struct bch_replicas_cpu_entry *r,
-                                      unsigned *max_dev)
-{
-       unsigned i;
-
-       BUG_ON(!data_type ||
-              data_type == BCH_DATA_SB ||
-              data_type >= BCH_DATA_NR);
-
-       memset(r, 0, sizeof(*r));
-       r->data_type = data_type;
-
-       *max_dev = 0;
-
-       for (i = 0; i < devs.nr; i++) {
-               *max_dev = max_t(unsigned, *max_dev, devs.devs[i]);
-               replicas_set_dev(r, devs.devs[i]);
-       }
-}
-
-static struct bch_replicas_cpu *
-cpu_replicas_add_entry(struct bch_replicas_cpu *old,
-                      struct bch_replicas_cpu_entry new_entry,
-                      unsigned max_dev)
-{
-       struct bch_replicas_cpu *new;
-       unsigned i, nr, entry_size;
-
-       entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
-               DIV_ROUND_UP(max_dev + 1, 8);
-       entry_size = max(entry_size, old->entry_size);
-       nr = old->nr + 1;
-
-       new = kzalloc(sizeof(struct bch_replicas_cpu) +
-                     nr * entry_size, GFP_NOIO);
-       if (!new)
-               return NULL;
-
-       new->nr         = nr;
-       new->entry_size = entry_size;
-
-       for (i = 0; i < old->nr; i++)
-               memcpy(cpu_replicas_entry(new, i),
-                      cpu_replicas_entry(old, i),
-                      min(new->entry_size, old->entry_size));
-
-       memcpy(cpu_replicas_entry(new, old->nr),
-              &new_entry,
-              new->entry_size);
-
-       bch2_cpu_replicas_sort(new);
-       return new;
-}
-
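
entry_size above is sized from the highest device index the entry must represent: the fixed header plus DIV_ROUND_UP(max_dev + 1, 8) bitmap bytes. Worked through for a hypothetical max_dev = 12, with a cut-down entry type:

    #include <assert.h>
    #include <stddef.h>

    #define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

    struct cpu_entry {
            unsigned char   data_type;
            unsigned char   devs[];         /* bitmap, one bit per device */
    };

    int main(void)
    {
            unsigned max_dev = 12;          /* hypothetical highest device index */
            size_t entry_size = offsetof(struct cpu_entry, devs) +
                                DIV_ROUND_UP(max_dev + 1, 8);

            assert(entry_size == 1 + 2);    /* one header byte + two bitmap bytes */
            return 0;
    }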
-static bool replicas_has_entry(struct bch_replicas_cpu *r,
-                               struct bch_replicas_cpu_entry search,
-                               unsigned max_dev)
-{
-       return max_dev < replicas_dev_slots(r) &&
-               eytzinger0_find(r->entries, r->nr,
-                               r->entry_size,
-                               memcmp, &search) < r->nr;
-}
-
-noinline
-static int bch2_mark_replicas_slowpath(struct bch_fs *c,
-                               struct bch_replicas_cpu_entry new_entry,
-                               unsigned max_dev)
-{
-       struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL;
-       int ret = -ENOMEM;
-
-       mutex_lock(&c->sb_lock);
-
-       old_gc = rcu_dereference_protected(c->replicas_gc,
-                                          lockdep_is_held(&c->sb_lock));
-       if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) {
-               new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev);
-               if (!new_gc)
-                       goto err;
-       }
-
-       old_r = rcu_dereference_protected(c->replicas,
-                                         lockdep_is_held(&c->sb_lock));
-       if (!replicas_has_entry(old_r, new_entry, max_dev)) {
-               new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev);
-               if (!new_r)
-                       goto err;
-
-               ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
-               if (ret)
-                       goto err;
-       }
-
-       /* allocations done, now commit: */
-
-       if (new_r)
-               bch2_write_super(c);
-
-       /* don't update in memory replicas until changes are persistent */
-
-       if (new_gc) {
-               rcu_assign_pointer(c->replicas_gc, new_gc);
-               kfree_rcu(old_gc, rcu);
-       }
-
-       if (new_r) {
-               rcu_assign_pointer(c->replicas, new_r);
-               kfree_rcu(old_r, rcu);
-       }
-
-       mutex_unlock(&c->sb_lock);
-       return 0;
-err:
-       mutex_unlock(&c->sb_lock);
-       if (new_gc)
-               kfree(new_gc);
-       if (new_r)
-               kfree(new_r);
-       return ret;
-}
-
-int bch2_mark_replicas(struct bch_fs *c,
-                      enum bch_data_type data_type,
-                      struct bch_devs_list devs)
-{
-       struct bch_replicas_cpu_entry search;
-       struct bch_replicas_cpu *r, *gc_r;
-       unsigned max_dev;
-       bool marked;
-
-       if (!devs.nr)
-               return 0;
-
-       BUG_ON(devs.nr >= BCH_REPLICAS_MAX);
-
-       devlist_to_replicas(devs, data_type, &search, &max_dev);
-
-       rcu_read_lock();
-       r = rcu_dereference(c->replicas);
-       gc_r = rcu_dereference(c->replicas_gc);
-       marked = replicas_has_entry(r, search, max_dev) &&
-               (!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev));
-       rcu_read_unlock();
-
-       return likely(marked) ? 0
-               : bch2_mark_replicas_slowpath(c, search, max_dev);
-}
-
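
bch2_mark_replicas() is the classic RCU fast path/slow path split: a lock-free lookup against the published table, falling back to a locked path that rechecks, builds a bigger table, and republishes it. A miniature of that shape using C11 atomics and a mutex in place of RCU — illustrative only: unlike the real code, which defers frees with kfree_rcu(), this demo frees immediately and so is only safe single-threaded:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdlib.h>

    struct table { int max_key; };

    static _Atomic(struct table *) current;
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    static bool has_entry(struct table *t, int key)
    {
            return t && key <= t->max_key;
    }

    static int mark_slowpath(int key)
    {
            pthread_mutex_lock(&lock);

            struct table *old = atomic_load(&current);
            if (!has_entry(old, key)) {     /* recheck under the lock */
                    struct table *new = malloc(sizeof(*new));
                    if (!new) {
                            pthread_mutex_unlock(&lock);
                            return -1;      /* -ENOMEM */
                    }
                    new->max_key = key;
                    atomic_store_explicit(&current, new, memory_order_release);
                    free(old);              /* real code: kfree_rcu() after a grace period */
            }

            pthread_mutex_unlock(&lock);
            return 0;
    }

    static int mark(int key)
    {
            /* fast path: lock-free read of the published table */
            struct table *t = atomic_load_explicit(&current, memory_order_acquire);

            return has_entry(t, key) ? 0 : mark_slowpath(key);
    }

    int main(void)
    {
            return mark(3) || mark(1);      /* second call takes the fast path */
    }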
-int bch2_mark_bkey_replicas(struct bch_fs *c,
-                           enum bch_data_type data_type,
-                           struct bkey_s_c k)
-{
-       struct bch_devs_list cached = bch2_bkey_cached_devs(k);
-       unsigned i;
-       int ret;
-
-       for (i = 0; i < cached.nr; i++)
-               if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
-                                             bch2_dev_list_single(cached.devs[i]))))
-                       return ret;
-
-       return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k));
-}
-
-int bch2_replicas_gc_end(struct bch_fs *c, int err)
-{
-       struct bch_replicas_cpu *new_r, *old_r;
-       int ret = 0;
-
-       lockdep_assert_held(&c->replicas_gc_lock);
-
-       mutex_lock(&c->sb_lock);
-
-       new_r = rcu_dereference_protected(c->replicas_gc,
-                                         lockdep_is_held(&c->sb_lock));
-
-       if (err) {
-               rcu_assign_pointer(c->replicas_gc, NULL);
-               kfree_rcu(new_r, rcu);
-               goto err;
-       }
-
-       if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
-               ret = -ENOSPC;
-               goto err;
-       }
-
-       old_r = rcu_dereference_protected(c->replicas,
-                                         lockdep_is_held(&c->sb_lock));
-
-       rcu_assign_pointer(c->replicas, new_r);
-       rcu_assign_pointer(c->replicas_gc, NULL);
-       kfree_rcu(old_r, rcu);
-
-       bch2_write_super(c);
-err:
-       mutex_unlock(&c->sb_lock);
-       return ret;
-}
-
-int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
-{
-       struct bch_replicas_cpu *dst, *src;
-       struct bch_replicas_cpu_entry *e;
-
-       lockdep_assert_held(&c->replicas_gc_lock);
-
-       mutex_lock(&c->sb_lock);
-       BUG_ON(c->replicas_gc);
-
-       src = rcu_dereference_protected(c->replicas,
-                                       lockdep_is_held(&c->sb_lock));
-
-       dst = kzalloc(sizeof(struct bch_replicas_cpu) +
-                     src->nr * src->entry_size, GFP_NOIO);
-       if (!dst) {
-               mutex_unlock(&c->sb_lock);
-               return -ENOMEM;
-       }
-
-       dst->nr         = 0;
-       dst->entry_size = src->entry_size;
-
-       for_each_cpu_replicas_entry(src, e)
-               if (!((1 << e->data_type) & typemask))
-                       memcpy(cpu_replicas_entry(dst, dst->nr++),
-                              e, dst->entry_size);
-
-       bch2_cpu_replicas_sort(dst);
-
-       rcu_assign_pointer(c->replicas_gc, dst);
-       mutex_unlock(&c->sb_lock);
-
-       return 0;
-}
-
-/* Replicas tracking - superblock: */
-
-static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
-                                       unsigned *nr,
-                                       unsigned *bytes,
-                                       unsigned *max_dev)
-{
-       struct bch_replicas_entry *i;
-       unsigned j;
-
-       *nr     = 0;
-       *bytes  = sizeof(*r);
-       *max_dev = 0;
-
-       if (!r)
-               return;
-
-       for_each_replicas_entry(r, i) {
-               for (j = 0; j < i->nr; j++)
-                       *max_dev = max_t(unsigned, *max_dev, i->devs[j]);
-               (*nr)++;
-       }
-
-       *bytes = (void *) i - (void *) r;
-}
-
-static struct bch_replicas_cpu *
-__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
-{
-       struct bch_replicas_cpu *cpu_r;
-       unsigned i, nr, bytes, max_dev, entry_size;
-
-       bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
-
-       entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
-               DIV_ROUND_UP(max_dev + 1, 8);
-
-       cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) +
-                       nr * entry_size, GFP_NOIO);
-       if (!cpu_r)
-               return NULL;
-
-       cpu_r->nr               = nr;
-       cpu_r->entry_size       = entry_size;
-
-       if (nr) {
-               struct bch_replicas_cpu_entry *dst =
-                       cpu_replicas_entry(cpu_r, 0);
-               struct bch_replicas_entry *src = sb_r->entries;
-
-               while (dst < cpu_replicas_entry(cpu_r, nr)) {
-                       dst->data_type = src->data_type;
-                       for (i = 0; i < src->nr; i++)
-                               replicas_set_dev(dst, src->devs[i]);
-
-                       src     = replicas_entry_next(src);
-                       dst     = (void *) dst + entry_size;
-               }
-       }
-
-       bch2_cpu_replicas_sort(cpu_r);
-       return cpu_r;
-}
-
-static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
-{
-       struct bch_sb_field_replicas *sb_r;
-       struct bch_replicas_cpu *cpu_r, *old_r;
-
-       sb_r    = bch2_sb_get_replicas(c->disk_sb);
-       cpu_r   = __bch2_sb_replicas_to_cpu_replicas(sb_r);
-       if (!cpu_r)
-               return -ENOMEM;
-
-       old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
-       rcu_assign_pointer(c->replicas, cpu_r);
-       if (old_r)
-               kfree_rcu(old_r, rcu);
-
-       return 0;
-}
-
-static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
-                                           struct bch_replicas_cpu *r)
-{
-       struct bch_sb_field_replicas *sb_r;
-       struct bch_replicas_entry *sb_e;
-       struct bch_replicas_cpu_entry *e;
-       size_t i, bytes;
-
-       bytes = sizeof(struct bch_sb_field_replicas);
-
-       for_each_cpu_replicas_entry(r, e) {
-               bytes += sizeof(struct bch_replicas_entry);
-               for (i = 0; i < r->entry_size - 1; i++)
-                       bytes += hweight8(e->devs[i]);
-       }
-
-       sb_r = bch2_fs_sb_resize_replicas(c,
-                       DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64)));
-       if (!sb_r)
-               return -ENOSPC;
-
-       memset(&sb_r->entries, 0,
-              vstruct_end(&sb_r->field) -
-              (void *) &sb_r->entries);
-
-       sb_e = sb_r->entries;
-       for_each_cpu_replicas_entry(r, e) {
-               sb_e->data_type = e->data_type;
-
-               for (i = 0; i < replicas_dev_slots(r); i++)
-                       if (replicas_test_dev(e, i))
-                               sb_e->devs[sb_e->nr++] = i;
-
-               sb_e = replicas_entry_next(sb_e);
-
-               BUG_ON((void *) sb_e > vstruct_end(&sb_r->field));
-       }
-
-       return 0;
-}
-
-static const char *bch2_sb_validate_replicas(struct bch_sb *sb,
-                                            struct bch_sb_field *f)
-{
-       struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas);
-       struct bch_sb_field_members *mi = bch2_sb_get_members(sb);
-       struct bch_replicas_cpu *cpu_r = NULL;
-       struct bch_replicas_entry *e;
-       const char *err;
-       unsigned i;
-
-       for_each_replicas_entry(sb_r, e) {
-               err = "invalid replicas entry: invalid data type";
-               if (e->data_type >= BCH_DATA_NR)
-                       goto err;
-
-               err = "invalid replicas entry: no devices";
-               if (!e->nr)
-                       goto err;
-
-               err = "invalid replicas entry: too many devices";
-               if (e->nr >= BCH_REPLICAS_MAX)
-                       goto err;
-
-               err = "invalid replicas entry: invalid device";
-               for (i = 0; i < e->nr; i++)
-                       if (!bch2_dev_exists(sb, mi, e->devs[i]))
-                               goto err;
-       }
-
-       err = "cannot allocate memory";
-       cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r);
-       if (!cpu_r)
-               goto err;
-
-       sort_cmp_size(cpu_r->entries,
-                     cpu_r->nr,
-                     cpu_r->entry_size,
-                     memcmp, NULL);
-
-       for (i = 0; i + 1 < cpu_r->nr; i++) {
-               struct bch_replicas_cpu_entry *l =
-                       cpu_replicas_entry(cpu_r, i);
-               struct bch_replicas_cpu_entry *r =
-                       cpu_replicas_entry(cpu_r, i + 1);
-
-               BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0);
-
-               err = "duplicate replicas entry";
-               if (!memcmp(l, r, cpu_r->entry_size))
-                       goto err;
-       }
-
-       err = NULL;
-err:
-       kfree(cpu_r);
-       return err;
-}
-
-int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t size)
-{
-       char *out = buf, *end = out + size;
-       struct bch_replicas_entry *e;
-       bool first = true;
-       unsigned i;
-
-       if (!r) {
-               out += scnprintf(out, end - out, "(no replicas section found)");
-               return out - buf;
-       }
-
-       for_each_replicas_entry(r, e) {
-               if (!first)
-                       out += scnprintf(out, end - out, " ");
-               first = false;
-
-               out += scnprintf(out, end - out, "%u: [", e->data_type);
-
-               for (i = 0; i < e->nr; i++)
-                       out += scnprintf(out, end - out,
-                                        i ? " %u" : "%u", e->devs[i]);
-               out += scnprintf(out, end - out, "]");
-       }
-
-       return out - buf;
-}
-
-/* Query replicas: */
-
-bool bch2_replicas_marked(struct bch_fs *c,
-                         enum bch_data_type data_type,
-                         struct bch_devs_list devs)
-{
-       struct bch_replicas_cpu_entry search;
-       unsigned max_dev;
-       bool ret;
-
-       if (!devs.nr)
-               return true;
-
-       devlist_to_replicas(devs, data_type, &search, &max_dev);
-
-       rcu_read_lock();
-       ret = replicas_has_entry(rcu_dereference(c->replicas),
-                                search, max_dev);
-       rcu_read_unlock();
-
-       return ret;
-}
-
-bool bch2_bkey_replicas_marked(struct bch_fs *c,
-                              enum bch_data_type data_type,
-                              struct bkey_s_c k)
-{
-       struct bch_devs_list cached = bch2_bkey_cached_devs(k);
-       unsigned i;
-
-       for (i = 0; i < cached.nr; i++)
-               if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
-                                         bch2_dev_list_single(cached.devs[i])))
-                       return false;
-
-       return bch2_replicas_marked(c, data_type, bch2_bkey_dirty_devs(k));
-}
-
-struct replicas_status __bch2_replicas_status(struct bch_fs *c,
-                                             struct bch_devs_mask online_devs)
-{
-       struct bch_sb_field_members *mi;
-       struct bch_replicas_cpu_entry *e;
-       struct bch_replicas_cpu *r;
-       unsigned i, dev, dev_slots, nr_online, nr_offline;
-       struct replicas_status ret;
-
-       memset(&ret, 0, sizeof(ret));
-
-       for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
-               ret.replicas[i].nr_online = UINT_MAX;
-
-       mi = bch2_sb_get_members(c->disk_sb);
-       rcu_read_lock();
-
-       r = rcu_dereference(c->replicas);
-       dev_slots = replicas_dev_slots(r);
-
-       for_each_cpu_replicas_entry(r, e) {
-               if (e->data_type >= ARRAY_SIZE(ret.replicas))
-                       panic("e %p data_type %u\n", e, e->data_type);
-
-               nr_online = nr_offline = 0;
-
-               for (dev = 0; dev < dev_slots; dev++) {
-                       if (!replicas_test_dev(e, dev))
-                               continue;
-
-                       BUG_ON(!bch2_dev_exists(c->disk_sb, mi, dev));
-
-                       if (test_bit(dev, online_devs.d))
-                               nr_online++;
-                       else
-                               nr_offline++;
-               }
-
-               ret.replicas[e->data_type].nr_online =
-                       min(ret.replicas[e->data_type].nr_online,
-                           nr_online);
-
-               ret.replicas[e->data_type].nr_offline =
-                       max(ret.replicas[e->data_type].nr_offline,
-                           nr_offline);
-       }
-
-       rcu_read_unlock();
-
-       return ret;
-}
-
-struct replicas_status bch2_replicas_status(struct bch_fs *c)
-{
-       return __bch2_replicas_status(c, bch2_online_devs(c));
-}
-
-static bool have_enough_devs(struct replicas_status s,
-                            enum bch_data_type type,
-                            bool force_if_degraded,
-                            bool force_if_lost)
-{
-       return (!s.replicas[type].nr_offline || force_if_degraded) &&
-               (s.replicas[type].nr_online || force_if_lost);
-}
-
-bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
-{
-       return (have_enough_devs(s, BCH_DATA_JOURNAL,
-                                flags & BCH_FORCE_IF_METADATA_DEGRADED,
-                                flags & BCH_FORCE_IF_METADATA_LOST) &&
-               have_enough_devs(s, BCH_DATA_BTREE,
-                                flags & BCH_FORCE_IF_METADATA_DEGRADED,
-                                flags & BCH_FORCE_IF_METADATA_LOST) &&
-               have_enough_devs(s, BCH_DATA_USER,
-                                flags & BCH_FORCE_IF_DATA_DEGRADED,
-                                flags & BCH_FORCE_IF_DATA_LOST));
-}
-
-unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
-{
-       struct replicas_status s = bch2_replicas_status(c);
-
-       return meta
-               ? min(s.replicas[BCH_DATA_JOURNAL].nr_online,
-                     s.replicas[BCH_DATA_BTREE].nr_online)
-               : s.replicas[BCH_DATA_USER].nr_online;
-}
-
-unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
-{
-       struct bch_replicas_cpu_entry *e;
-       struct bch_replicas_cpu *r;
-       unsigned ret = 0;
-
-       rcu_read_lock();
-       r = rcu_dereference(c->replicas);
-
-       if (ca->dev_idx >= replicas_dev_slots(r))
-               goto out;
-
-       for_each_cpu_replicas_entry(r, e)
-               if (replicas_test_dev(e, ca->dev_idx))
-                       ret |= 1 << e->data_type;
-out:
-       rcu_read_unlock();
-
-       return ret;
-}
+static const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
+       .validate       = bch2_sb_validate_crypt,
+};
 
-/* Quotas: */
+static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
+#define x(f, nr)                                       \
+       [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,
+       BCH_SB_FIELDS()
+#undef x
+};
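
The dispatch table above is built with the x-macro pattern: BCH_SB_FIELDS() expands x(name, nr) once per field type, so adding a field automatically adds its ops slot. The same trick, standalone, with an invented field list:

    #include <stdio.h>

    #define FIELDS()        x(journal, 0) x(members, 1) x(crypt, 2)

    enum field_type {
    #define x(f, nr)        FIELD_##f = nr,
            FIELDS()
    #undef x
            FIELD_NR
    };

    static const char * const field_names[] = {
    #define x(f, nr)        [FIELD_##f] = #f,
            FIELDS()
    #undef x
    };

    int main(void)
    {
            for (unsigned i = 0; i < FIELD_NR; i++)
                    printf("%u: %s\n", i, field_names[i]);
            return 0;
    }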
 
-static const char *bch2_sb_validate_quota(struct bch_sb *sb,
+static const char *bch2_sb_field_validate(struct bch_sb *sb,
                                          struct bch_sb_field *f)
 {
-       struct bch_sb_field_quota *q = field_to_type(f, quota);
-
-       if (vstruct_bytes(&q->field) != sizeof(*q))
-               return "invalid field quota: wrong size";
-
-       return NULL;
-}
-
-/* Disk groups: */
-
-static int strcmp_void(const void *l, const void *r)
-{
-       return strcmp(l, r);
-}
-
-static const char *bch2_sb_validate_disk_groups(struct bch_sb *sb,
-                                               struct bch_sb_field *f)
-{
-       struct bch_sb_field_disk_groups *groups =
-               field_to_type(f, disk_groups);
-       struct bch_disk_group *g;
-       struct bch_sb_field_members *mi;
-       struct bch_member *m;
-       unsigned i, nr_groups, nr_live = 0, len;
-       char **labels, *l;
-       const char *err = NULL;
-
-       mi              = bch2_sb_get_members(sb);
-       groups          = bch2_sb_get_disk_groups(sb);
-       nr_groups       = disk_groups_nr(groups);
-
-       for (m = mi->members;
-            m < mi->members + sb->nr_devices;
-            m++) {
-               unsigned g;
-
-               if (!BCH_MEMBER_GROUP(m))
-                       continue;
-
-               g = BCH_MEMBER_GROUP(m) - 1;
-
-               if (g >= nr_groups ||
-                   BCH_GROUP_DELETED(&groups->entries[g]))
-                       return "disk has invalid group";
-       }
-
-       if (!nr_groups)
-               return NULL;
-
-       labels = kcalloc(nr_groups, sizeof(char *), GFP_KERNEL);
-       if (!labels)
-               return "cannot allocate memory";
-
-       for (g = groups->entries;
-            g < groups->entries + nr_groups;
-            g++) {
-               if (BCH_GROUP_DELETED(g))
-                       continue;
-
-               len = strnlen(g->label, sizeof(g->label));
-
-               labels[nr_live++] = l = kmalloc(len + 1, GFP_KERNEL);
-               if (!l) {
-                       err = "cannot allocate memory";
-                       goto err;
-               }
-
-               memcpy(l, g->label, len);
-               l[len] = '\0';
-       }
-
-       sort(labels, nr_live, sizeof(labels[0]), strcmp_void, NULL);
-
-       for (i = 0; i + 1 < nr_live; i++)
-               if (!strcmp(labels[i], labels[i + 1])) {
-                       err = "duplicate group labels";
-                       goto err;
-               }
-
-       err = NULL;
-err:
-       for (i = 0; i < nr_live; i++)
-               kfree(labels[i]);
-       kfree(labels);
-       return err;
-}
-
-static int bch2_sb_disk_groups_to_cpu(struct bch_fs *c)
-{
-       struct bch_sb_field_members *mi;
-       struct bch_sb_field_disk_groups *groups;
-       struct bch_disk_groups_cpu *cpu_g, *old_g;
-       unsigned i, nr_groups;
-
-       lockdep_assert_held(&c->sb_lock);
-
-       mi              = bch2_sb_get_members(c->disk_sb);
-       groups          = bch2_sb_get_disk_groups(c->disk_sb);
-       nr_groups       = disk_groups_nr(groups);
-
-       if (!groups)
-               return 0;
-
-       cpu_g = kzalloc(sizeof(*cpu_g) +
-                       sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL);
-       if (!cpu_g)
-               return -ENOMEM;
-
-       cpu_g->nr = nr_groups;
-
-       for (i = 0; i < nr_groups; i++) {
-               struct bch_disk_group *src      = &groups->entries[i];
-               struct bch_disk_group_cpu *dst  = &cpu_g->entries[i];
-
-               dst->deleted = BCH_GROUP_DELETED(src);
-       }
-
-       for (i = 0; i < c->disk_sb->nr_devices; i++) {
-               struct bch_member *m = mi->members + i;
-               struct bch_disk_group_cpu *dst =
-                       &cpu_g->entries[BCH_MEMBER_GROUP(m)];
-
-               if (!bch2_member_exists(m))
-                       continue;
-
-               dst = BCH_MEMBER_GROUP(m)
-                       ? &cpu_g->entries[BCH_MEMBER_GROUP(m) - 1]
-                       : NULL;
-               if (dst)
-                       __set_bit(i, dst->devs.d);
-       }
-
-       old_g = c->disk_groups;
-       rcu_assign_pointer(c->disk_groups, cpu_g);
-       if (old_g)
-               kfree_rcu(old_g, rcu);
-
-       return 0;
-}
-
-const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target)
-{
-       struct target t = target_decode(target);
-
-       switch (t.type) {
-       case TARGET_DEV: {
-               struct bch_dev *ca = t.dev < c->sb.nr_devices
-                       ? rcu_dereference(c->devs[t.dev])
-                       : NULL;
-               return ca ? &ca->self : NULL;
-       }
-       case TARGET_GROUP: {
-               struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
-
-               return t.group < g->nr && !g->entries[t.group].deleted
-                       ? &g->entries[t.group].devs
-                       : NULL;
-       }
-       default:
-               BUG();
-       }
-}
-
-int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups,
-                          const char *name)
-{
-       unsigned i, nr_groups = disk_groups_nr(groups);
-       unsigned len = strlen(name);
-
-       for (i = 0; i < nr_groups; i++) {
-               struct bch_disk_group *g = groups->entries + i;
-
-               if (BCH_GROUP_DELETED(g))
-                       continue;
-
-               if (strnlen(g->label, sizeof(g->label)) == len &&
-                   !memcmp(name, g->label, len))
-                       return i;
-       }
-
-       return -1;
-}
-
-static int bch2_disk_group_find(struct bch_fs *c, const char *name)
-{
-       int ret;
-
-       mutex_lock(&c->sb_lock);
-       ret = __bch2_disk_group_find(bch2_sb_get_disk_groups(c->disk_sb), name);
-       mutex_unlock(&c->sb_lock);
+       unsigned type = le32_to_cpu(f->type);
 
-       return ret;
+       return type < BCH_SB_FIELD_NR
+               ? bch2_sb_field_ops[type]->validate(sb, f)
+               : NULL;
 }
 
-int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v)
+size_t bch2_sb_field_to_text(char *buf, size_t size,
+                            struct bch_sb *sb, struct bch_sb_field *f)
 {
-       struct bch_dev *ca;
-       int g;
-
-       if (!strlen(buf) || !strcmp(buf, "none")) {
-               *v = 0;
-               return 0;
-       }
-
-       /* Is it a device? */
-       ca = bch2_dev_lookup(c, buf);
-       if (!IS_ERR(ca)) {
-               *v = dev_to_target(ca->dev_idx);
-               percpu_ref_put(&ca->ref);
-               return 0;
-       }
+       unsigned type = le32_to_cpu(f->type);
+       size_t (*to_text)(char *, size_t, struct bch_sb *,
+                                  struct bch_sb_field *) =
+               type < BCH_SB_FIELD_NR
+               ? bch2_sb_field_ops[type]->to_text
+               : NULL;
 
-       g = bch2_disk_group_find(c, buf);
-       if (g >= 0) {
-               *v = group_to_target(g);
+       if (!to_text) {
+               if (size)
+                       buf[0] = '\0';
                return 0;
        }
 
-       return -EINVAL;
-}
-
-int bch2_opt_target_print(struct bch_fs *c, char *buf, size_t len, u64 v)
-{
-       struct target t = target_decode(v);
-       int ret;
-
-       switch (t.type) {
-       case TARGET_NULL:
-               return scnprintf(buf, len, "none");
-       case TARGET_DEV: {
-               struct bch_dev *ca;
-
-               rcu_read_lock();
-               ca = t.dev < c->sb.nr_devices
-                       ? rcu_dereference(c->devs[t.dev])
-                       : NULL;
-
-               if (ca && percpu_ref_tryget(&ca->io_ref)) {
-                       char b[BDEVNAME_SIZE];
-
-                       ret = scnprintf(buf, len, "/dev/%s",
-                                       bdevname(ca->disk_sb.bdev, b));
-                       percpu_ref_put(&ca->io_ref);
-               } else if (ca) {
-                       ret = scnprintf(buf, len, "offline device %u", t.dev);
-               } else {
-                       ret = scnprintf(buf, len, "invalid device %u", t.dev);
-               }
-
-               rcu_read_unlock();
-               break;
-       }
-       case TARGET_GROUP: {
-               struct bch_sb_field_disk_groups *groups;
-               struct bch_disk_group *g;
-
-               mutex_lock(&c->sb_lock);
-               groups = bch2_sb_get_disk_groups(c->disk_sb);
-
-               g = t.group < disk_groups_nr(groups)
-                       ? groups->entries + t.group
-                       : NULL;
-
-               if (g && !BCH_GROUP_DELETED(g)) {
-                       ret = len ? min(len - 1, strnlen(g->label, sizeof(g->label))) : 0;
-
-                       memcpy(buf, g->label, ret);
-                       if (len)
-                               buf[ret] = '\0';
-               } else {
-                       ret = scnprintf(buf, len, "invalid group %u", t.group);
-               }
-
-               mutex_unlock(&c->sb_lock);
-               break;
-       }
-       default:
-               BUG();
-       }
-
-       return ret;
+       return to_text(buf, size, sb, f);
 }
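
bch2_sb_field_to_text() above is table dispatch with a graceful fallback: a field type with no to_text hook yields an empty string rather than an error. The same shape in miniature (all names invented):

    #include <stdio.h>

    typedef size_t (*to_text_fn)(char *, size_t);

    static size_t hello_to_text(char *buf, size_t size)
    {
            return snprintf(buf, size, "hello");
    }

    static const to_text_fn table[] = { hello_to_text, NULL };

    static size_t field_to_text(char *buf, size_t size, unsigned type)
    {
            to_text_fn fn = type < 2 ? table[type] : NULL;

            if (!fn) {                      /* no handler: emit an empty string */
                    if (size)
                            buf[0] = '\0';
                    return 0;
            }
            return fn(buf, size);
    }

    int main(void)
    {
            char buf[16];

            printf("%zu '%s'\n", field_to_text(buf, sizeof(buf), 0), buf);
            printf("%zu '%s'\n", field_to_text(buf, sizeof(buf), 1), buf);
            return 0;
    }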
index 2514ac8a7b8cb0f40e54543f53517292ecca3543..f407c205b93b4c4bc95018877f4d8581de588296 100644 (file)
@@ -11,8 +11,6 @@
 struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type);
 struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *,
                                          enum bch_sb_field_type, unsigned);
-struct bch_sb_field *bch2_fs_sb_field_resize(struct bch_fs *,
-                                        enum bch_sb_field_type, unsigned);
 
 #define field_to_type(_f, _name)                                       \
        container_of_or_null(_f, struct bch_sb_field_##_name, field)
@@ -30,13 +28,6 @@ bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s)      \
 {                                                                      \
        return field_to_type(bch2_sb_field_resize(sb,                   \
                                BCH_SB_FIELD_##_name, u64s), _name);    \
-}                                                                      \
-                                                                       \
-static inline struct bch_sb_field_##_name *                            \
-bch2_fs_sb_resize_##_name(struct bch_fs *c, unsigned u64s)             \
-{                                                                      \
-       return field_to_type(bch2_fs_sb_field_resize(c,                 \
-                               BCH_SB_FIELD_##_name, u64s), _name);    \
 }
 
 BCH_SB_FIELDS()
@@ -44,6 +35,12 @@ BCH_SB_FIELDS()
 
 extern const char * const bch2_sb_fields[];
 
+struct bch_sb_field_ops {
+       const char *    (*validate)(struct bch_sb *, struct bch_sb_field *);
+       size_t          (*to_text)(char *, size_t, struct bch_sb *,
+                                  struct bch_sb_field *);
+};
+
 static inline bool bch2_sb_test_feature(struct bch_sb *sb,
                                        enum bch_sb_features f)
 {
@@ -90,7 +87,7 @@ int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *);
 int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *);
 
 void bch2_free_super(struct bch_sb_handle *);
-int bch2_super_realloc(struct bch_sb_handle *, unsigned);
+int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
 
 const char *bch2_sb_validate(struct bch_sb_handle *);
 
@@ -139,135 +136,4 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
        };
 }
 
-/* BCH_SB_FIELD_replicas: */
-
-bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type,
-                         struct bch_devs_list);
-bool bch2_bkey_replicas_marked(struct bch_fs *, enum bch_data_type,
-                              struct bkey_s_c);
-int bch2_mark_replicas(struct bch_fs *, enum bch_data_type,
-                      struct bch_devs_list);
-int bch2_mark_bkey_replicas(struct bch_fs *, enum bch_data_type,
-                           struct bkey_s_c);
-
-int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *, char *, size_t);
-int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *, char *, size_t);
-
-struct replicas_status {
-       struct {
-               unsigned        nr_online;
-               unsigned        nr_offline;
-       }                       replicas[BCH_DATA_NR];
-};
-
-struct replicas_status __bch2_replicas_status(struct bch_fs *,
-                                             struct bch_devs_mask);
-struct replicas_status bch2_replicas_status(struct bch_fs *);
-bool bch2_have_enough_devs(struct replicas_status, unsigned);
-
-unsigned bch2_replicas_online(struct bch_fs *, bool);
-unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
-
-int bch2_replicas_gc_end(struct bch_fs *, int);
-int bch2_replicas_gc_start(struct bch_fs *, unsigned);
-
-/* iterate over superblock replicas - used by userspace tools: */
-
-static inline struct bch_replicas_entry *
-replicas_entry_next(struct bch_replicas_entry *i)
-{
-       return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr;
-}
-
-#define for_each_replicas_entry(_r, _i)                                        \
-       for (_i = (_r)->entries;                                        \
-            (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\
-            (_i) = replicas_entry_next(_i))
-
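
replicas_entry_next() steps through variable-length entries laid out back to back: each entry is a fixed header followed by nr device bytes, so the next entry starts offsetof(devs) + nr bytes later. A sketch with a cut-down entry type — safe here because every member is a byte, so alignment never bites:

    #include <stddef.h>
    #include <stdio.h>

    struct entry {
            unsigned char   data_type;
            unsigned char   nr;
            unsigned char   devs[];         /* nr device indexes follow inline */
    };

    static struct entry *entry_next(struct entry *e)
    {
            return (struct entry *) ((char *) e + offsetof(struct entry, devs) + e->nr);
    }

    int main(void)
    {
            /* two packed entries: type 1 with devs {0, 2}, type 2 with dev {1} */
            unsigned char buf[] = { 1, 2, 0, 2,   2, 1, 1 };
            struct entry *e   = (struct entry *) buf;
            struct entry *end = (struct entry *) (buf + sizeof(buf));

            for (; e < end; e = entry_next(e))
                    printf("type %u: %u devs\n", e->data_type, e->nr);
            return 0;
    }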
-/* disk groups: */
-
-static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups)
-{
-       return groups
-               ? (vstruct_end(&groups->field) -
-                  (void *) &groups->entries[0]) / sizeof(struct bch_disk_group)
-               : 0;
-}
-
-struct target {
-       enum {
-               TARGET_NULL,
-               TARGET_DEV,
-               TARGET_GROUP,
-       }                       type;
-       union {
-               unsigned        dev;
-               unsigned        group;
-       };
-};
-
-#define TARGET_DEV_START       1
-#define TARGET_GROUP_START     (256 + TARGET_DEV_START)
-
-static inline u16 dev_to_target(unsigned dev)
-{
-       return TARGET_DEV_START + dev;
-}
-
-static inline u16 group_to_target(unsigned group)
-{
-       return TARGET_GROUP_START + group;
-}
-
-static inline struct target target_decode(unsigned target)
-{
-       if (target >= TARGET_GROUP_START)
-               return (struct target) {
-                       .type   = TARGET_GROUP,
-                       .group  = target - TARGET_GROUP_START
-               };
-
-       if (target >= TARGET_DEV_START)
-               return (struct target) {
-                       .type   = TARGET_DEV,
-                       .dev    = target - TARGET_DEV_START
-               };
-
-       return (struct target) { .type = TARGET_NULL };
-}
-
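
The target encoding above packs everything into one small integer: 0 means none, 1..256 a device (index + TARGET_DEV_START), 257 and up a group. A worked round trip:

    #include <assert.h>

    #define TARGET_DEV_START        1
    #define TARGET_GROUP_START      (256 + TARGET_DEV_START)

    int main(void)
    {
            unsigned dev_target   = TARGET_DEV_START + 3;   /* device 3 -> 4   */
            unsigned group_target = TARGET_GROUP_START + 7; /* group 7  -> 264 */

            assert(dev_target < TARGET_GROUP_START);        /* decodes as a device */
            assert(dev_target - TARGET_DEV_START == 3);

            assert(group_target >= TARGET_GROUP_START);     /* decodes as a group */
            assert(group_target - TARGET_GROUP_START == 7);
            return 0;
    }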
-static inline bool dev_in_target(struct bch_dev *ca, unsigned target)
-{
-       struct target t = target_decode(target);
-
-       switch (t.type) {
-       case TARGET_NULL:
-               return false;
-       case TARGET_DEV:
-               return ca->dev_idx == t.dev;
-       case TARGET_GROUP:
-               return ca->mi.group && ca->mi.group - 1 == t.group;
-       default:
-               BUG();
-       }
-}
-
-static inline bool dev_idx_in_target(struct bch_fs *c, unsigned dev, unsigned target)
-{
-       bool ret;
-
-       rcu_read_lock();
-       ret = dev_in_target(rcu_dereference(c->devs[dev]), target);
-       rcu_read_unlock();
-
-       return ret;
-}
-
-const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
-
-int __bch2_disk_group_find(struct bch_sb_field_disk_groups *, const char *);
-
-int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *);
-int bch2_opt_target_print(struct bch_fs *, char *, size_t, u64);
-
 #endif /* _BCACHEFS_SUPER_IO_H */
index 77670ea6e077c2aaef9964fa0980fda7e3dbe7c4..05910c404aec3303eb210599096468376b20a8d1 100644 (file)
@@ -18,6 +18,7 @@
 #include "clock.h"
 #include "compress.h"
 #include "debug.h"
+#include "disk_groups.h"
 #include "error.h"
 #include "fs.h"
 #include "fs-io.h"
@@ -30,6 +31,7 @@
 #include "migrate.h"
 #include "movinggc.h"
 #include "quota.h"
+#include "replicas.h"
 #include "super.h"
 #include "super-io.h"
 #include "sysfs.h"
@@ -122,7 +124,7 @@ static struct bch_fs *__bch2_uuid_to_fs(uuid_le uuid)
        lockdep_assert_held(&bch_fs_list_lock);
 
        list_for_each_entry(c, &bch_fs_list, list)
-               if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le)))
+               if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid_le)))
                        return c;
 
        return NULL;
@@ -203,23 +205,12 @@ static void bch_fs_mark_clean(struct bch_fs *c)
            !test_bit(BCH_FS_ERROR, &c->flags) &&
            !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) {
                mutex_lock(&c->sb_lock);
-               SET_BCH_SB_CLEAN(c->disk_sb, true);
+               SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
                bch2_write_super(c);
                mutex_unlock(&c->sb_lock);
        }
 }
 
-static bool btree_interior_updates_done(struct bch_fs *c)
-{
-       bool ret;
-
-       mutex_lock(&c->btree_interior_update_lock);
-       ret = list_empty(&c->btree_interior_update_list);
-       mutex_unlock(&c->btree_interior_update_lock);
-
-       return ret;
-}
-
 static void __bch2_fs_read_only(struct bch_fs *c)
 {
        struct bch_dev *ca;
@@ -251,7 +242,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
         * fully complete:
         */
        closure_wait_event(&c->btree_interior_update_wait,
-                          btree_interior_updates_done(c));
+                          !bch2_btree_interior_updates_nr_pending(c));
 
        if (!test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
                bch2_btree_verify_flushed(c);
@@ -433,7 +424,8 @@ static void bch2_fs_free(struct bch_fs *c)
        if (c->wq)
                destroy_workqueue(c->wq);
 
-       free_pages((unsigned long) c->disk_sb, c->disk_sb_order);
+       free_pages((unsigned long) c->disk_sb.sb,
+                  c->disk_sb.page_order);
        kvpfree(c, sizeof(*c));
        module_put(THIS_MODULE);
 }
@@ -501,11 +493,54 @@ void bch2_fs_stop(struct bch_fs *c)
        kobject_put(&c->kobj);
 }
 
+static const char *bch2_fs_online(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       const char *err = NULL;
+       unsigned i;
+       int ret;
+
+       lockdep_assert_held(&bch_fs_list_lock);
+
+       if (!list_empty(&c->list))
+               return NULL;
+
+       if (__bch2_uuid_to_fs(c->sb.uuid))
+               return "filesystem UUID already open";
+
+       ret = bch2_fs_chardev_init(c);
+       if (ret)
+               return "error creating character device";
+
+       bch2_fs_debug_init(c);
+
+       if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
+           kobject_add(&c->internal, &c->kobj, "internal") ||
+           kobject_add(&c->opts_dir, &c->kobj, "options") ||
+           kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
+           bch2_opts_create_sysfs_files(&c->opts_dir))
+               return "error creating sysfs objects";
+
+       mutex_lock(&c->state_lock);
+
+       err = "error creating sysfs objects";
+       __for_each_member_device(ca, c, i, NULL)
+               if (bch2_dev_sysfs_online(c, ca))
+                       goto err;
+
+       list_add(&c->list, &bch_fs_list);
+       err = NULL;
+err:
+       mutex_unlock(&c->state_lock);
+       return err;
+}
+
 static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 {
        struct bch_sb_field_members *mi;
        struct bch_fs *c;
        unsigned i, iter_size;
+       const char *err;
 
        pr_verbose_init(opts, "");
 
@@ -516,6 +551,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        __module_get(THIS_MODULE);
 
        c->minor                = -1;
+       c->disk_sb.fs_sb        = true;
 
        mutex_init(&c->state_lock);
        mutex_init(&c->sb_lock);
@@ -627,9 +663,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
            bch2_fs_fsio_init(c))
                goto err;
 
-       mi = bch2_sb_get_members(c->disk_sb);
+       mi = bch2_sb_get_members(c->disk_sb.sb);
        for (i = 0; i < c->sb.nr_devices; i++)
-               if (bch2_dev_exists(c->disk_sb, mi, i) &&
+               if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
                    bch2_dev_alloc(c, i))
                        goto err;
 
@@ -644,6 +680,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        kobject_init(&c->internal, &bch2_fs_internal_ktype);
        kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
        kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
+
+       mutex_lock(&bch_fs_list_lock);
+       err = bch2_fs_online(c);
+       mutex_unlock(&bch_fs_list_lock);
+       if (err) {
+               bch_err(c, "bch2_fs_online() error: %s", err);
+               goto err;
+       }
 out:
        pr_verbose_init(opts, "ret %i", c ? 0 : -ENOMEM);
        return c;
@@ -653,60 +697,7 @@ err:
        goto out;
 }
 
-static const char *__bch2_fs_online(struct bch_fs *c)
-{
-       struct bch_dev *ca;
-       const char *err = NULL;
-       unsigned i;
-       int ret;
-
-       lockdep_assert_held(&bch_fs_list_lock);
-
-       if (!list_empty(&c->list))
-               return NULL;
-
-       if (__bch2_uuid_to_fs(c->sb.uuid))
-               return "filesystem UUID already open";
-
-       ret = bch2_fs_chardev_init(c);
-       if (ret)
-               return "error creating character device";
-
-       bch2_fs_debug_init(c);
-
-       if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
-           kobject_add(&c->internal, &c->kobj, "internal") ||
-           kobject_add(&c->opts_dir, &c->kobj, "options") ||
-           kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
-           bch2_opts_create_sysfs_files(&c->opts_dir))
-               return "error creating sysfs objects";
-
-       mutex_lock(&c->state_lock);
-
-       err = "error creating sysfs objects";
-       __for_each_member_device(ca, c, i, NULL)
-               if (bch2_dev_sysfs_online(c, ca))
-                       goto err;
-
-       list_add(&c->list, &bch_fs_list);
-       err = NULL;
-err:
-       mutex_unlock(&c->state_lock);
-       return err;
-}
-
-static const char *bch2_fs_online(struct bch_fs *c)
-{
-       const char *err;
-
-       mutex_lock(&bch_fs_list_lock);
-       err = __bch2_fs_online(c);
-       mutex_unlock(&bch_fs_list_lock);
-
-       return err;
-}
-
-static const char *__bch2_fs_start(struct bch_fs *c)
+const char *bch2_fs_start(struct bch_fs *c)
 {
        const char *err = "cannot allocate memory";
        struct bch_sb_field_members *mi;
@@ -730,15 +721,15 @@ static const char *__bch2_fs_start(struct bch_fs *c)
                bch2_dev_allocator_add(c, ca);
        bch2_recalc_capacity(c);
 
-       if (BCH_SB_INITIALIZED(c->disk_sb)) {
+       if (BCH_SB_INITIALIZED(c->disk_sb.sb)) {
                ret = bch2_journal_read(c, &journal);
                if (ret)
                        goto err;
 
                j = &list_entry(journal.prev, struct journal_replay, list)->j;
 
-               c->prio_clock[READ].hand = le16_to_cpu(j->read_clock);
-               c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock);
+               c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock);
+               c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock);
 
                for (i = 0; i < BTREE_ID_NR; i++) {
                        unsigned level;
@@ -824,21 +815,18 @@ static const char *__bch2_fs_start(struct bch_fs *c)
                bch_notice(c, "initializing new filesystem");
 
                set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
-               set_bit(BCH_FS_BRAND_NEW_FS, &c->flags);
 
                ret = bch2_initial_gc(c, &journal);
                if (ret)
                        goto err;
 
                err = "unable to allocate journal buckets";
-               for_each_rw_member(ca, c, i)
-                       if (bch2_dev_journal_alloc(c, ca)) {
+               for_each_online_member(ca, c, i)
+                       if (bch2_dev_journal_alloc(ca)) {
                                percpu_ref_put(&ca->io_ref);
                                goto err;
                        }
 
-               clear_bit(BCH_FS_BRAND_NEW_FS, &c->flags);
-
                for (i = 0; i < BTREE_ID_NR; i++)
                        bch2_btree_root_alloc(c, i);
 
@@ -889,18 +877,20 @@ recovery_done:
        }
 
        mutex_lock(&c->sb_lock);
-       mi = bch2_sb_get_members(c->disk_sb);
+       mi = bch2_sb_get_members(c->disk_sb.sb);
        now = ktime_get_seconds();
 
        for_each_member_device(ca, c, i)
                mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
 
-       SET_BCH_SB_INITIALIZED(c->disk_sb, true);
-       SET_BCH_SB_CLEAN(c->disk_sb, false);
+       SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
+       SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
 
        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
 
+       set_bit(BCH_FS_STARTED, &c->flags);
+
        err = NULL;
 out:
        mutex_unlock(&c->state_lock);
@@ -939,11 +929,6 @@ fsck_err:
        goto out;
 }
 
-const char *bch2_fs_start(struct bch_fs *c)
-{
-       return __bch2_fs_start(c) ?: bch2_fs_online(c);
-}
-
 static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
 {
        struct bch_sb_field_members *sb_mi;
@@ -956,7 +941,7 @@ static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
                return "mismatched block size";
 
        if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) <
-           BCH_SB_BTREE_NODE_SIZE(c->disk_sb))
+           BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb))
                return "new cache bucket size is too small";
 
        return NULL;
@@ -1082,28 +1067,19 @@ static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
        return 0;
 }
 
-static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
+static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
+                                       struct bch_member *member)
 {
-       struct bch_member *member;
-       struct bch_dev *ca = NULL;
-       int ret = 0;
-
-       pr_verbose_init(c->opts, "");
-
-       if (bch2_fs_init_fault("dev_alloc"))
-               goto err;
+       struct bch_dev *ca;
 
        ca = kzalloc(sizeof(*ca), GFP_KERNEL);
        if (!ca)
-               goto err;
+               return NULL;
 
        kobject_init(&ca->kobj, &bch2_dev_ktype);
        init_completion(&ca->ref_completion);
        init_completion(&ca->io_ref_completion);
 
-       ca->dev_idx = dev_idx;
-       __set_bit(ca->dev_idx, ca->self.d);
-
        init_rwsem(&ca->bucket_lock);
 
        writepoint_init(&ca->copygc_write_point, BCH_DATA_USER);
@@ -1113,14 +1089,8 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
 
        INIT_WORK(&ca->io_error_work, bch2_io_error_work);
 
-       if (bch2_fs_init_fault("dev_alloc"))
-               goto err;
-
-       member = bch2_sb_get_members(c->disk_sb)->members + dev_idx;
-
        ca->mi = bch2_mi_to_cpu(member);
        ca->uuid = member->uuid;
-       scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
 
        if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
                            0, GFP_KERNEL) ||
@@ -1132,11 +1102,43 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
            !(ca->io_done       = alloc_percpu(*ca->io_done)))
                goto err;
 
+       return ca;
+err:
+       bch2_dev_free(ca);
+       return NULL;
+}
+
+static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca,
+                           unsigned dev_idx)
+{
+       ca->dev_idx = dev_idx;
+       __set_bit(ca->dev_idx, ca->self.d);
+       scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx);
+
        ca->fs = c;
        rcu_assign_pointer(c->devs[ca->dev_idx], ca);
 
        if (bch2_dev_sysfs_online(c, ca))
                pr_warn("error creating sysfs objects");
+}
+
+static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
+{
+       struct bch_member *member =
+               bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx;
+       struct bch_dev *ca = NULL;
+       int ret = 0;
+
+       pr_verbose_init(c->opts, "");
+
+       if (bch2_fs_init_fault("dev_alloc"))
+               goto err;
+
+       ca = __bch2_dev_alloc(c, member);
+       if (!ca)
+               goto err;
+
+       bch2_dev_attach(c, ca, dev_idx);
 out:
        pr_verbose_init(c->opts, "ret %i", ret);
        return ret;
@@ -1147,21 +1149,9 @@ err:
        goto out;
 }
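
The hunks above split device bring-up in two: __bch2_dev_alloc() only allocates and initializes the bch_dev, while bch2_dev_attach() assigns the index and publishes the device in c->devs. That lets bch2_dev_add() further down construct a device before taking any filesystem locks. A simplified standalone model of the split, with all types and names as stand-ins:

#include <stdio.h>
#include <stdlib.h>

/* Toy model: a pure allocator that can fail without touching the
 * filesystem, and a separate attach step that wires the device in. */
struct dev_sketch { unsigned idx; };
struct fs_sketch  { struct dev_sketch *devs[8]; };

static struct dev_sketch *dev_alloc_sketch(void)
{
	return calloc(1, sizeof(struct dev_sketch));
}

static void dev_attach_sketch(struct fs_sketch *fs,
			      struct dev_sketch *dev, unsigned idx)
{
	dev->idx = idx;
	fs->devs[idx] = dev;	/* only now is the device visible */
}

int main(void)
{
	struct fs_sketch fs = { { NULL } };
	struct dev_sketch *dev = dev_alloc_sketch();

	if (!dev)
		return 1;
	dev_attach_sketch(&fs, dev, 0);
	printf("attached dev %u\n", fs.devs[0]->idx);
	free(dev);
	return 0;
}
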
 
-static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
+static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb)
 {
-       struct bch_dev *ca;
-       int ret;
-
-       lockdep_assert_held(&c->state_lock);
-
-       if (le64_to_cpu(sb->sb->seq) >
-           le64_to_cpu(c->disk_sb->seq))
-               bch2_sb_to_fs(c, sb->sb);
-
-       BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
-              !c->devs[sb->sb->dev_idx]);
-
-       ca = bch_dev_locked(c, sb->sb->dev_idx);
+       int ret;
 
        if (bch2_dev_is_online(ca)) {
                bch_err(ca, "already have device online in slot %u",
@@ -1179,7 +1169,7 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
 
        if (get_capacity(sb->bdev->bd_disk) <
            ca->mi.bucket_size * ca->mi.nbuckets) {
-               bch_err(c, "device too small");
+               bch_err(ca, "device too small");
                return -EINVAL;
        }
 
@@ -1187,35 +1177,50 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
        if (ret)
                return ret;
 
-       /*
-        * Increase journal write timeout if flushes to this device are
-        * expensive:
-        */
-       if (!blk_queue_nonrot(bdev_get_queue(sb->bdev)) &&
-           journal_flushes_device(ca))
-               c->journal.write_delay_ms =
-                       max(c->journal.write_delay_ms, 1000U);
-
        /* Commit: */
        ca->disk_sb = *sb;
        if (sb->mode & FMODE_EXCL)
                ca->disk_sb.bdev->bd_holder = ca;
        memset(sb, 0, sizeof(*sb));
 
+       if (ca->fs)
+               mutex_lock(&ca->fs->sb_lock);
+
+       bch2_mark_dev_superblock(ca->fs, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+
+       if (ca->fs)
+               mutex_unlock(&ca->fs->sb_lock);
+
+       percpu_ref_reinit(&ca->io_ref);
+
+       return 0;
+}
+
+static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
+{
+       struct bch_dev *ca;
+       int ret;
+
+       lockdep_assert_held(&c->state_lock);
+
+       if (le64_to_cpu(sb->sb->seq) >
+           le64_to_cpu(c->disk_sb.sb->seq))
+               bch2_sb_to_fs(c, sb->sb);
+
+       BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices ||
+              !c->devs[sb->sb->dev_idx]);
+
+       ca = bch_dev_locked(c, sb->sb->dev_idx);
+
+       ret = __bch2_dev_attach_bdev(ca, sb);
+       if (ret)
+               return ret;
+
        if (c->sb.nr_devices == 1)
                bdevname(ca->disk_sb.bdev, c->name);
        bdevname(ca->disk_sb.bdev, ca->name);
 
-       mutex_lock(&c->sb_lock);
-       bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
-       mutex_unlock(&c->sb_lock);
-
-       if (ca->mi.state == BCH_MEMBER_STATE_RW)
-               bch2_dev_allocator_add(c, ca);
-
        rebalance_wakeup(c);
-
-       percpu_ref_reinit(&ca->io_ref);
        return 0;
 }
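
bch2_dev_attach_bdev() keeps the rule that the superblock copy with the higher seq wins: when the incoming device's superblock is newer than the filesystem's, bch2_sb_to_fs() adopts it. A toy sketch of that newest-copy-wins check, with stand-in types rather than the real bch2_sb_to_fs():

#include <stdint.h>
#include <stdio.h>

/* Sketch: each superblock carries a sequence number; the higher one is
 * authoritative when a device is attached. */
struct sb_sketch { uint64_t seq; };

static void attach_sketch(struct sb_sketch *fs_sb,
			  const struct sb_sketch *dev_sb)
{
	if (dev_sb->seq > fs_sb->seq)	/* device has newer metadata */
		*fs_sb = *dev_sb;	/* take it as the filesystem copy */
}

int main(void)
{
	struct sb_sketch fs_sb = { .seq = 4 }, dev_sb = { .seq = 7 };

	attach_sketch(&fs_sb, &dev_sb);
	printf("fs seq now %llu\n", (unsigned long long) fs_sb.seq);
	return 0;
}
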
 
@@ -1289,10 +1294,10 @@ static bool bch2_fs_may_start(struct bch_fs *c)
 
        if (!c->opts.degraded) {
                mutex_lock(&c->sb_lock);
-               mi = bch2_sb_get_members(c->disk_sb);
+               mi = bch2_sb_get_members(c->disk_sb.sb);
 
-               for (i = 0; i < c->disk_sb->nr_devices; i++) {
-                       if (!bch2_dev_exists(c->disk_sb, mi, i))
+               for (i = 0; i < c->disk_sb.sb->nr_devices; i++) {
+                       if (!bch2_dev_exists(c->disk_sb.sb, mi, i))
                                continue;
 
                        ca = bch_dev_locked(c, i);
@@ -1360,7 +1365,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
        bch_notice(ca, "%s", bch2_dev_state[new_state]);
 
        mutex_lock(&c->sb_lock);
-       mi = bch2_sb_get_members(c->disk_sb);
+       mi = bch2_sb_get_members(c->disk_sb.sb);
        SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state);
        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
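
This is the superblock-update pattern the commit applies throughout: take sb_lock, mutate the in-memory members field, persist with bch2_write_super(), then unlock. Sketched below with a plain pthread mutex and stand-in names, only to show the shape:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t sb_lock = PTHREAD_MUTEX_INITIALIZER;
static int member_state;

static void write_super_sketch(void)
{
	printf("writing superblock, member state %d\n", member_state);
}

static void set_member_state(int new_state)
{
	pthread_mutex_lock(&sb_lock);
	member_state = new_state;	/* mutate the in-memory copy */
	write_super_sketch();		/* persist before dropping the lock */
	pthread_mutex_unlock(&sb_lock);
}

int main(void)
{
	set_member_state(1);
	return 0;
}
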
@@ -1470,7 +1475,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
         * this device must be gone:
         */
        mutex_lock(&c->sb_lock);
-       mi = bch2_sb_get_members(c->disk_sb);
+       mi = bch2_sb_get_members(c->disk_sb.sb);
        memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
 
        bch2_write_super(c);
@@ -1492,8 +1497,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
        struct bch_sb_handle sb;
        const char *err;
        struct bch_dev *ca = NULL;
-       struct bch_sb_field_members *mi, *dev_mi;
-       struct bch_member saved_mi;
+       struct bch_sb_field_members *mi;
+       struct bch_member dev_mi;
        unsigned dev_idx, nr_devices, u64s;
        int ret;
 
@@ -1505,24 +1510,52 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
        if (err)
                return -EINVAL;
 
+       dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx];
+
        err = bch2_dev_may_add(sb.sb, c);
        if (err)
                return -EINVAL;
 
+       ca = __bch2_dev_alloc(c, &dev_mi);
+       if (!ca) {
+               bch2_free_super(&sb);
+               return -ENOMEM;
+       }
+
+       ret = __bch2_dev_attach_bdev(ca, &sb);
+       if (ret) {
+               bch2_dev_free(ca);
+               return ret;
+       }
+
+       err = "journal alloc failed";
+       ret = bch2_dev_journal_alloc(ca);
+       if (ret)
+               goto err;
+
        mutex_lock(&c->state_lock);
        mutex_lock(&c->sb_lock);
 
-       /* Grab member info for new disk: */
-       dev_mi = bch2_sb_get_members(sb.sb);
-       saved_mi = dev_mi->members[sb.sb->dev_idx];
-       saved_mi.last_mount = cpu_to_le64(ktime_get_seconds());
+       err = "insufficient space in new superblock";
+       ret = bch2_sb_from_fs(c, ca);
+       if (ret)
+               goto err_unlock;
+
+       mi = bch2_sb_get_members(ca->disk_sb.sb);
+
+       if (!bch2_sb_resize_members(&ca->disk_sb,
+                               le32_to_cpu(mi->field.u64s) +
+                               sizeof(dev_mi) / sizeof(u64))) {
+               ret = -ENOSPC;
+               goto err_unlock;
+       }
 
        if (dynamic_fault("bcachefs:add:no_slot"))
                goto no_slot;
 
-       mi = bch2_sb_get_members(c->disk_sb);
+       mi = bch2_sb_get_members(c->disk_sb.sb);
        for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
-               if (!bch2_dev_exists(c->disk_sb, mi, dev_idx))
+               if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx))
                        goto have_slot;
 no_slot:
        err = "no slots available in superblock";
@@ -1533,64 +1566,47 @@ have_slot:
        nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices);
        u64s = (sizeof(struct bch_sb_field_members) +
                sizeof(struct bch_member) * nr_devices) / sizeof(u64);
-       err = "no space in superblock for member info";
 
-       dev_mi = bch2_sb_resize_members(&sb, u64s);
-       if (!dev_mi)
-               goto err_unlock;
+       err = "no space in superblock for member info";
+       ret = -ENOSPC;
 
-       mi = bch2_fs_sb_resize_members(c, u64s);
+       mi = bch2_sb_resize_members(&c->disk_sb, u64s);
        if (!mi)
                goto err_unlock;
 
-       memcpy(dev_mi, mi, u64s * sizeof(u64));
-       dev_mi->members[dev_idx] = saved_mi;
+       /* success: */
 
-       sb.sb->uuid             = c->disk_sb->uuid;
-       sb.sb->dev_idx          = dev_idx;
-       sb.sb->nr_devices       = nr_devices;
+       mi->members[dev_idx] = dev_mi;
+       mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_seconds());
+       c->disk_sb.sb->nr_devices       = nr_devices;
 
-       /* commit new member info */
-       memcpy(mi, dev_mi, u64s * sizeof(u64));
-       c->disk_sb->nr_devices  = nr_devices;
-       c->sb.nr_devices        = nr_devices;
+       ca->disk_sb.sb->dev_idx = dev_idx;
+       bch2_dev_attach(c, ca, dev_idx);
 
        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
 
-       if (bch2_dev_alloc(c, dev_idx)) {
-               err = "cannot allocate memory";
-               ret = -ENOMEM;
-               goto err;
-       }
-
-       if (__bch2_dev_online(c, &sb)) {
-               err = "bch2_dev_online() error";
-               ret = -ENOMEM;
-               goto err;
-       }
-
-       ca = bch_dev_locked(c, dev_idx);
        if (ca->mi.state == BCH_MEMBER_STATE_RW) {
                err = __bch2_dev_read_write(c, ca);
                if (err)
-                       goto err;
-
-               err = "journal alloc failed";
-               if (bch2_dev_journal_alloc(c, ca))
-                       goto err;
+                       goto err_late;
        }
 
        mutex_unlock(&c->state_lock);
        return 0;
+
 err_unlock:
        mutex_unlock(&c->sb_lock);
-err:
        mutex_unlock(&c->state_lock);
+err:
+       if (ca)
+               bch2_dev_free(ca);
        bch2_free_super(&sb);
-
        bch_err(c, "Unable to add device: %s", err);
-       return ret ?: -EINVAL;
+       return ret;
+err_late:
+       bch_err(c, "Error going rw after adding device: %s", err);
+       return -EINVAL;
 }
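
Throughout bch2_dev_add(), variable-length superblock fields are sized in 64-bit words, header included, as in the u64s computation after have_slot above. A runnable sketch of that arithmetic with simplified stand-in layouts (the real structs differ):

#include <stdint.h>
#include <stdio.h>

struct member_sketch        { uint64_t data[7]; };
struct members_field_sketch { uint64_t header; /* members[] follow */ };

static unsigned members_field_u64s(unsigned nr_devices)
{
	return (sizeof(struct members_field_sketch) +
		sizeof(struct member_sketch) * nr_devices) / sizeof(uint64_t);
}

int main(void)
{
	printf("u64s for 3 devices: %u\n", members_field_u64s(3));
	return 0;
}
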
 
 /* Hot add existing device to running filesystem: */
@@ -1613,12 +1629,12 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
 
        dev_idx = sb.sb->dev_idx;
 
-       err = bch2_dev_in_fs(c->disk_sb, sb.sb);
+       err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
        if (err)
                goto err;
 
-       if (__bch2_dev_online(c, &sb)) {
-               err = "__bch2_dev_online() error";
+       if (bch2_dev_attach_bdev(c, &sb)) {
+               err = "bch2_dev_attach_bdev() error";
                goto err;
        }
 
@@ -1688,7 +1704,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
        }
 
        mutex_lock(&c->sb_lock);
-       mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx];
+       mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
        mi->nbuckets = cpu_to_le64(nbuckets);
 
        bch2_write_super(c);
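
As in the last_mount and nbuckets updates, every integer that lands in the superblock is stored little-endian, so cpu_to_le64()/le16_to_cpu() conversions bracket each access. A portable stand-in for the conversion; the kernel's helper compiles to a no-op on little-endian hosts:

#include <stdint.h>
#include <stdio.h>

/* Sketch of cpu_to_le64(): produce a value whose in-memory bytes are
 * least-significant first, whatever the host byte order. */
static uint64_t cpu_to_le64_sketch(uint64_t v)
{
	union { uint64_t u; uint8_t b[8]; } out = { 0 };

	for (int i = 0; i < 8; i++)
		out.b[i] = (uint8_t) (v >> (8 * i));	/* LSB first */
	return out.u;
}

int main(void)
{
	printf("%#llx\n",
	       (unsigned long long) cpu_to_le64_sketch(0x0102030405060708ULL));
	return 0;
}
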
@@ -1721,74 +1737,6 @@ found:
        return ca;
 }
 
-int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *label)
-{
-       struct bch_sb_field_disk_groups *groups;
-       struct bch_disk_group *g;
-       struct bch_member *mi;
-       unsigned i, v, nr_groups;
-       int ret;
-
-       if (strlen(label) > BCH_SB_LABEL_SIZE)
-               return -EINVAL;
-
-       mutex_lock(&c->sb_lock);
-       groups          = bch2_sb_get_disk_groups(c->disk_sb);
-       nr_groups       = disk_groups_nr(groups);
-
-       if (!strcmp(label, "none")) {
-               v = 0;
-               goto write_sb;
-       }
-
-       ret = __bch2_disk_group_find(groups, label);
-       if (ret >= 0) {
-               v = ret + 1;
-               goto write_sb;
-       }
-
-       /* not found - create a new disk group: */
-
-       for (i = 0;
-            i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]);
-            i++)
-               ;
-
-       if (i == nr_groups) {
-               unsigned u64s =
-                       (sizeof(struct bch_sb_field_disk_groups) +
-                        sizeof(struct bch_disk_group) * (nr_groups + 1)) /
-                       sizeof(u64);
-
-               groups = bch2_fs_sb_resize_disk_groups(c, u64s);
-               if (!groups) {
-                       mutex_unlock(&c->sb_lock);
-                       return -ENOSPC;
-               }
-
-               nr_groups = disk_groups_nr(groups);
-       }
-
-       BUG_ON(i >= nr_groups);
-
-       g = &groups->entries[i];
-       v = i + 1;
-
-       memcpy(g->label, label, strlen(label));
-       if (strlen(label) < sizeof(g->label))
-               g->label[strlen(label)] = '\0';
-       SET_BCH_GROUP_DELETED(g, 0);
-       SET_BCH_GROUP_DATA_ALLOWED(g, ~0);
-write_sb:
-       mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx];
-       SET_BCH_MEMBER_GROUP(mi, v);
-
-       bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-
-       return 0;
-}
-
 /* Filesystem open: */
 
 struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
@@ -1845,7 +1793,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
        err = "bch2_dev_online() error";
        mutex_lock(&c->state_lock);
        for (i = 0; i < nr_devices; i++)
-               if (__bch2_dev_online(c, &sb[i])) {
+               if (bch2_dev_attach_bdev(c, &sb[i])) {
                        mutex_unlock(&c->state_lock);
                        goto err_print;
                }
@@ -1856,15 +1804,10 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
                goto err_print;
 
        if (!c->opts.nostart) {
-               err = __bch2_fs_start(c);
+               err = bch2_fs_start(c);
                if (err)
                        goto err_print;
        }
-
-       err = bch2_fs_online(c);
-       if (err)
-               goto err_print;
-
 out:
        kfree(sb);
        module_put(THIS_MODULE);
@@ -1900,7 +1843,7 @@ static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb,
        if (c) {
                closure_get(&c->cl);
 
-               err = bch2_dev_in_fs(c->disk_sb, sb->sb);
+               err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb);
                if (err)
                        goto err;
        } else {
@@ -1915,22 +1858,18 @@ static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb,
        err = "bch2_dev_online() error";
 
        mutex_lock(&c->sb_lock);
-       if (__bch2_dev_online(c, sb)) {
+       if (bch2_dev_attach_bdev(c, sb)) {
                mutex_unlock(&c->sb_lock);
                goto err;
        }
        mutex_unlock(&c->sb_lock);
 
        if (!c->opts.nostart && bch2_fs_may_start(c)) {
-               err = __bch2_fs_start(c);
+               err = bch2_fs_start(c);
                if (err)
                        goto err;
        }
 
-       err = __bch2_fs_online(c);
-       if (err)
-               goto err;
-
        closure_put(&c->cl);
        mutex_unlock(&bch_fs_list_lock);
 
index 652a572ff329d73e478538bb891f743a861dcd9c..a52ee3bb37ee3e80d3e8e1f3677d5c0f8bed1071 100644 (file)
--- a/libbcachefs/super.h
+++ b/libbcachefs/super.h
@@ -195,7 +195,6 @@ int bch2_dev_online(struct bch_fs *, const char *);
 int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int);
 int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64);
 struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *);
-int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *);
 
 bool bch2_fs_emergency_read_only(struct bch_fs *);
 void bch2_fs_read_only(struct bch_fs *);
index f5468182e4851387aadc2806569052c466f27ef5..ab83ade959e42403e59ffcb1b2f30e4efaeb6f31 100644 (file)
--- a/libbcachefs/super_types.h
+++ b/libbcachefs/super_types.h
@@ -7,6 +7,9 @@ struct bch_sb_handle {
        struct bio              *bio;
        unsigned                page_order;
        fmode_t                 mode;
+       unsigned                have_layout:1;
+       unsigned                have_bio:1;
+       unsigned                fs_sb:1;
 };
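
These flags explain the mechanical c->disk_sb to c->disk_sb.sb changes across this commit: struct bch_fs now embeds a full bch_sb_handle (note bch2_sb_resize_members(&c->disk_sb, ...) in the super.c hunks), raw superblock access goes through the handle's .sb member, and the handle tracks ownership of the layout, bio, and buffer. A simplified model of the handle/raw split; the field set shown is illustrative:

#include <stdio.h>
#include <stdlib.h>

struct sb_sketch { unsigned version; };

struct sb_handle_sketch {
	struct sb_sketch *sb;		/* the raw superblock buffer */
	unsigned have_layout:1;
	unsigned have_bio:1;
	unsigned fs_sb:1;		/* handle embedded in the fs? */
};

int main(void)
{
	struct sb_handle_sketch h = {
		.sb	= malloc(sizeof(*h.sb)),
		.fs_sb	= 1,
	};

	if (!h.sb)
		return 1;
	h.sb->version = 1;	/* callers dereference handle.sb */
	printf("sb version %u (fs_sb=%u)\n", h.sb->version, h.fs_sb);
	free(h.sb);
	return 0;
}
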
 
 struct bch_devs_mask {
@@ -44,8 +47,9 @@ struct bch_replicas_cpu {
 };
 
 struct bch_disk_group_cpu {
-       struct bch_devs_mask            devs;
        bool                            deleted;
+       u16                             parent;
+       struct bch_devs_mask            devs;
 };
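
The new parent field, together with bch2_disk_path_print() used in the sysfs.c hunks below, suggests disk groups now form a tree addressed by dotted paths. A hypothetical sketch of what a parent link enables; the real encoding behind bch2_disk_path_print() may differ:

#include <stdio.h>

/* Hypothetical: each group points at its parent (0 = root), so a label
 * can be printed as a path by walking up the tree, ancestors first. */
struct group_sketch {
	const char	*label;
	unsigned	parent;		/* index + 1, 0 = none */
};

static void print_path(const struct group_sketch *groups, unsigned v)
{
	if (!v)
		return;
	print_path(groups, groups[v - 1].parent);
	printf("%s%s", groups[v - 1].parent ? "." : "", groups[v - 1].label);
}

int main(void)
{
	const struct group_sketch groups[] = {
		{ "ssd",  0 },		/* group 1 */
		{ "fast", 1 },		/* group 2, child of "ssd" */
	};

	print_path(groups, 2);		/* prints "ssd.fast" */
	printf("\n");
	return 0;
}
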
 
 struct bch_disk_groups_cpu {
index 82457348d062144e5bfc2d7ca7b7827fa5c133f0..e8089db9a1bd7da855f9ad6bc2bfe4d1b01c5744 100644 (file)
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
 #include "btree_update_interior.h"
 #include "btree_gc.h"
 #include "buckets.h"
+#include "disk_groups.h"
 #include "inode.h"
 #include "journal.h"
 #include "keylist.h"
 #include "move.h"
 #include "opts.h"
+#include "replicas.h"
 #include "super-io.h"
 #include "tier.h"
 
@@ -140,10 +142,10 @@ read_attribute(first_bucket);
 read_attribute(nbuckets);
 read_attribute(durability);
 read_attribute(iostats);
-read_attribute(read_priority_stats);
-read_attribute(write_priority_stats);
-read_attribute(fragmentation_stats);
-read_attribute(oldest_gen_stats);
+read_attribute(last_read_quantiles);
+read_attribute(last_write_quantiles);
+read_attribute(fragmentation_quantiles);
+read_attribute(oldest_gen_quantiles);
 read_attribute(reserve_stats);
 read_attribute(btree_cache_size);
 read_attribute(compression_stats);
@@ -167,7 +169,7 @@ rw_attribute(journal_reclaim_delay_ms);
 
 rw_attribute(discard);
 rw_attribute(cache_replacement_policy);
-rw_attribute(group);
+rw_attribute(label);
 
 rw_attribute(copy_gc_enabled);
 sysfs_pd_controller_attribute(copy_gc);
@@ -546,7 +548,7 @@ STORE(bch2_fs_opts_dir)
 
        if (opt->set_sb != SET_NO_SB_OPT) {
                mutex_lock(&c->sb_lock);
-               opt->set_sb(c->disk_sb, v);
+               opt->set_sb(c->disk_sb.sb, v);
                bch2_write_super(c);
                mutex_unlock(&c->sb_lock);
        }
@@ -621,36 +623,41 @@ struct attribute *bch2_fs_time_stats_files[] = {
        NULL
 };
 
-typedef unsigned (bucket_map_fn)(struct bch_dev *, size_t, void *);
+typedef unsigned (bucket_map_fn)(struct bch_fs *, struct bch_dev *,
+                                size_t, void *);
 
-static unsigned bucket_priority_fn(struct bch_dev *ca, size_t b,
-                                  void *private)
+static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca,
+                                 size_t b, void *private)
 {
-       struct bucket *g = bucket(ca, b);
        int rw = (private ? 1 : 0);
 
-       return ca->fs->prio_clock[rw].hand - g->prio[rw];
+       return bucket_last_io(c, bucket(ca, b), rw);
 }
 
-static unsigned bucket_sectors_used_fn(struct bch_dev *ca, size_t b,
-                                      void *private)
+static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
+                                      size_t b, void *private)
 {
        struct bucket *g = bucket(ca, b);
        return bucket_sectors_used(g->mark);
 }
 
-static unsigned bucket_oldest_gen_fn(struct bch_dev *ca, size_t b,
-                                    void *private)
+static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca,
+                                    size_t b, void *private)
 {
        return bucket_gc_gen(ca, b);
 }
 
-static ssize_t show_quantiles(struct bch_dev *ca, char *buf,
-                             bucket_map_fn *fn, void *private)
+static int unsigned_cmp(const void *_l, const void *_r)
 {
-       int cmp(const void *l, const void *r)
-       {       return *((unsigned *) r) - *((unsigned *) l); }
+       unsigned l = *((unsigned *) _l);
+       unsigned r = *((unsigned *) _r);
+
+       return (l > r) - (l < r);
+}
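
This replaces a GCC nested function with a file-scope comparator and fixes its arithmetic: the old return *r - *l sorted descending and can wrap for large unsigned values, while (l > r) - (l < r) is an overflow-safe ascending three-way compare. (Nested functions whose address escapes also need stack trampolines, which kernel code avoids.) A runnable demonstration of the new comparator with qsort():

#include <stdio.h>
#include <stdlib.h>

/* (l > r) - (l < r) avoids the overflow that subtraction-style
 * comparators hit with unsigned values, where the difference wraps
 * instead of going negative. */
static int unsigned_cmp(const void *_l, const void *_r)
{
	unsigned l = *(const unsigned *) _l;
	unsigned r = *(const unsigned *) _r;

	return (l > r) - (l < r);
}

int main(void)
{
	unsigned v[] = { 4000000000u, 7, 42 };

	qsort(v, 3, sizeof(v[0]), unsigned_cmp);
	printf("%u %u %u\n", v[0], v[1], v[2]);	/* 7 42 4000000000 */
	return 0;
}
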
 
+static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca,
+                             char *buf, bucket_map_fn *fn, void *private)
+{
        size_t i, n;
        /* Compute 31 quantiles */
        unsigned q[31], *p;
@@ -666,9 +673,9 @@ static ssize_t show_quantiles(struct bch_dev *ca, char *buf,
        }
 
        for (i = ca->mi.first_bucket; i < n; i++)
-               p[i] = fn(ca, i, private);
+               p[i] = fn(c, ca, i, private);
 
-       sort(p, n, sizeof(unsigned), cmp, NULL);
+       sort(p, n, sizeof(unsigned), unsigned_cmp, NULL);
        up_read(&ca->bucket_lock);
 
        while (n &&
@@ -804,24 +811,18 @@ SHOW(bch2_dev)
        sysfs_print(durability,         ca->mi.durability);
        sysfs_print(discard,            ca->mi.discard);
 
-       if (attr == &sysfs_group) {
-               struct bch_sb_field_disk_groups *groups;
-               struct bch_disk_group *g;
-               unsigned len;
-
-               if (!ca->mi.group)
-                       return scnprintf(out, end - out, "none\n");
-
-               mutex_lock(&c->sb_lock);
-               groups = bch2_sb_get_disk_groups(c->disk_sb);
-
-               g = &groups->entries[ca->mi.group - 1];
-               len = strnlen(g->label, sizeof(g->label));
-               memcpy(buf, g->label, len);
-               mutex_unlock(&c->sb_lock);
+       if (attr == &sysfs_label) {
+               if (ca->mi.group) {
+                       mutex_lock(&c->sb_lock);
+                       out += bch2_disk_path_print(&c->disk_sb, out, end - out,
+                                                   ca->mi.group - 1);
+                       mutex_unlock(&c->sb_lock);
+               } else {
+                       out += scnprintf(out, end - out, "none");
+               }
 
-               buf[len++] = '\n';
-               return len;
+               out += scnprintf(out, end - out, "\n");
+               return out - buf;
        }
 
        if (attr == &sysfs_has_data) {
@@ -852,14 +853,16 @@ SHOW(bch2_dev)
 
        if (attr == &sysfs_iostats)
                return show_dev_iostats(ca, buf);
-       if (attr == &sysfs_read_priority_stats)
-               return show_quantiles(ca, buf, bucket_priority_fn, (void *) 0);
-       if (attr == &sysfs_write_priority_stats)
-               return show_quantiles(ca, buf, bucket_priority_fn, (void *) 1);
-       if (attr == &sysfs_fragmentation_stats)
-               return show_quantiles(ca, buf, bucket_sectors_used_fn, NULL);
-       if (attr == &sysfs_oldest_gen_stats)
-               return show_quantiles(ca, buf, bucket_oldest_gen_fn, NULL);
+
+       if (attr == &sysfs_last_read_quantiles)
+               return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 0);
+       if (attr == &sysfs_last_write_quantiles)
+               return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 1);
+       if (attr == &sysfs_fragmentation_quantiles)
+               return show_quantiles(c, ca, buf, bucket_sectors_used_fn, NULL);
+       if (attr == &sysfs_oldest_gen_quantiles)
+               return show_quantiles(c, ca, buf, bucket_oldest_gen_fn, NULL);
+
        if (attr == &sysfs_reserve_stats)
                return show_reserve_stats(ca, buf);
        if (attr == &sysfs_alloc_debug)
@@ -880,7 +883,7 @@ STORE(bch2_dev)
                bool v = strtoul_or_return(buf);
 
                mutex_lock(&c->sb_lock);
-               mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx];
+               mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
 
                if (v != BCH_MEMBER_DISCARD(mi)) {
                        SET_BCH_MEMBER_DISCARD(mi, v);
@@ -896,7 +899,7 @@ STORE(bch2_dev)
                        return v;
 
                mutex_lock(&c->sb_lock);
-               mi = &bch2_sb_get_members(c->disk_sb)->members[ca->dev_idx];
+               mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx];
 
                if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) {
                        SET_BCH_MEMBER_REPLACEMENT(mi, v);
@@ -905,7 +908,7 @@ STORE(bch2_dev)
                mutex_unlock(&c->sb_lock);
        }
 
-       if (attr == &sysfs_group) {
+       if (attr == &sysfs_label) {
                char *tmp;
                int ret;
 
@@ -938,16 +941,16 @@ struct attribute *bch2_dev_files[] = {
        &sysfs_discard,
        &sysfs_cache_replacement_policy,
        &sysfs_state_rw,
-       &sysfs_group,
+       &sysfs_label,
 
        &sysfs_has_data,
        &sysfs_iostats,
 
        /* alloc info - other stats: */
-       &sysfs_read_priority_stats,
-       &sysfs_write_priority_stats,
-       &sysfs_fragmentation_stats,
-       &sysfs_oldest_gen_stats,
+       &sysfs_last_read_quantiles,
+       &sysfs_last_write_quantiles,
+       &sysfs_fragmentation_quantiles,
+       &sysfs_oldest_gen_quantiles,
        &sysfs_reserve_stats,
 
        /* debug: */
index 211a844c69cf91c009d7ea757e7d61fa9388b64b..a15a0fa9dff967d87d347fa57665f73cb8b8bbd3 100644 (file)
--- a/libbcachefs/tier.c
+++ b/libbcachefs/tier.c
@@ -4,6 +4,7 @@
 #include "btree_iter.h"
 #include "buckets.h"
 #include "clock.h"
+#include "disk_groups.h"
 #include "extents.h"
 #include "io.h"
 #include "move.h"
index 81e942e5039cb6372086166270361baa4d21f043..79a98f757cc950a940c58b1ea373c524706eb8a8 100644 (file)
--- a/libbcachefs/xattr.c
+++ b/libbcachefs/xattr.c
@@ -86,8 +86,7 @@ const struct bch_hash_desc bch2_xattr_hash_desc = {
        .cmp_bkey       = xattr_cmp_bkey,
 };
 
-static const char *bch2_xattr_invalid(const struct bch_fs *c,
-                                    struct bkey_s_c k)
+const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
        const struct xattr_handler *handler;
        struct bkey_s_c_xattr xattr;
@@ -126,8 +125,8 @@ static const char *bch2_xattr_invalid(const struct bch_fs *c,
        }
 }
 
-static void bch2_xattr_to_text(struct bch_fs *c, char *buf,
-                             size_t size, struct bkey_s_c k)
+void bch2_xattr_to_text(struct bch_fs *c, char *buf,
+                       size_t size, struct bkey_s_c k)
 {
        const struct xattr_handler *handler;
        struct bkey_s_c_xattr xattr;
@@ -159,11 +158,6 @@ static void bch2_xattr_to_text(struct bch_fs *c, char *buf,
        }
 }
 
-const struct bkey_ops bch2_bkey_xattr_ops = {
-       .key_invalid    = bch2_xattr_invalid,
-       .val_to_text    = bch2_xattr_to_text,
-};
-
 int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode,
                  const char *name, void *buffer, size_t size, int type)
 {
index 9c815a2d7f52164368f28d5fe17bf96363e467f4..a58e7e30342181b0442392879d6f33cc6eb5a808 100644 (file)
--- a/libbcachefs/xattr.h
+++ b/libbcachefs/xattr.h
@@ -4,7 +4,14 @@
 #include "str_hash.h"
 
 extern const struct bch_hash_desc bch2_xattr_hash_desc;
-extern const struct bkey_ops bch2_bkey_xattr_ops;
+
+const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_xattr_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+
+#define bch2_bkey_xattr_ops (struct bkey_ops) {                \
+       .key_invalid    = bch2_xattr_invalid,           \
+       .val_to_text    = bch2_xattr_to_text,           \
+}
 
 struct dentry;
 struct xattr_handler;