cmd_migrate
author     Kent Overstreet <kent.overstreet@gmail.com>
           Wed, 1 Mar 2017 10:45:15 +0000 (01:45 -0900)
committer  Kent Overstreet <kent.overstreet@gmail.com>
           Thu, 9 Mar 2017 18:14:11 +0000 (09:14 -0900)
61 files changed:
.bcache_revision
Makefile
bcache.c
cmd_debug.c
cmd_device.c
cmd_format.c
cmd_fsck.c
cmd_key.c
cmd_migrate.c [new file with mode: 0644]
cmd_run.c
cmds.h
crypto.c
crypto.h
include/linux/bcache.h
include/linux/blkdev.h
include/linux/generic-radix-tree.h
include/linux/percpu-refcount.h
libbcache.c
libbcache.h
libbcache/alloc.c
libbcache/alloc.h
libbcache/alloc_types.h
libbcache/bcache.h
libbcache/blockdev.c
libbcache/btree_cache.c
libbcache/btree_cache.h
libbcache/btree_gc.c
libbcache/buckets.c
libbcache/buckets.h
libbcache/buckets_types.h
libbcache/chardev.c
libbcache/checksum.c
libbcache/checksum.h
libbcache/compress.c
libbcache/compress.h
libbcache/debug.c
libbcache/debug.h
libbcache/error.c
libbcache/extents.c
libbcache/fs-gc.c
libbcache/fs.c
libbcache/fs.h
libbcache/io.c
libbcache/journal.c
libbcache/journal.h
libbcache/movinggc.c
libbcache/movinggc.h
libbcache/opts.h
libbcache/super-io.c
libbcache/super-io.h
libbcache/super.c
libbcache/super.h
libbcache/super_types.h
libbcache/sysfs.c
libbcache/tier.c
libbcache/tier.h
linux/blkdev.c
qcow2.c
qcow2.h
tools-util.c
tools-util.h

index 58bdf2da79c6587a2a425102a4bd66e8c1fa2d18..b86381a15eea3bdf84f1a2d809d94419961d9544 100644 (file)
--- a/.bcache_revision
+++ b/.bcache_revision
@@ -1 +1 @@
-BCACHE_REVISION=aa4471ac314a1f117957f9fc59c1bfbdf965a28c
+BCACHE_REVISION=c1f1a9e1d9b9664db9c9c03cbac455c2750335bc
index 2defed048e45ff97f6266e49fbaae51cc5aee2a5..682bf8e7fa2a5b73dd09adc30faafef86c73b12c 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -56,6 +56,7 @@ OBJS=bcache.o                 \
      cmd_fsck.o                        \
      cmd_format.o              \
      cmd_key.o                 \
+     cmd_migrate.o             \
      cmd_run.o                 \
      crypto.o                  \
      libbcache.o               \
index ac9eb07edf934187fafb004ee250e462c8adcfec..a0fa860fe9cc1f62669add1e30a5d12cb72a7399 100644 (file)
--- a/bcache.c
+++ b/bcache.c
@@ -50,7 +50,12 @@ static void usage(void)
             "\n"
             "Debug:\n"
             "  bcache dump    Dump filesystem metadata to a qcow2 image\n"
-            "  bcache list    List filesystem metadata in textual form\n");
+            "  bcache list    List filesystem metadata in textual form\n"
+            "\n"
+            "Migrate:\n"
+            "  bcache migrate Migrate an existing filesystem to bcachefs, in place\n"
+            "  bcache migrate_superblock\n"
+            "                 Add default superblock, after bcache migrate\n");
 }
 
 int main(int argc, char *argv[])
@@ -104,6 +109,11 @@ int main(int argc, char *argv[])
        if (!strcmp(cmd, "list"))
                return cmd_list(argc, argv);
 
+       if (!strcmp(cmd, "migrate"))
+               return cmd_migrate(argc, argv);
+       if (!strcmp(cmd, "migrate_superblock"))
+               return cmd_migrate_superblock(argc, argv);
+
        usage();
        return 0;
 }
index 4f2586d40d5b27b687be9cadcb11ba8cc8046233..ca0f4530a4b5109fd2855fa36435a05ae26ef456 100644 (file)
--- a/cmd_debug.c
+++ b/cmd_debug.c
@@ -30,35 +30,35 @@ static void dump_usage(void)
 static void dump_one_device(struct cache_set *c, struct cache *ca, int fd)
 {
        struct bch_sb *sb = ca->disk_sb.sb;
-       sparse_data data;
+       ranges data;
        unsigned i;
 
        darray_init(data);
 
        /* Superblock: */
-       data_add(&data, BCH_SB_LAYOUT_SECTOR << 9,
-                sizeof(struct bch_sb_layout));
+       range_add(&data, BCH_SB_LAYOUT_SECTOR << 9,
+                 sizeof(struct bch_sb_layout));
 
        for (i = 0; i < sb->layout.nr_superblocks; i++)
-               data_add(&data,
-                        le64_to_cpu(sb->layout.sb_offset[i]) << 9,
-                        vstruct_bytes(sb));
+               range_add(&data,
+                         le64_to_cpu(sb->layout.sb_offset[i]) << 9,
+                         vstruct_bytes(sb));
 
        /* Journal: */
        for (i = 0; i < ca->journal.nr; i++)
                if (ca->journal.bucket_seq[i] >= c->journal.last_seq_ondisk) {
                        u64 bucket = ca->journal.buckets[i];
 
-                       data_add(&data,
-                                bucket_bytes(ca) * bucket,
-                                bucket_bytes(ca));
+                       range_add(&data,
+                                 bucket_bytes(ca) * bucket,
+                                 bucket_bytes(ca));
                }
 
        /* Prios/gens: */
        for (i = 0; i < prio_buckets(ca); i++)
-               data_add(&data,
-                        bucket_bytes(ca) * ca->prio_last_buckets[i],
-                        bucket_bytes(ca));
+               range_add(&data,
+                         bucket_bytes(ca) * ca->prio_last_buckets[i],
+                         bucket_bytes(ca));
 
        /* Btree: */
        for (i = 0; i < BTREE_ID_NR; i++) {
@@ -71,9 +71,9 @@ static void dump_one_device(struct cache_set *c, struct cache *ca, int fd)
 
                        extent_for_each_ptr(e, ptr)
                                if (ptr->dev == ca->dev_idx)
-                                       data_add(&data,
-                                                ptr->offset << 9,
-                                                b->written << 9);
+                                       range_add(&data,
+                                                 ptr->offset << 9,
+                                                 b->written << 9);
                }
                bch_btree_iter_unlock(&iter);
        }
@@ -87,7 +87,7 @@ int cmd_dump(int argc, char *argv[])
        struct bch_opts opts = bch_opts_empty();
        struct cache_set *c = NULL;
        const char *err;
-       char *out = NULL, *buf;
+       char *out = NULL;
        unsigned i, nr_devices = 0;
        bool force = false;
        int fd, opt;
@@ -116,9 +116,6 @@ int cmd_dump(int argc, char *argv[])
        if (!out)
                die("Please supply output filename");
 
-       buf = alloca(strlen(out) + 10);
-       strcpy(buf, out);
-
        err = bch_fs_open(argv + optind, argc - optind, opts, &c);
        if (err)
                die("error opening %s: %s", argv[optind], err);
@@ -140,12 +137,11 @@ int cmd_dump(int argc, char *argv[])
                if (!c->cache[i])
                        continue;
 
-               if (nr_devices > 1)
-                       sprintf(buf, "%s.%u", out, i);
-
-               fd = open(buf, mode, 0600);
-               if (fd < 0)
-                       die("error opening %s: %s", buf, strerror(errno));
+               char *path = nr_devices > 1
+                       ? mprintf("%s.%u", out, i)
+                       : strdup(out);
+               fd = xopen(path, mode, 0600);
+               free(path);
 
                dump_one_device(c, c->cache[i], fd);
                close(fd);
@@ -153,7 +149,7 @@ int cmd_dump(int argc, char *argv[])
 
        up_read(&c->gc_lock);
 
-       bch_fs_stop_sync(c);
+       bch_fs_stop(c);
        return 0;
 }
 
@@ -213,14 +209,20 @@ static void list_keys_usage(void)
             "Usage: bcache list_keys [OPTION]... <devices>\n"
             "\n"
             "Options:\n"
-            "  -b btree_id   Integer btree id to list\n"
-            "  -s start      Start pos (as inode:offset)\n"
-            "  -e end        End pos\n"
-            "  -m mode       Mode for listing\n"
-            "  -h            Display this help and exit\n"
+            "  -b (extents|inodes|dirents|xattrs)    Btree to list from\n"
+            "  -s inode:offset                       Start position to list from\n"
+            "  -e inode:offset                       End position\n"
+            "  -m (keys|formats)                     List mode\n"
+            "  -h                                    Display this help and exit\n"
             "Report bugs to <linux-bcache@vger.kernel.org>");
 }
 
+static const char * const list_modes[] = {
+       "keys",
+       "formats",
+       NULL
+};
+
 int cmd_list(int argc, char *argv[])
 {
        struct bch_opts opts = bch_opts_empty();
@@ -229,7 +231,6 @@ int cmd_list(int argc, char *argv[])
        struct bpos start = POS_MIN, end = POS_MAX;
        const char *err;
        int mode = 0, opt;
-       u64 v;
 
        opts.nochanges  = true;
        opts.norecovery = true;
@@ -239,10 +240,8 @@ int cmd_list(int argc, char *argv[])
        while ((opt = getopt(argc, argv, "b:s:e:m:h")) != -1)
                switch (opt) {
                case 'b':
-                       if (kstrtoull(optarg, 10, &v) ||
-                           v >= BTREE_ID_NR)
-                               die("invalid btree id");
-                       btree_id = v;
+                       btree_id = read_string_list_or_die(optarg,
+                                               bch_btree_ids, "btree id");
                        break;
                case 's':
                        start   = parse_pos(optarg);
@@ -251,6 +250,8 @@ int cmd_list(int argc, char *argv[])
                        end     = parse_pos(optarg);
                        break;
                case 'm':
+                       mode = read_string_list_or_die(optarg,
+                                               list_modes, "list mode");
                        break;
                case 'h':
                        list_keys_usage();
@@ -275,6 +276,6 @@ int cmd_list(int argc, char *argv[])
                die("Invalid mode");
        }
 
-       bch_fs_stop_sync(c);
+       bch_fs_stop(c);
        return 0;
 }
index 1c5208af103f32d1c6fc38baf01ddffdf4f106fb..505fedc4b7f4e92ce8e68cb438d7f6538a237043 100644 (file)
--- a/cmd_device.c
+++ b/cmd_device.c
@@ -121,10 +121,7 @@ int cmd_device_show(int argc, char *argv[])
 
                char *dev_name = basename(dirname(link));
 
-               int fd = openat(dirfd(fs.sysfs), entry->d_name, O_RDONLY);
-               if (fd < 0)
-                       die("couldn't open device %s: %s\n",
-                           entry->d_name, strerror(errno));
+               int fd = xopenat(dirfd(fs.sysfs), entry->d_name, O_RDONLY);
 
                devices[nr_devices] = fill_dev(strdup(dev_name), nr, fd);
                tiers[devices[nr_devices].tier]++;
index 2b1453eeba9ed6d87b1476affc4da117a53f911d..f222a8b795974a84c140c552ddd8b72f684e4205 100644 (file)
--- a/cmd_format.c
+++ b/cmd_format.c
@@ -34,10 +34,8 @@ static int open_for_format(const char *dev, bool force)
        blkid_probe pr;
        const char *fs_type = NULL, *fs_label = NULL;
        size_t fs_type_len, fs_label_len;
-       int fd;
 
-       if ((fd = open(dev, O_RDWR|O_EXCL)) == -1)
-               die("Can't open dev %s: %s\n", dev, strerror(errno));
+       int fd = xopen(dev, O_RDWR|O_EXCL);
 
        if (force)
                return fd;
@@ -70,8 +68,41 @@ static int open_for_format(const char *dev, bool force)
        return fd;
 }
 
+#define OPTS                                                                   \
+t("bcache format - create a new bcache filesystem on one or more devices")     \
+t("Usage: bcache format [OPTION]... <devices>")                                        \
+t("")                                                                          \
+x('b', block_size,             "size",                 NULL)                   \
+x(0,   btree_node_size,        "size",                 "Default 256k")         \
+x(0,   metadata_checksum_type, "(none|crc32c|crc64)",  NULL)                   \
+x(0,   data_checksum_type,     "(none|crc32c|crc64)",  NULL)                   \
+x(0,   compression_type,       "(none|lz4|gzip)",      NULL)                   \
+x(0,   encrypted,              NULL,                   "Enable whole filesystem encryption (chacha20/poly1305)")\
+x(0,   no_passphrase,          NULL,                   "Don't encrypt master encryption key")\
+x('e', error_action,           "(continue|readonly|panic)", NULL)              \
+x(0,   max_journal_entry_size, "size",                 NULL)                   \
+x('L', label,                  "label",                NULL)                   \
+x('U', uuid,                   "uuid",                 NULL)                   \
+x('f', force,                  NULL,                   NULL)                   \
+t("")                                                                          \
+t("Device specific options:")                                                  \
+x(0,   fs_size,                "size",                 "Size of filesystem on device")\
+x(0,   bucket_size,            "size",                 "Bucket size")          \
+x('t', tier,                   "#",                    "Higher tier indicates slower devices")\
+x(0,   discard,                NULL,                   NULL)                   \
+t("Device specific options must come before corresponding devices, e.g.")      \
+t("  bcache format --tier 0 /dev/sdb --tier 1 /dev/sdc")                       \
+t("")                                                                          \
+x('h', help,                   NULL,                   "display this help and exit")
+
 static void usage(void)
 {
+#define t(text)                                puts(text "\n")
+#define x(shortopt, longopt, arg, help) do {                           \
+       OPTS
+#undef x
+#undef t
+
        puts("bcache format - create a new bcache filesystem on one or more devices\n"
             "Usage: bcache format [OPTION]... <devices>\n"
             "\n"
@@ -81,7 +112,8 @@ static void usage(void)
             "      --metadata_checksum_type=(none|crc32c|crc64)\n"
             "      --data_checksum_type=(none|crc32c|crc64)\n"
             "      --compression_type=(none|lz4|gzip)\n"
-            "      --encrypted\n"
+            "      --encrypted             Enable whole filesystem encryption (chacha20/poly1305)\n"
+            "      --no_passphrase         Don't encrypt master encryption key\n"
             "      --error_action=(continue|readonly|panic)\n"
             "                              Action to take on filesystem error\n"
             "      --max_journal_entry_size=size\n"
@@ -103,37 +135,26 @@ static void usage(void)
             "Report bugs to <linux-bcache@vger.kernel.org>");
 }
 
-#define OPTS                                                           \
-       OPT('b',        block_size,             required_argument)      \
-       OPT(0,          btree_node_size,        required_argument)      \
-       OPT(0,          metadata_checksum_type, required_argument)      \
-       OPT(0,          data_checksum_type,     required_argument)      \
-       OPT(0,          compression_type,       required_argument)      \
-       OPT(0,          encrypted,              no_argument)            \
-       OPT('e',        error_action,           required_argument)      \
-       OPT(0,          max_journal_entry_size, required_argument)      \
-       OPT('L',        label,                  required_argument)      \
-       OPT('U',        uuid,                   required_argument)      \
-       OPT('f',        force,                  no_argument)            \
-       OPT(0,          fs_size,                required_argument)      \
-       OPT(0,          bucket_size,            required_argument)      \
-       OPT('t',        tier,                   required_argument)      \
-       OPT(0,          discard,                no_argument)            \
-       OPT('h',        help,                   no_argument)
-
 enum {
        Opt_no_opt = 1,
-#define OPT(shortopt, longopt, has_arg)        Opt_##longopt,
+#define t(text)
+#define x(shortopt, longopt, arg, help)        Opt_##longopt,
        OPTS
-#undef OPT
+#undef x
+#undef t
 };
 
 static const struct option format_opts[] = {
-#define OPT(shortopt, longopt, has_arg)        {                               \
-               #longopt,  has_arg, NULL, Opt_##longopt                 \
-       },
+#define t(text)
+#define x(shortopt, longopt, arg, help)        {                               \
+       .name           = #longopt,                                     \
+       .has_arg        = arg ? required_argument : no_argument,        \
+       .flag           = NULL,                                         \
+       .val            = Opt_##longopt,                                \
+},
        OPTS
-#undef OPT
+#undef x
+#undef t
        { NULL }
 };
 
@@ -161,29 +182,12 @@ static unsigned hatoi_validate(const char *s, const char *msg)
 int cmd_format(int argc, char *argv[])
 {
        darray(struct dev_opts) devices;
-       struct dev_opts *dev;
-       unsigned block_size = 0;
-       unsigned btree_node_size = 0;
-       unsigned meta_csum_type = BCH_CSUM_CRC32C;
-       unsigned data_csum_type = BCH_CSUM_CRC32C;
-       unsigned compression_type = BCH_COMPRESSION_NONE;
-       bool encrypted = false;
-       unsigned on_error_action = BCH_ON_ERROR_RO;
-       char *label = NULL;
-       uuid_le uuid;
-       bool force = false;
-
-       /* Device specific options: */
-       u64 filesystem_size = 0;
-       unsigned bucket_size = 0;
-       unsigned tier = 0;
-       bool discard = false;
-       unsigned max_journal_entry_size = 0;
-       char *passphrase = NULL;
+       struct format_opts opts = format_opts_default();
+       struct dev_opts dev_opts = { 0 }, *dev;
+       bool force = false, no_passphrase = false;
        int opt;
 
        darray_init(devices);
-       uuid_clear(uuid.b);
 
        while ((opt = getopt_long(argc, argv,
                                  "-b:e:L:U:ft:h",
@@ -192,45 +196,52 @@ int cmd_format(int argc, char *argv[])
                switch (opt) {
                case Opt_block_size:
                case 'b':
-                       block_size = hatoi_validate(optarg,
-                                               "block size");
+                       opts.block_size =
+                               hatoi_validate(optarg, "block size");
                        break;
                case Opt_btree_node_size:
-                       btree_node_size = hatoi_validate(optarg,
-                                               "btree node size");
+                       opts.btree_node_size =
+                               hatoi_validate(optarg, "btree node size");
                        break;
                case Opt_metadata_checksum_type:
-                       meta_csum_type = read_string_list_or_die(optarg,
+                       opts.meta_csum_type =
+                               read_string_list_or_die(optarg,
                                                bch_csum_types, "checksum type");
                        break;
                case Opt_data_checksum_type:
-                       data_csum_type = read_string_list_or_die(optarg,
+                       opts.data_csum_type =
+                               read_string_list_or_die(optarg,
                                                bch_csum_types, "checksum type");
                        break;
                case Opt_compression_type:
-                       compression_type = read_string_list_or_die(optarg,
+                       opts.compression_type =
+                               read_string_list_or_die(optarg,
                                                bch_compression_types,
                                                "compression type");
                        break;
                case Opt_encrypted:
-                       encrypted = true;
+                       opts.encrypted = true;
+                       break;
+               case Opt_no_passphrase:
+                       no_passphrase = true;
                        break;
                case Opt_error_action:
                case 'e':
-                       on_error_action = read_string_list_or_die(optarg,
+                       opts.on_error_action =
+                               read_string_list_or_die(optarg,
                                                bch_error_actions, "error action");
                        break;
                case Opt_max_journal_entry_size:
-                       max_journal_entry_size = hatoi_validate(optarg,
-                                               "journal entry size");
+                       opts.max_journal_entry_size =
+                               hatoi_validate(optarg, "journal entry size");
                        break;
                case Opt_label:
                case 'L':
-                       label = strdup(optarg);
+                       opts.label = strdup(optarg);
                        break;
                case Opt_uuid:
                case 'U':
-                       if (uuid_parse(optarg, uuid.b))
+                       if (uuid_parse(optarg, opts.uuid.b))
                                die("Bad uuid");
                        break;
                case Opt_force:
@@ -238,31 +249,28 @@ int cmd_format(int argc, char *argv[])
                        force = true;
                        break;
                case Opt_fs_size:
-                       if (bch_strtoull_h(optarg, &filesystem_size))
+                       if (bch_strtoull_h(optarg, &dev_opts.size))
                                die("invalid filesystem size");
 
-                       filesystem_size >>= 9;
+                       dev_opts.size >>= 9;
                        break;
                case Opt_bucket_size:
-                       bucket_size = hatoi_validate(optarg, "bucket size");
+                       dev_opts.bucket_size =
+                               hatoi_validate(optarg, "bucket size");
                        break;
                case Opt_tier:
                case 't':
-                       if (kstrtouint(optarg, 10, &tier) ||
-                           tier >= BCH_TIER_MAX)
+                       if (kstrtouint(optarg, 10, &dev_opts.tier) ||
+                           dev_opts.tier >= BCH_TIER_MAX)
                                die("invalid tier");
                        break;
                case Opt_discard:
-                       discard = true;
+                       dev_opts.discard = true;
                        break;
                case Opt_no_opt:
-                       darray_append(devices, (struct dev_opts) {
-                               .path                   = strdup(optarg),
-                               .size                   = filesystem_size,
-                               .bucket_size            = bucket_size,
-                               .tier                   = tier,
-                               .discard                = discard,
-                       });
+                       dev_opts.path = strdup(optarg);
+                       darray_append(devices, dev_opts);
+                       dev_opts.size = 0;
                        break;
                case Opt_help:
                case 'h':
@@ -274,18 +282,16 @@ int cmd_format(int argc, char *argv[])
        if (!darray_size(devices))
                die("Please supply a device");
 
-       if (uuid_is_null(uuid.b))
-               uuid_generate(uuid.b);
-
-       if (encrypted) {
-               passphrase = read_passphrase("Enter passphrase: ");
+       if (opts.encrypted && !no_passphrase) {
+               opts.passphrase = read_passphrase("Enter passphrase: ");
 
                if (isatty(STDIN_FILENO)) {
                        char *pass2 =
                                read_passphrase("Enter same passphrase again: ");
 
-                       if (strcmp(passphrase, pass2)) {
-                               memzero_explicit(passphrase, strlen(passphrase));
+                       if (strcmp(opts.passphrase, pass2)) {
+                               memzero_explicit(opts.passphrase,
+                                                strlen(opts.passphrase));
                                memzero_explicit(pass2, strlen(pass2));
                                die("Passphrases do not match");
                        }
@@ -298,23 +304,14 @@ int cmd_format(int argc, char *argv[])
        darray_foreach(dev, devices)
                dev->fd = open_for_format(dev->path, force);
 
-       bcache_format(devices.item, darray_size(devices),
-                     block_size,
-                     btree_node_size,
-                     meta_csum_type,
-                     data_csum_type,
-                     compression_type,
-                     passphrase,
-                     1,
-                     1,
-                     on_error_action,
-                     max_journal_entry_size,
-                     label,
-                     uuid);
-
-       if (passphrase) {
-               memzero_explicit(passphrase, strlen(passphrase));
-               free(passphrase);
+       struct bch_sb *sb =
+               bcache_format(opts, devices.item, darray_size(devices));
+       bcache_super_print(sb, HUMAN_READABLE);
+       free(sb);
+
+       if (opts.passphrase) {
+               memzero_explicit(opts.passphrase, strlen(opts.passphrase));
+               free(opts.passphrase);
        }
 
        return 0;
index a8c8dc53111b33fff410f7090cd576912fad2980..6af566926df1ccd514159b684ce3ce849f448984 100644 (file)
--- a/cmd_fsck.c
+++ b/cmd_fsck.c
@@ -56,6 +56,6 @@ int cmd_fsck(int argc, char *argv[])
        if (err)
                die("error opening %s: %s", argv[optind], err);
 
-       bch_fs_stop_sync(c);
+       bch_fs_stop(c);
        return 0;
 }
index 587ecbe383c0623f2d572f0a9d02c6be3a90f204..654ad774101972058542432d94929460cff4f37b 100644 (file)
--- a/cmd_key.c
+++ b/cmd_key.c
@@ -1,6 +1,5 @@
 #include <errno.h>
 #include <unistd.h>
-#include <keyutils.h>
 #include <uuid/uuid.h>
 
 #include "cmds.h"
 
 int cmd_unlock(int argc, char *argv[])
 {
-       struct bch_encrypted_key sb_key;
-       struct bch_key passphrase_key;
        struct bch_sb *sb;
-       struct bch_sb_field_crypt *crypt;
        char *passphrase;
-       char uuid[40];
-       char description[60];
 
        if (argc != 2)
                die("please supply a single device");
 
        sb = bcache_super_read(argv[1]);
 
-       crypt = bch_sb_get_crypt(sb);
-       if (!crypt)
-               die("filesystem is not encrypted");
-
-       sb_key = crypt->key;
-
-       if (!bch_key_is_encrypted(&sb_key))
-               die("filesystem does not have encryption key");
-
        passphrase = read_passphrase("Enter passphrase: ");
-       derive_passphrase(crypt, &passphrase_key, passphrase);
-
-       /* Check if the user supplied the correct passphrase: */
-       if (bch_chacha_encrypt_key(&passphrase_key, __bch_sb_key_nonce(sb),
-                                  &sb_key, sizeof(sb_key)))
-               die("error encrypting key");
-
-       if (bch_key_is_encrypted(&sb_key))
-               die("incorrect passphrase");
-
-       uuid_unparse_lower(sb->user_uuid.b, uuid);
-       sprintf(description, "bcache:%s", uuid);
 
-       if (add_key("logon", description,
-                   &passphrase_key, sizeof(passphrase_key),
-                   KEY_SPEC_USER_KEYRING) < 0 ||
-           add_key("user", description,
-                   &passphrase_key, sizeof(passphrase_key),
-                   KEY_SPEC_USER_KEYRING) < 0)
-               die("add_key error: %s", strerror(errno));
+       add_bcache_key(sb, passphrase);
 
-       memzero_explicit(&sb_key, sizeof(sb_key));
-       memzero_explicit(&passphrase_key, sizeof(passphrase_key));
        memzero_explicit(passphrase, strlen(passphrase));
        free(passphrase);
        return 0;
diff --git a/cmd_migrate.c b/cmd_migrate.c
new file mode 100644 (file)
index 0000000..9a02cb9
--- /dev/null
+++ b/cmd_migrate.c
@@ -0,0 +1,835 @@
+#include </usr/include/dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/vfs.h>
+#include <unistd.h>
+#include <attr/xattr.h>
+
+#include <linux/fiemap.h>
+#include <linux/fs.h>
+#include <linux/stat.h>
+
+#include <uuid/uuid.h>
+
+#include "cmds.h"
+#include "crypto.h"
+#include "libbcache.h"
+#include "linux/bcache.h"
+
+#include <linux/dcache.h>
+#include <linux/generic-radix-tree.h>
+#include <linux/xattr.h>
+#include "btree_update.h"
+#include "buckets.h"
+#include "dirent.h"
+#include "fs.h"
+#include "inode.h"
+#include "io.h"
+#include "str_hash.h"
+#include "super.h"
+#include "xattr.h"
+
+static char *dev_t_to_path(dev_t dev)
+{
+       char link[PATH_MAX], *p;
+       int ret;
+
+       char *sysfs_dev = mprintf("/sys/dev/block/%u:%u",
+                                 major(dev), minor(dev));
+       ret = readlink(sysfs_dev, link, sizeof(link));
+       free(sysfs_dev);
+
+       if (ret < 0 || ret >= sizeof(link))
+               die("readlink error while looking up block device: %s", strerror(errno));
+
+       link[ret] = '\0';
+
+       p = strrchr(link, '/');
+       if (!p)
+               die("error looking up device name");
+       p++;
+
+       return mprintf("/dev/%s", p);
+}
+
+static bool path_is_fs_root(char *path)
+{
+       char *line = NULL, *p, *mount;
+       size_t n = 0;
+       FILE *f;
+       bool ret = true;
+
+       f = fopen("/proc/self/mountinfo", "r");
+       if (!f)
+               die("Error getting mount information");
+
+       while (getline(&line, &n, f) != -1) {
+               p = line;
+
+               strsep(&p, " "); /* mount id */
+               strsep(&p, " "); /* parent id */
+               strsep(&p, " "); /* dev */
+               strsep(&p, " "); /* root */
+               mount = strsep(&p, " ");
+               strsep(&p, " ");
+
+               if (mount && !strcmp(path, mount))
+                       goto found;
+       }
+
+       ret = false;
+found:
+       fclose(f);
+       free(line);
+       return ret;
+}
+
+static void mark_unreserved_space(struct cache_set *c, ranges extents)
+{
+       struct cache *ca = c->cache[0];
+       struct hole_iter iter;
+       struct range i;
+
+       for_each_hole(iter, extents, bucket_to_sector(ca, ca->mi.nbuckets) << 9, i) {
+               struct bucket_mark new;
+               u64 b;
+
+               if (i.start == i.end)
+                       return;
+
+               b = sector_to_bucket(ca, i.start >> 9);
+               do {
+                       bucket_cmpxchg(&ca->buckets[b], new, new.nouse = 1);
+                       b++;
+               } while (bucket_to_sector(ca, b) << 9 < i.end);
+       }
+}
+
+static void update_inode(struct cache_set *c,
+                        struct bch_inode_unpacked *inode)
+{
+       struct bkey_inode_buf packed;
+       int ret;
+
+       bch_inode_pack(&packed, inode);
+       ret = bch_btree_update(c, BTREE_ID_INODES, &packed.inode.k_i, NULL);
+       if (ret)
+               die("error creating file: %s", strerror(-ret));
+}
+
+static void create_dirent(struct cache_set *c,
+                         struct bch_inode_unpacked *parent,
+                         const char *name, u64 inum, mode_t mode)
+{
+       struct bch_hash_info parent_hash_info = bch_hash_info_init(parent);
+       struct qstr qname = { { { .len = strlen(name), } }, .name = name };
+
+       int ret = bch_dirent_create(c, parent->inum, &parent_hash_info,
+                                   mode_to_type(mode), &qname,
+                                   inum, NULL, BCH_HASH_SET_MUST_CREATE);
+       if (ret)
+               die("error creating file: %s", strerror(-ret));
+
+       if (S_ISDIR(mode))
+               parent->i_nlink++;
+}
+
+static void create_link(struct cache_set *c,
+                       struct bch_inode_unpacked *parent,
+                       const char *name, u64 inum, mode_t mode)
+{
+       struct bch_inode_unpacked inode;
+       int ret = bch_inode_find_by_inum(c, inum, &inode);
+       if (ret)
+               die("error looking up hardlink: %s", strerror(-ret));
+
+       inode.i_nlink++;
+       update_inode(c, &inode);
+
+       create_dirent(c, parent, name, inum, mode);
+}
+
+static struct bch_inode_unpacked create_file(struct cache_set *c,
+                                            struct bch_inode_unpacked *parent,
+                                            const char *name,
+                                            uid_t uid, gid_t gid,
+                                            mode_t mode, dev_t rdev)
+{
+       struct bch_inode_unpacked new_inode;
+       struct bkey_inode_buf packed;
+       int ret;
+
+       bch_inode_init(c, &new_inode, uid, gid, mode, rdev);
+       bch_inode_pack(&packed, &new_inode);
+
+       ret = bch_inode_create(c, &packed.inode.k_i, BLOCKDEV_INODE_MAX, 0,
+                              &c->unused_inode_hint);
+       if (ret)
+               die("error creating file: %s", strerror(-ret));
+
+       new_inode.inum = packed.inode.k.p.inode;
+       create_dirent(c, parent, name, new_inode.inum, mode);
+
+       return new_inode;
+}
+
+#define for_each_xattr_handler(handlers, handler)              \
+       if (handlers)                                           \
+               for ((handler) = *(handlers)++;                 \
+                       (handler) != NULL;                      \
+                       (handler) = *(handlers)++)
+
+static const struct xattr_handler *xattr_resolve_name(const char **name)
+{
+       const struct xattr_handler **handlers = bch_xattr_handlers;
+       const struct xattr_handler *handler;
+
+       for_each_xattr_handler(handlers, handler) {
+               const char *n;
+
+               n = strcmp_prefix(*name, xattr_prefix(handler));
+               if (n) {
+                       if (!handler->prefix ^ !*n) {
+                               if (*n)
+                                       continue;
+                               return ERR_PTR(-EINVAL);
+                       }
+                       *name = n;
+                       return handler;
+               }
+       }
+       return ERR_PTR(-EOPNOTSUPP);
+}
+
+static void copy_times(struct cache_set *c, struct bch_inode_unpacked *dst,
+                      struct stat *src)
+{
+       dst->i_atime = timespec_to_bch_time(c, src->st_atim);
+       dst->i_mtime = timespec_to_bch_time(c, src->st_mtim);
+       dst->i_ctime = timespec_to_bch_time(c, src->st_ctim);
+}
+
+static void copy_xattrs(struct cache_set *c, struct bch_inode_unpacked *dst,
+                       char *src)
+{
+       struct bch_hash_info hash_info = bch_hash_info_init(dst);
+       ssize_t size = llistxattr(src, NULL, 0);
+       if (size < 0)
+               die("listxattr error: %s", strerror(errno));
+
+       if (!size)
+               return;
+
+       char *buf = malloc(size);
+       size = llistxattr(src, buf, size);
+       if (size < 0)
+               die("listxattr error: %s", strerror(errno));
+
+       for (const char *next, *attr = buf;
+            attr <= buf + size;
+            attr = next) {
+               next = attr + strlen(attr) + 1;
+
+               /* max possible xattr val: */
+               static char val[64 << 10];
+               ssize_t val_size = lgetxattr(src, attr, val, sizeof(val));
+
+               if (val_size < 0)
+                       die("error getting xattr val: %s", strerror(errno));
+
+               const struct xattr_handler *h = xattr_resolve_name(&attr);
+
+               int ret = __bch_xattr_set(c, dst->inum, &hash_info, attr,
+                                         val, val_size, 0, h->flags, NULL);
+               if (ret < 0)
+                       die("error creating xattr: %s", strerror(-ret));
+       }
+
+       free(buf);
+}
+
+static void write_data(struct cache_set *c,
+                      struct bch_inode_unpacked *dst_inode,
+                      u64 dst_offset, void *buf, size_t len)
+{
+       struct disk_reservation res;
+       struct bch_write_op op;
+       struct bch_write_bio bio;
+       struct bio_vec bv;
+       struct closure cl;
+
+       BUG_ON(dst_offset       & (block_bytes(c) - 1));
+       BUG_ON(len              & (block_bytes(c) - 1));
+
+       closure_init_stack(&cl);
+
+       bio_init(&bio.bio);
+       bio.bio.bi_max_vecs     = 1;
+       bio.bio.bi_io_vec       = &bv;
+       bio.bio.bi_iter.bi_size = len;
+       bch_bio_map(&bio.bio, buf);
+
+       int ret = bch_disk_reservation_get(c, &res, len >> 9, 0);
+       if (ret)
+               die("error reserving space in new filesystem: %s", strerror(-ret));
+
+       bch_write_op_init(&op, c, &bio, res, c->write_points,
+                         POS(dst_inode->inum, dst_offset >> 9), NULL, 0);
+       closure_call(&op.cl, bch_write, NULL, &cl);
+       closure_sync(&cl);
+
+       dst_inode->i_sectors += len >> 9;
+}
+
+static char buf[1 << 20] __aligned(PAGE_SIZE);
+
+static void copy_data(struct cache_set *c,
+                     struct bch_inode_unpacked *dst_inode,
+                     int src_fd, u64 start, u64 end)
+{
+       while (start < end) {
+               unsigned len = min_t(u64, end - start, sizeof(buf));
+
+               xpread(src_fd, buf, len, start);
+               write_data(c, dst_inode, start, buf, len);
+               start += len;
+       }
+}
+
+static void link_data(struct cache_set *c, struct bch_inode_unpacked *dst,
+                     u64 logical, u64 physical, u64 length)
+{
+       struct cache *ca = c->cache[0];
+
+       BUG_ON(logical  & (block_bytes(c) - 1));
+       BUG_ON(physical & (block_bytes(c) - 1));
+       BUG_ON(length   & (block_bytes(c) - 1));
+
+       logical         >>= 9;
+       physical        >>= 9;
+       length          >>= 9;
+
+       BUG_ON(physical + length > bucket_to_sector(ca, ca->mi.nbuckets));
+
+       while (length) {
+               struct bkey_i_extent *e;
+               BKEY_PADDED(k) k;
+               u64 b = sector_to_bucket(ca, physical >> 9);
+               struct disk_reservation res;
+               unsigned sectors;
+               int ret;
+
+               sectors = min(ca->mi.bucket_size -
+                             (physical & (ca->mi.bucket_size - 1)),
+                             length);
+
+               e = bkey_extent_init(&k.k);
+               e->k.p.inode    = dst->inum;
+               e->k.p.offset   = logical + sectors;
+               e->k.size       = sectors;
+               extent_ptr_append(e, (struct bch_extent_ptr) {
+                                       .offset = physical,
+                                       .dev = 0,
+                                       .gen = ca->buckets[b].mark.gen,
+                                 });
+
+               ret = bch_disk_reservation_get(c, &res, sectors,
+                                              BCH_DISK_RESERVATION_NOFAIL);
+               if (ret)
+                       die("error reserving space in new filesystem: %s",
+                           strerror(-ret));
+
+               ret = bch_btree_insert(c, BTREE_ID_EXTENTS, &e->k_i,
+                                      &res, NULL, NULL, 0);
+               if (ret)
+                       die("btree insert error %s", strerror(-ret));
+
+               bch_disk_reservation_put(c, &res);
+
+               dst->i_sectors  += sectors;
+               logical         += sectors;
+               physical        += sectors;
+               length          -= sectors;
+       }
+}
+
+static void copy_link(struct cache_set *c, struct bch_inode_unpacked *dst,
+                     char *src)
+{
+       ssize_t ret = readlink(src, buf, sizeof(buf));
+       if (ret < 0)
+               die("readlink error: %s", strerror(errno));
+
+       write_data(c, dst, 0, buf, round_up(ret, block_bytes(c)));
+}
+
+static void copy_file(struct cache_set *c, struct bch_inode_unpacked *dst,
+                     int src, char *src_path, ranges *extents)
+{
+       struct fiemap_iter iter;
+       struct fiemap_extent e;
+
+       fiemap_for_each(src, iter, e)
+               if (e.fe_flags & FIEMAP_EXTENT_UNKNOWN) {
+                       fsync(src);
+                       break;
+               }
+
+       fiemap_for_each(src, iter, e) {
+               if ((e.fe_logical       & (block_bytes(c) - 1)) ||
+                   (e.fe_length        & (block_bytes(c) - 1)))
+                       die("Unaligned extent in %s - can't handle", src_path);
+
+               if (e.fe_flags & (FIEMAP_EXTENT_UNKNOWN|
+                                 FIEMAP_EXTENT_ENCODED|
+                                 FIEMAP_EXTENT_NOT_ALIGNED|
+                                 FIEMAP_EXTENT_DATA_INLINE)) {
+                       copy_data(c, dst,
+                                 src,
+                                 round_down(e.fe_logical, block_bytes(c)),
+                                 round_up(e.fe_logical + e.fe_length,
+                                          block_bytes(c)));
+                       continue;
+               }
+
+               if ((e.fe_physical      & (block_bytes(c) - 1)))
+                       die("Unaligned extent in %s - can't handle", src_path);
+
+               range_add(extents, e.fe_physical, e.fe_length);
+               link_data(c, dst, e.fe_logical, e.fe_physical, e.fe_length);
+       }
+}
+
+struct copy_fs_state {
+       u64                     bcachefs_inum;
+       dev_t                   dev;
+
+       GENRADIX(u64)           hardlinks;
+       ranges                  extents;
+};
+
+static void copy_dir(struct copy_fs_state *s,
+                    struct cache_set *c,
+                    struct bch_inode_unpacked *dst,
+                    int src_fd, const char *src_path)
+{
+       DIR *dir = fdopendir(src_fd);
+       struct dirent *d;
+
+       while ((errno = 0), (d = readdir(dir))) {
+               struct bch_inode_unpacked inode;
+               int fd;
+
+               if (fchdir(src_fd))
+                       die("chdir error: %s", strerror(errno));
+
+               struct stat stat =
+                       xfstatat(src_fd, d->d_name, AT_SYMLINK_NOFOLLOW);
+
+               if (!strcmp(d->d_name, ".") ||
+                   !strcmp(d->d_name, "..") ||
+                   stat.st_ino == s->bcachefs_inum)
+                       continue;
+
+               char *child_path = mprintf("%s/%s", src_path, d->d_name);
+
+               if (stat.st_dev != s->dev)
+                       die("%s does not have correct st_dev!", child_path);
+
+               u64 *dst_inum = S_ISREG(stat.st_mode)
+                       ? genradix_ptr_alloc(&s->hardlinks, stat.st_ino, GFP_KERNEL)
+                       : NULL;
+
+               if (dst_inum && *dst_inum) {
+                       create_link(c, dst, d->d_name, *dst_inum, S_IFREG);
+                       goto next;
+               }
+
+               inode = create_file(c, dst, d->d_name,
+                                   stat.st_uid, stat.st_gid,
+                                   stat.st_mode, stat.st_rdev);
+
+               if (dst_inum)
+                       *dst_inum = inode.inum;
+
+               copy_times(c, &inode, &stat);
+               copy_xattrs(c, &inode, d->d_name);
+
+               /* copy xattrs */
+
+               switch (mode_to_type(stat.st_mode)) {
+               case DT_DIR:
+                       fd = xopen(d->d_name, O_RDONLY|O_NOATIME);
+                       copy_dir(s, c, &inode, fd, child_path);
+                       close(fd);
+                       break;
+               case DT_REG:
+                       inode.i_size = stat.st_size;
+
+                       fd = xopen(d->d_name, O_RDONLY|O_NOATIME);
+                       copy_file(c, &inode, fd, child_path, &s->extents);
+                       close(fd);
+                       break;
+               case DT_LNK:
+                       inode.i_size = stat.st_size;
+
+                       copy_link(c, &inode, d->d_name);
+                       break;
+               case DT_FIFO:
+               case DT_CHR:
+               case DT_BLK:
+               case DT_SOCK:
+               case DT_WHT:
+                       /* nothing else to copy for these: */
+                       break;
+               default:
+                       BUG();
+               }
+
+               update_inode(c, &inode);
+next:
+               free(child_path);
+       }
+
+       if (errno)
+               die("readdir error: %s", strerror(errno));
+}
+
+static ranges reserve_new_fs_space(const char *file_path, unsigned block_size,
+                                  u64 size, u64 *bcachefs_inum, dev_t dev)
+{
+       int fd = open(file_path, O_RDWR|O_CREAT|O_EXCL, 0600);
+       if (fd < 0)
+               die("Error creating %s for bcachefs metadata: %s",
+                   file_path, strerror(errno));
+
+       struct stat statbuf = xfstat(fd);
+
+       if (statbuf.st_dev != dev)
+               die("bcachefs file has incorrect device");
+
+       *bcachefs_inum = statbuf.st_ino;
+
+       if (fallocate(fd, 0, 0, size))
+               die("Error reserving space for bcachefs metadata: %s",
+                   strerror(errno));
+
+       fsync(fd);
+
+       struct fiemap_iter iter;
+       struct fiemap_extent e;
+       ranges extents = { NULL };
+
+       fiemap_for_each(fd, iter, e) {
+               if (e.fe_flags & (FIEMAP_EXTENT_UNKNOWN|
+                                 FIEMAP_EXTENT_ENCODED|
+                                 FIEMAP_EXTENT_NOT_ALIGNED|
+                                 FIEMAP_EXTENT_DATA_INLINE))
+                       die("Unable to continue: metadata file not fully mapped");
+
+               if ((e.fe_physical      & (block_size - 1)) ||
+                   (e.fe_length        & (block_size - 1)))
+                       die("Unable to continue: unaligned extents in metadata file");
+
+               range_add(&extents, e.fe_physical, e.fe_length);
+       }
+       close(fd);
+
+       ranges_sort_merge(&extents);
+       return extents;
+}
+
+static void reserve_old_fs_space(struct cache_set *c,
+                                struct bch_inode_unpacked *root_inode,
+                                ranges *extents)
+{
+       struct cache *ca = c->cache[0];
+       struct bch_inode_unpacked dst;
+       struct hole_iter iter;
+       struct range i;
+
+       dst = create_file(c, root_inode, "old_migrated_filesystem",
+                         0, 0, S_IFREG|0400, 0);
+       dst.i_size = bucket_to_sector(ca, ca->mi.nbuckets) << 9;
+
+       ranges_sort_merge(extents);
+
+       for_each_hole(iter, *extents, bucket_to_sector(ca, ca->mi.nbuckets) << 9, i)
+               link_data(c, &dst, i.start, i.start, i.end - i.start);
+
+       update_inode(c, &dst);
+}
+
+static void copy_fs(struct cache_set *c, int src_fd, const char *src_path,
+                   u64 bcachefs_inum, ranges *extents)
+{
+       syncfs(src_fd);
+
+       struct bch_inode_unpacked root_inode;
+       int ret = bch_inode_find_by_inum(c, BCACHE_ROOT_INO, &root_inode);
+       if (ret)
+               die("error looking up root directory: %s", strerror(-ret));
+
+       if (fchdir(src_fd))
+               die("chdir error: %s", strerror(errno));
+
+       struct stat stat = xfstat(src_fd);
+       copy_times(c, &root_inode, &stat);
+       copy_xattrs(c, &root_inode, ".");
+
+       struct copy_fs_state s = {
+               .bcachefs_inum  = bcachefs_inum,
+               .dev            = stat.st_dev,
+               .extents        = *extents,
+       };
+
+       /* now, copy: */
+       copy_dir(&s, c, &root_inode, src_fd, src_path);
+
+       reserve_old_fs_space(c, &root_inode, &s.extents);
+
+       update_inode(c, &root_inode);
+
+       darray_free(s.extents);
+       genradix_free(&s.hardlinks);
+}
+
+static void find_superblock_space(ranges extents, struct dev_opts *dev)
+{
+       struct range *i;
+       darray_foreach(i, extents) {
+               u64 offset = max(256ULL << 10, i->start);
+
+               if (offset + (128 << 10) <= i->end) {
+                       dev->sb_offset  = offset >> 9;
+                       dev->sb_end     = dev->sb_offset + 256;
+                       return;
+               }
+       }
+
+       die("Couldn't find a valid location for superblock");
+}
+
+static void migrate_usage(void)
+{
+       puts("bcache migrate - migrate an existing filesystem to bcachefs\n"
+            "Usage: bcache migrate [OPTION]...\n"
+            "\n"
+            "Options:\n"
+            "  -f fs                  Root of filesystem to migrate\n"
+            "      --encrypted        Enable whole filesystem encryption (chacha20/poly1305)\n"
+            "      --no_passphrase    Don't encrypt master encryption key\n"
+            "  -h                     Display this help and exit\n"
+            "Report bugs to <linux-bcache@vger.kernel.org>");
+}
+
+static const struct option migrate_opts[] = {
+       { "encrypted",          no_argument, NULL, 'e' },
+       { "no_passphrase",      no_argument, NULL, 'p' },
+       { NULL }
+};
+
+int cmd_migrate(int argc, char *argv[])
+{
+       struct format_opts format_opts = format_opts_default();
+       char *fs_path = NULL;
+       unsigned block_size;
+       bool no_passphrase = false;
+       int opt;
+
+       while ((opt = getopt_long(argc, argv, "f:h",
+                                 migrate_opts, NULL)) != -1)
+               switch (opt) {
+               case 'f':
+                       fs_path = optarg;
+                       break;
+               case 'e':
+                       format_opts.encrypted = true;
+                       break;
+               case 'p':
+                       no_passphrase = true;
+                       break;
+               case 'h':
+                       migrate_usage();
+                       exit(EXIT_SUCCESS);
+               }
+
+       if (!fs_path)
+               die("Please specify a filesystem to migrate");
+
+       if (!path_is_fs_root(fs_path))
+               die("%s is not a filesystem root", fs_path);
+
+       int fs_fd = xopen(fs_path, O_RDONLY|O_NOATIME);
+       struct stat stat = xfstat(fs_fd);
+
+       if (!S_ISDIR(stat.st_mode))
+               die("%s is not a directory", fs_path);
+
+       struct dev_opts dev = { 0 };
+
+       dev.path = dev_t_to_path(stat.st_dev);
+       dev.fd = xopen(dev.path, O_RDWR);
+
+       block_size = min_t(unsigned, stat.st_blksize,
+                          get_blocksize(dev.path, dev.fd) << 9);
+
+       BUG_ON(!is_power_of_2(block_size) || block_size < 512);
+       format_opts.block_size = block_size >> 9;
+
+       u64 bcachefs_inum;
+       char *file_path = mprintf("%s/bcachefs", fs_path);
+
+       ranges extents = reserve_new_fs_space(file_path,
+                               block_size, get_size(dev.path, dev.fd) / 5,
+                               &bcachefs_inum, stat.st_dev);
+
+       find_superblock_space(extents, &dev);
+
+       if (format_opts.encrypted && !no_passphrase) {
+               format_opts.passphrase = read_passphrase("Enter passphrase: ");
+
+               if (isatty(STDIN_FILENO)) {
+                       char *pass2 =
+                               read_passphrase("Enter same passphrase again: ");
+
+                       if (strcmp(format_opts.passphrase, pass2)) {
+                               memzero_explicit(format_opts.passphrase,
+                                                strlen(format_opts.passphrase));
+                               memzero_explicit(pass2, strlen(pass2));
+                               die("Passphrases do not match");
+                       }
+
+                       memzero_explicit(pass2, strlen(pass2));
+                       free(pass2);
+               }
+       }
+
+       struct bch_sb *sb = bcache_format(format_opts, &dev, 1);
+       u64 sb_offset = le64_to_cpu(sb->layout.sb_offset[0]);
+
+       if (format_opts.passphrase)
+               add_bcache_key(sb, format_opts.passphrase);
+
+       free(sb);
+
+       printf("Creating new filesystem on %s in space reserved at %s\n"
+              "To mount, run\n"
+              "  mount -t bcache -o sb=%llu %s dir\n"
+              "\n"
+              "After verifying that the new filesystem is correct, to create a\n"
+              "superblock at the default offset and finish the migration run\n"
+              "  bcache migrate_superblock -d %s -o %llu\n"
+              "\n"
+              "The new filesystem will have a file at /old_migrated_filesystem\n"
+              "referencing all disk space that might be used by the existing\n"
+              "filesystem. That file can be deleted once the old filesystem is\n"
+              "no longer needed (and should be deleted prior to running\n"
+              "bcache migrate_superblock)\n",
+              dev.path, file_path, sb_offset, dev.path,
+              dev.path, sb_offset);
+
+       struct bch_opts opts = bch_opts_empty();
+       struct cache_set *c = NULL;
+       char *path[1] = { dev.path };
+       const char *err;
+
+       opts.sb         = sb_offset;
+       opts.nostart    = true;
+       opts.noexcl     = true;
+
+       err = bch_fs_open(path, 1, opts, &c);
+       if (err)
+               die("Error opening new filesystem: %s", err);
+
+       mark_unreserved_space(c, extents);
+
+       err = bch_fs_start(c);
+       if (err)
+               die("Error starting new filesystem: %s", err);
+
+       copy_fs(c, fs_fd, fs_path, bcachefs_inum, &extents);
+
+       bch_fs_stop(c);
+
+       printf("Migrate complete, running fsck:\n");
+       opts.nostart    = false;
+       opts.nochanges  = true;
+       fsck_err_opt    = FSCK_ERR_NO;
+
+       err = bch_fs_open(path, 1, opts, &c);
+       if (err)
+               die("Error opening new filesystem: %s", err);
+
+       bch_fs_stop(c);
+       printf("fsck complete\n");
+       return 0;
+}
+
+static void migrate_superblock_usage(void)
+{
+       puts("bcache migrate_superblock - create default superblock after migrating\n"
+            "Usage: bcache migrate_superblock [OPTION]...\n"
+            "\n"
+            "Options:\n"
+            "  -d device     Device to create superblock for\n"
+            "  -o offset     Offset of existing superblock\n"
+            "  -h            Display this help and exit\n"
+            "Report bugs to <linux-bcache@vger.kernel.org>");
+}
+
+int cmd_migrate_superblock(int argc, char *argv[])
+{
+       char *dev = NULL;
+       u64 offset = 0;
+       int opt, ret;
+
+       while ((opt = getopt(argc, argv, "d:o:h")) != -1)
+               switch (opt) {
+                       case 'd':
+                               dev = optarg;
+                               break;
+                       case 'o':
+                               ret = kstrtou64(optarg, 10, &offset);
+                               if (ret)
+                                       die("Invalid offset");
+                               break;
+                       case 'h':
+                               migrate_superblock_usage();
+                               exit(EXIT_SUCCESS);
+               }
+
+       if (!dev)
+               die("Please specify a device");
+
+       if (!offset)
+               die("Please specify offset of existing superblock");
+
+       int fd = xopen(dev, O_RDWR);
+       struct bch_sb *sb = __bcache_super_read(fd, offset);
+
+       if (sb->layout.nr_superblocks >= ARRAY_SIZE(sb->layout.sb_offset))
+               die("Can't add superblock: no space left in superblock layout");
+
+       for (unsigned i = 0; i < sb->layout.nr_superblocks; i++)
+               if (le64_to_cpu(sb->layout.sb_offset[i]) == BCH_SB_SECTOR)
+                       die("Superblock layout already has default superblock");
+
+       memmove(&sb->layout.sb_offset[1],
+               &sb->layout.sb_offset[0],
+               sb->layout.nr_superblocks * sizeof(u64));
+       sb->layout.nr_superblocks++;
+
+       sb->layout.sb_offset[0] = cpu_to_le64(BCH_SB_SECTOR);
+
+       bcache_super_write(fd, sb);
+       close(fd);
+
+       return 0;
+}
index 74f324802f8a0f842cb5106f834e48d69268e806..6fb1c4f9f9c013df2782baea6f07958aed9dd98a 100644 (file)
--- a/cmd_run.c
+++ b/cmd_run.c
@@ -25,9 +25,6 @@ int cmd_stop(int argc, char *argv[])
                die("Please supply a filesystem");
 
        struct bcache_handle fs = bcache_fs_open(argv[1]);
-
-       if (ioctl(fs.ioctl_fd, BCH_IOCTL_STOP))
-               die("BCH_IOCTL_STOP error: %s", strerror(errno));
-
+       xioctl(fs.ioctl_fd, BCH_IOCTL_STOP);
        return 0;
 }
diff --git a/cmds.h b/cmds.h
index 946acfda0da327e18577b3f6170e22d3d17b68c9..120e83f9f7a8bd433c109d26769c2b6fa2e3bde0 100644 (file)
--- a/cmds.h
+++ b/cmds.h
@@ -29,4 +29,7 @@ int cmd_fsck(int argc, char *argv[]);
 int cmd_dump(int argc, char *argv[]);
 int cmd_list(int argc, char *argv[]);
 
+int cmd_migrate(int argc, char *argv[]);
+int cmd_migrate_superblock(int argc, char *argv[]);
+
 #endif /* _CMDS_H */
index 86da70a17921aa4c9c05e8d494f520130880680c..f38a359da23431d03f4ef4efbf946acc4bcca26d 100644 (file)
--- a/crypto.c
+++ b/crypto.c
 #include <time.h>
 #include <unistd.h>
 
+#include <keyutils.h>
 #include <linux/random.h>
 #include <libscrypt.h>
+#include <uuid/uuid.h>
 
 #include "checksum.h"
 #include "crypto.h"
@@ -75,29 +77,71 @@ void derive_passphrase(struct bch_sb_field_crypt *crypt,
        }
 }
 
-void bch_sb_crypt_init(struct bch_sb *sb,
-                      struct bch_sb_field_crypt *crypt,
-                      const char *passphrase)
+void add_bcache_key(struct bch_sb *sb, const char *passphrase)
 {
-       struct bch_key passphrase_key;
+       struct bch_sb_field_crypt *crypt = bch_sb_get_crypt(sb);
+       if (!crypt)
+               die("filesystem is not encrypted");
 
-       SET_BCH_CRYPT_KDF_TYPE(crypt, BCH_KDF_SCRYPT);
-       SET_BCH_KDF_SCRYPT_N(crypt, ilog2(SCRYPT_N));
-       SET_BCH_KDF_SCRYPT_R(crypt, ilog2(SCRYPT_r));
-       SET_BCH_KDF_SCRYPT_P(crypt, ilog2(SCRYPT_p));
+       struct bch_encrypted_key sb_key = crypt->key;
+       if (!bch_key_is_encrypted(&sb_key))
+               die("filesystem does not have encryption key");
 
+       struct bch_key passphrase_key;
        derive_passphrase(crypt, &passphrase_key, passphrase);
 
+       /* Check if the user supplied the correct passphrase: */
+       if (bch_chacha_encrypt_key(&passphrase_key, __bch_sb_key_nonce(sb),
+                                  &sb_key, sizeof(sb_key)))
+               die("error encrypting key");
+
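+       /*
+        * chacha is a stream cipher, so the "encrypt" above decrypts sb_key in
+        * place; with the right passphrase the recovered key's magic matches
+        * and bch_key_is_encrypted() returns false:
+        */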
+       if (bch_key_is_encrypted(&sb_key))
+               die("incorrect passphrase");
+
+       char uuid[40];
+       uuid_unparse_lower(sb->user_uuid.b, uuid);
+
+       char *description = mprintf("bcache:%s", uuid);
+
+       if (add_key("logon", description,
+                   &passphrase_key, sizeof(passphrase_key),
+                   KEY_SPEC_USER_KEYRING) < 0 ||
+           add_key("user", description,
+                   &passphrase_key, sizeof(passphrase_key),
+                   KEY_SPEC_USER_KEYRING) < 0)
+               die("add_key error: %s", strerror(errno));
+
+       memzero_explicit(description, strlen(description));
+       free(description);
+       memzero_explicit(&passphrase_key, sizeof(passphrase_key));
+       memzero_explicit(&sb_key, sizeof(sb_key));
+}
+
+void bch_sb_crypt_init(struct bch_sb *sb,
+                      struct bch_sb_field_crypt *crypt,
+                      const char *passphrase)
+{
        crypt->key.magic = BCH_KEY_MAGIC;
        get_random_bytes(&crypt->key.key, sizeof(crypt->key.key));
 
-       assert(!bch_key_is_encrypted(&crypt->key));
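+       /*
+        * Without a passphrase the key is left unencrypted in the superblock;
+        * only wrap it with the passphrase-derived key when one was supplied:
+        */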
+       if (passphrase) {
+               struct bch_key passphrase_key;
 
-       if (bch_chacha_encrypt_key(&passphrase_key, __bch_sb_key_nonce(sb),
-                                  &crypt->key, sizeof(crypt->key)))
-               die("error encrypting key");
+               SET_BCH_CRYPT_KDF_TYPE(crypt, BCH_KDF_SCRYPT);
+               SET_BCH_KDF_SCRYPT_N(crypt, ilog2(SCRYPT_N));
+               SET_BCH_KDF_SCRYPT_R(crypt, ilog2(SCRYPT_r));
+               SET_BCH_KDF_SCRYPT_P(crypt, ilog2(SCRYPT_p));
 
-       assert(bch_key_is_encrypted(&crypt->key));
+               derive_passphrase(crypt, &passphrase_key, passphrase);
 
-       memzero_explicit(&passphrase_key, sizeof(passphrase_key));
+               assert(!bch_key_is_encrypted(&crypt->key));
+
+               if (bch_chacha_encrypt_key(&passphrase_key, __bch_sb_key_nonce(sb),
+                                          &crypt->key, sizeof(crypt->key)))
+                       die("error encrypting key");
+
+               assert(bch_key_is_encrypted(&crypt->key));
+
+               memzero_explicit(&passphrase_key, sizeof(passphrase_key));
+       }
 }
index 643073ebd166c8bd4a5cb076aafa8b77c1252a57..91a8b9fc9733fe1985562fc090c6e0942e505612 100644 (file)
--- a/crypto.h
+++ b/crypto.h
@@ -1,12 +1,16 @@
 #ifndef _CRYPTO_H
 #define _CRYPTO_H
 
-#include "super-io.h"
 #include "tools-util.h"
 
+struct bch_sb;
+struct bch_sb_field_crypt;
+struct bch_key;
+
 char *read_passphrase(const char *);
 void derive_passphrase(struct bch_sb_field_crypt *,
                       struct bch_key *, const char *);
+void add_bcache_key(struct bch_sb *, const char *);
 void bch_sb_crypt_init(struct bch_sb *sb, struct bch_sb_field_crypt *,
                       const char *);
 
index dbb02742c47bf99168b621a12d04b7eaca4d65b4..d70e2e32449e0414dca2ae24e5972ca212c68f05 100644 (file)
@@ -821,7 +821,7 @@ struct bch_sb_field {
        __le32                  type;
 };
 
-enum bch_sb_field_types {
+enum bch_sb_field_type {
        BCH_SB_FIELD_journal    = 0,
        BCH_SB_FIELD_members    = 1,
        BCH_SB_FIELD_crypt      = 2,
index 3c185945a1b7af47b554512092fc94af1ea7c12f..217ff09440ccb4b909ce10e6eac46b7342cef516 100644 (file)
@@ -110,6 +110,7 @@ struct super_block {
  * NOTE! These match bits 12..15 of stat.st_mode
  * (ie "(i_mode >> 12) & 15").
  */
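+/* these may already be provided by <dirent.h> in userspace: */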
+#ifndef DT_UNKNOWN
 #define DT_UNKNOWN     0
 #define DT_FIFO                1
 #define DT_CHR         2
@@ -119,6 +120,7 @@ struct super_block {
 #define DT_LNK         10
 #define DT_SOCK                12
 #define DT_WHT         14
+#endif
 
 /*
  * This is the "filldir" function type, used by readdir() to let
index 1a951e97c76c401cf3057cbcc491e522c6d636b4..6ea2deb2c8dc5e3f3ec7074f53862304b68865aa 100644 (file)
@@ -8,7 +8,6 @@
  * interior nodes.
  */
 
-#include <linux/page.h>
 #include <linux/bug.h>
 #include <linux/kernel.h>
 #include <linux/log2.h>
@@ -41,20 +40,14 @@ struct __genradix {
  * genradix.
  */
 
-#define DECLARE_GENRADIX_TYPE(_name, _type)                    \
-struct _name {                                                 \
-       struct __genradix       tree;                           \
-       _type                   type[0] __aligned(1);           \
-}
-
-#define DECLARE_GENRADIX(_name, _type)                         \
+#define GENRADIX(_type)                                                \
 struct {                                                       \
        struct __genradix       tree;                           \
        _type                   type[0] __aligned(1);           \
-} _name
+}
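+/* e.g. GENRADIX(struct foo) foo_radix; declares a radix tree of struct foo */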
 
 #define DEFINE_GENRADIX(_name, _type)                          \
-       DECLARE_GENRADIX(_name, _type) = __GENRADIX_INITIALIZER
+       GENRADIX(_type) _name = __GENRADIX_INITIALIZER
 
 #define genradix_init(_radix)                                  \
 do {                                                           \
index 5a98618862d1a05ffbf5a8295f36c151e591728b..2bbd0979ea3cb0e5edc926d33c16e40f67c98d12 100644 (file)
@@ -180,4 +180,9 @@ static inline bool percpu_ref_is_zero(struct percpu_ref *ref)
        return !atomic_long_read(&ref->count);
 }
 
+static inline bool percpu_ref_is_dying(struct percpu_ref *ref)
+{
+       return percpu_ref_is_zero(ref);
+}
+
 #endif /* __TOOLS_LINUX_PERCPU_REFCOUNT_H */
index 6908ead95486fb19b4194e4f2642b1f503edc21d..0cfafbbcd28849e00bfce62f215a0dd6265949a2 100644 (file)
 
 #define BCH_MIN_NR_NBUCKETS    (1 << 10)
 
-/* first bucket should start 1 mb in, in sectors: */
-#define FIRST_BUCKET_OFFSET    (1 << 11)
-
 /* minimum size filesystem we can create, given a bucket size: */
 static u64 min_size(unsigned bucket_size)
 {
-       return (DIV_ROUND_UP(FIRST_BUCKET_OFFSET, bucket_size) +
-               BCH_MIN_NR_NBUCKETS) * bucket_size;
+       return BCH_MIN_NR_NBUCKETS * bucket_size;
 }
 
-static void init_layout(struct bch_sb_layout *l)
+static void init_layout(struct bch_sb_layout *l, unsigned block_size,
+                       u64 start, u64 end)
 {
+       unsigned sb_size;
+       u64 backup; /* offset of 2nd sb */
+
        memset(l, 0, sizeof(*l));
 
+       if (start != BCH_SB_SECTOR)
+               start = round_up(start, block_size);
+       end = round_down(end, block_size);
+
+       if (start >= end)
+               die("insufficient space for superblocks");
+
+       /*
+        * Create two superblocks in the allowed range; reserve at most half
+        * the range for each, capped at 64k (128 sectors):
+        */
+       sb_size = min_t(u64, 128, (end - start) / 2);
+
+       backup = start + sb_size;
+       backup = round_up(backup, block_size);
+
+       backup = min(backup, end);
+
+       sb_size = min(end - backup, backup - start);
+       sb_size = rounddown_pow_of_two(sb_size);
+
+       if (sb_size < 8)
+               die("insufficient space for superblocks");
+
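+       /*
+        * e.g. with the default range [BCH_SB_SECTOR, BCH_SB_SECTOR + 256) and
+        * a 4k (8 sector) block size: sb_size starts at 128, backup lands at
+        * sector 136, and the final sb_size is rounddown_pow_of_two(128) = 128
+        * sectors, i.e. 64k per superblock.
+        */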
        l->magic                = BCACHE_MAGIC;
        l->layout_type          = 0;
        l->nr_superblocks       = 2;
-       l->sb_max_size_bits     = 7;
-       l->sb_offset[0]         = cpu_to_le64(BCH_SB_SECTOR);
-       l->sb_offset[1]         = cpu_to_le64(BCH_SB_SECTOR +
-                                             (1 << l->sb_max_size_bits));
+       l->sb_max_size_bits     = ilog2(sb_size);
+       l->sb_offset[0]         = cpu_to_le64(start);
+       l->sb_offset[1]         = cpu_to_le64(backup);
 }
 
-void bcache_format(struct dev_opts *devs, size_t nr_devs,
-                  unsigned block_size,
-                  unsigned btree_node_size,
-                  unsigned meta_csum_type,
-                  unsigned data_csum_type,
-                  unsigned compression_type,
-                  const char *passphrase,
-                  unsigned meta_replicas,
-                  unsigned data_replicas,
-                  unsigned on_error_action,
-                  unsigned max_journal_entry_size,
-                  char *label,
-                  uuid_le uuid)
+struct bch_sb *bcache_format(struct format_opts opts,
+                            struct dev_opts *devs, size_t nr_devs)
 {
        struct bch_sb *sb;
        struct dev_opts *i;
        struct bch_sb_field_members *mi;
-       unsigned u64s, j;
+       unsigned u64s;
 
        /* calculate block size: */
-       if (!block_size)
+       if (!opts.block_size)
                for (i = devs; i < devs + nr_devs; i++)
-                       block_size = max(block_size,
-                                        get_blocksize(i->path, i->fd));
+                       opts.block_size = max(opts.block_size,
+                                             get_blocksize(i->path, i->fd));
 
        /* calculate bucket sizes: */
        for (i = devs; i < devs + nr_devs; i++) {
+               if (!i->sb_offset) {
+                       i->sb_offset    = BCH_SB_SECTOR;
+                       i->sb_end       = BCH_SB_SECTOR + 256;
+               }
+
                if (!i->size)
                        i->size = get_size(i->path, i->fd) >> 9;
 
                if (!i->bucket_size) {
-                       if (i->size < min_size(block_size))
+                       if (i->size < min_size(opts.block_size))
                                die("cannot format %s, too small (%llu sectors, min %llu)",
-                                   i->path, i->size, min_size(block_size));
+                                   i->path, i->size, min_size(opts.block_size));
 
                        /* Want a bucket size of at least 128k, if possible: */
-                       i->bucket_size = max(block_size, 256U);
+                       i->bucket_size = max(opts.block_size, 256U);
 
                        if (i->size >= min_size(i->bucket_size)) {
                                unsigned scale = max(1,
@@ -99,34 +115,36 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
                        }
                }
 
-               /* first bucket: 1 mb in */
-               i->first_bucket = DIV_ROUND_UP(FIRST_BUCKET_OFFSET, i->bucket_size);
                i->nbuckets     = i->size / i->bucket_size;
 
-               if (i->bucket_size < block_size)
+               if (i->bucket_size < opts.block_size)
                        die("Bucket size cannot be smaller than block size");
 
-               if (i->nbuckets - i->first_bucket < BCH_MIN_NR_NBUCKETS)
+               if (i->nbuckets < BCH_MIN_NR_NBUCKETS)
                        die("Not enough buckets: %llu, need %u (bucket size %u)",
-                           i->nbuckets - i->first_bucket, BCH_MIN_NR_NBUCKETS,
-                           i->bucket_size);
+                           i->nbuckets, BCH_MIN_NR_NBUCKETS, i->bucket_size);
        }
 
        /* calculate btree node size: */
-       if (!btree_node_size) {
+       if (!opts.btree_node_size) {
                /* 256k default btree node size */
-               btree_node_size = 512;
+               opts.btree_node_size = 512;
 
                for (i = devs; i < devs + nr_devs; i++)
-                       btree_node_size = min(btree_node_size, i->bucket_size);
+                       opts.btree_node_size =
+                               min(opts.btree_node_size, i->bucket_size);
        }
 
-       if (!max_journal_entry_size) {
+       if (!opts.max_journal_entry_size) {
                /* 2 MB default: */
-               max_journal_entry_size = 4096;
+               opts.max_journal_entry_size = 4096;
        }
 
-       max_journal_entry_size = roundup_pow_of_two(max_journal_entry_size);
+       opts.max_journal_entry_size =
+               roundup_pow_of_two(opts.max_journal_entry_size);
+
+       if (uuid_is_null(opts.uuid.b))
+               uuid_generate(opts.uuid.b);
 
        sb = calloc(1, sizeof(*sb) +
                    sizeof(struct bch_sb_field_members) +
@@ -135,35 +153,29 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
 
        sb->version     = cpu_to_le64(BCACHE_SB_VERSION_CDEV_V4);
        sb->magic       = BCACHE_MAGIC;
-       sb->block_size  = cpu_to_le16(block_size);
-       sb->user_uuid   = uuid;
+       sb->block_size  = cpu_to_le16(opts.block_size);
+       sb->user_uuid   = opts.uuid;
        sb->nr_devices  = nr_devs;
 
-       init_layout(&sb->layout);
-
        uuid_generate(sb->uuid.b);
 
-       if (label)
-               strncpy((char *) sb->label, label, sizeof(sb->label));
+       if (opts.label)
+               strncpy((char *) sb->label, opts.label, sizeof(sb->label));
 
-       /*
-        * don't have a userspace crc32c implementation handy, just always use
-        * crc64
-        */
-       SET_BCH_SB_CSUM_TYPE(sb,                BCH_CSUM_CRC64);
-       SET_BCH_SB_META_CSUM_TYPE(sb,           meta_csum_type);
-       SET_BCH_SB_DATA_CSUM_TYPE(sb,           data_csum_type);
-       SET_BCH_SB_COMPRESSION_TYPE(sb,         compression_type);
+       SET_BCH_SB_CSUM_TYPE(sb,                opts.meta_csum_type);
+       SET_BCH_SB_META_CSUM_TYPE(sb,           opts.meta_csum_type);
+       SET_BCH_SB_DATA_CSUM_TYPE(sb,           opts.data_csum_type);
+       SET_BCH_SB_COMPRESSION_TYPE(sb,         opts.compression_type);
 
-       SET_BCH_SB_BTREE_NODE_SIZE(sb,          btree_node_size);
+       SET_BCH_SB_BTREE_NODE_SIZE(sb,          opts.btree_node_size);
        SET_BCH_SB_GC_RESERVE(sb,               8);
-       SET_BCH_SB_META_REPLICAS_WANT(sb,       meta_replicas);
-       SET_BCH_SB_META_REPLICAS_HAVE(sb,       meta_replicas);
-       SET_BCH_SB_DATA_REPLICAS_WANT(sb,       data_replicas);
-       SET_BCH_SB_DATA_REPLICAS_HAVE(sb,       data_replicas);
-       SET_BCH_SB_ERROR_ACTION(sb,             on_error_action);
+       SET_BCH_SB_META_REPLICAS_WANT(sb,       opts.meta_replicas);
+       SET_BCH_SB_META_REPLICAS_HAVE(sb,       opts.meta_replicas);
+       SET_BCH_SB_DATA_REPLICAS_WANT(sb,       opts.data_replicas);
+       SET_BCH_SB_DATA_REPLICAS_HAVE(sb,       opts.data_replicas);
+       SET_BCH_SB_ERROR_ACTION(sb,             opts.on_error_action);
        SET_BCH_SB_STR_HASH_TYPE(sb,            BCH_STR_HASH_SIPHASH);
-       SET_BCH_SB_JOURNAL_ENTRY_SIZE(sb,       ilog2(max_journal_entry_size));
+       SET_BCH_SB_JOURNAL_ENTRY_SIZE(sb,       ilog2(opts.max_journal_entry_size));
 
        struct timespec now;
        if (clock_gettime(CLOCK_REALTIME, &now))
@@ -172,7 +184,7 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
        sb->time_base_lo        = cpu_to_le64(now.tv_sec * NSEC_PER_SEC + now.tv_nsec);
        sb->time_precision      = cpu_to_le32(1);
 
-       if (passphrase) {
+       if (opts.encrypted) {
                struct bch_sb_field_crypt *crypt = vstruct_end(sb);
 
                u64s = sizeof(struct bch_sb_field_crypt) / sizeof(u64);
@@ -181,7 +193,7 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
                crypt->field.u64s = cpu_to_le32(u64s);
                crypt->field.type = BCH_SB_FIELD_crypt;
 
-               bch_sb_crypt_init(sb, crypt, passphrase);
+               bch_sb_crypt_init(sb, crypt, opts.passphrase);
                SET_BCH_SB_ENCRYPTION_TYPE(sb, 1);
        }
 
@@ -198,7 +210,7 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
 
                uuid_generate(m->uuid.b);
                m->nbuckets     = cpu_to_le64(i->nbuckets);
-               m->first_bucket = cpu_to_le16(i->first_bucket);
+               m->first_bucket = 0;
                m->bucket_size  = cpu_to_le16(i->bucket_size);
 
                SET_BCH_MEMBER_TIER(m,          i->tier);
@@ -209,42 +221,49 @@ void bcache_format(struct dev_opts *devs, size_t nr_devs,
        for (i = devs; i < devs + nr_devs; i++) {
                sb->dev_idx = i - devs;
 
-               static const char zeroes[BCH_SB_SECTOR << 9];
-               struct nonce nonce = { 0 };
+               init_layout(&sb->layout, opts.block_size,
+                           i->sb_offset, i->sb_end);
 
-               /* Zero start of disk */
-               xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0);
+               if (i->sb_offset == BCH_SB_SECTOR) {
+                       /* Zero start of disk */
+                       static const char zeroes[BCH_SB_SECTOR << 9];
 
-               xpwrite(i->fd, &sb->layout, sizeof(sb->layout),
-                       BCH_SB_LAYOUT_SECTOR << 9);
-
-               for (j = 0; j < sb->layout.nr_superblocks; j++) {
-                       sb->offset = sb->layout.sb_offset[j];
-
-                       sb->csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb),
-                                                  nonce, sb);
-                       xpwrite(i->fd, sb, vstruct_bytes(sb),
-                               le64_to_cpu(sb->offset) << 9);
+                       xpwrite(i->fd, zeroes, BCH_SB_SECTOR << 9, 0);
                }
 
-               fsync(i->fd);
+               bcache_super_write(i->fd, sb);
                close(i->fd);
        }
 
-       bcache_super_print(sb, HUMAN_READABLE);
+       return sb;
+}
+
+void bcache_super_write(int fd, struct bch_sb *sb)
+{
+       struct nonce nonce = { 0 };
+
+       for (unsigned i = 0; i < sb->layout.nr_superblocks; i++) {
+               sb->offset = sb->layout.sb_offset[i];
+
+               if (sb->offset == BCH_SB_SECTOR) {
+                       /* Write backup layout */
+                       xpwrite(fd, &sb->layout, sizeof(sb->layout),
+                               BCH_SB_LAYOUT_SECTOR << 9);
+               }
+
+               sb->csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb), nonce, sb);
+               xpwrite(fd, sb, vstruct_bytes(sb),
+                       le64_to_cpu(sb->offset) << 9);
+       }
 
-       free(sb);
+       fsync(fd);
 }
 
-struct bch_sb *bcache_super_read(const char *path)
+struct bch_sb *__bcache_super_read(int fd, u64 sector)
 {
        struct bch_sb sb, *ret;
 
-       int fd = open(path, O_RDONLY);
-       if (fd < 0)
-               die("couldn't open %s", path);
-
-       xpread(fd, &sb, sizeof(sb), BCH_SB_SECTOR << 9);
+       xpread(fd, &sb, sizeof(sb), sector << 9);
 
        if (memcmp(&sb.magic, &BCACHE_MAGIC, sizeof(sb.magic)))
                die("not a bcache superblock");
@@ -253,11 +272,19 @@ struct bch_sb *bcache_super_read(const char *path)
 
        ret = malloc(bytes);
 
-       xpread(fd, ret, bytes, BCH_SB_SECTOR << 9);
+       xpread(fd, ret, bytes, sector << 9);
 
        return ret;
 }
 
+struct bch_sb *bcache_super_read(const char *path)
+{
+       int fd = xopen(path, O_RDONLY);
+       struct bch_sb *sb = __bcache_super_read(fd, BCH_SB_SECTOR);
+       close(fd);
+       return sb;
+}
+
 void bcache_super_print(struct bch_sb *sb, int units)
 {
        struct bch_sb_field_members *mi;
index 6ec3f42dd1c091f77e93a963123fbab73e58ecaf..779b4708d14751cb4c22c08686b836944e821a7f 100644 (file)
@@ -1,6 +1,7 @@
 #ifndef _LIBBCACHE_H
 #define _LIBBCACHE_H
 
+#include <linux/bcache.h>
 #include <linux/uuid.h>
 #include "tools-util.h"
 #include "vstructs.h"
@@ -18,32 +19,56 @@ enum fsck_err_opts {
 
 extern enum fsck_err_opts fsck_err_opt;
 
+struct format_opts {
+       char            *label;
+       uuid_le         uuid;
+
+       unsigned        on_error_action;
+       unsigned        max_journal_entry_size; /* will be removed */
+
+       unsigned        block_size;
+       unsigned        btree_node_size;
+
+       unsigned        meta_replicas;
+       unsigned        data_replicas;
+
+       unsigned        meta_csum_type;
+       unsigned        data_csum_type;
+       unsigned        compression_type;
+
+       bool            encrypted;
+       char            *passphrase;
+};
+
+static inline struct format_opts format_opts_default(void)
+{
+       return (struct format_opts) {
+               .on_error_action        = BCH_ON_ERROR_RO,
+               .meta_csum_type         = BCH_CSUM_CRC32C,
+               .data_csum_type         = BCH_CSUM_CRC32C,
+               .meta_replicas          = 1,
+               .data_replicas          = 1,
+       };
+}
+
 struct dev_opts {
        int             fd;
-       const char      *path;
+       char            *path;
        u64             size; /* 512 byte sectors */
        unsigned        bucket_size;
        unsigned        tier;
        bool            discard;
 
-       u64             first_bucket;
        u64             nbuckets;
+
+       u64             sb_offset;
+       u64             sb_end;
 };
 
-void bcache_format(struct dev_opts *devs, size_t nr_devs,
-                  unsigned block_size,
-                  unsigned btree_node_size,
-                  unsigned meta_csum_type,
-                  unsigned data_csum_type,
-                  unsigned compression_type,
-                  const char *passphrase,
-                  unsigned meta_replicas,
-                  unsigned data_replicas,
-                  unsigned on_error_action,
-                  unsigned max_journal_entry_size,
-                  char *label,
-                  uuid_le uuid);
+struct bch_sb *bcache_format(struct format_opts, struct dev_opts *, size_t);
 
+void bcache_super_write(int, struct bch_sb *);
+struct bch_sb *__bcache_super_read(int, u64);
 struct bch_sb *bcache_super_read(const char *);
 
 void bcache_super_print(struct bch_sb *, int);
index 8cb31944917caed3ecc46af9416c05d4d50f0516..93f0c2f14c0fe577dadbb68ef71f4e6a12c076dc 100644 (file)
@@ -73,7 +73,6 @@
 #include <linux/rcupdate.h>
 #include <trace/events/bcache.h>
 
-static size_t bch_bucket_alloc(struct cache *, enum alloc_reserve);
 static void __bch_bucket_free(struct cache *, struct bucket *);
 
 /* Allocation groups: */
@@ -84,12 +83,12 @@ void bch_dev_group_remove(struct cache_group *grp, struct cache *ca)
 
        spin_lock(&grp->lock);
 
-       for (i = 0; i < grp->nr_devices; i++)
+       for (i = 0; i < grp->nr; i++)
                if (rcu_access_pointer(grp->d[i].dev) == ca) {
-                       grp->nr_devices--;
+                       grp->nr--;
                        memmove(&grp->d[i],
                                &grp->d[i + 1],
-                               (grp->nr_devices - i) * sizeof(grp->d[0]));
+                               (grp->nr - i) * sizeof(grp->d[0]));
                        break;
                }
 
@@ -101,13 +100,13 @@ void bch_dev_group_add(struct cache_group *grp, struct cache *ca)
        unsigned i;
 
        spin_lock(&grp->lock);
-       for (i = 0; i < grp->nr_devices; i++)
+       for (i = 0; i < grp->nr; i++)
                if (rcu_access_pointer(grp->d[i].dev) == ca)
                        goto out;
 
-       BUG_ON(grp->nr_devices >= BCH_SB_MEMBERS_MAX);
+       BUG_ON(grp->nr >= BCH_SB_MEMBERS_MAX);
 
-       rcu_assign_pointer(grp->d[grp->nr_devices++].dev, ca);
+       rcu_assign_pointer(grp->d[grp->nr++].dev, ca);
 out:
        spin_unlock(&grp->lock);
 }
@@ -120,25 +119,32 @@ static void pd_controllers_update(struct work_struct *work)
                                           struct cache_set,
                                           pd_controllers_update);
        struct cache *ca;
-       unsigned iter;
-       int i;
+       unsigned i, iter;
 
        /* All units are in bytes */
-       u64 tier_size[BCH_TIER_MAX];
-       u64 tier_free[BCH_TIER_MAX];
-       u64 tier_dirty[BCH_TIER_MAX];
-       u64 tier0_can_free = 0;
+       u64 faster_tiers_size   = 0;
+       u64 faster_tiers_dirty  = 0;
 
-       memset(tier_size, 0, sizeof(tier_size));
-       memset(tier_free, 0, sizeof(tier_free));
-       memset(tier_dirty, 0, sizeof(tier_dirty));
+       u64 fastest_tier_size   = 0;
+       u64 fastest_tier_free   = 0;
+       u64 copygc_can_free     = 0;
 
        rcu_read_lock();
-       for (i = BCH_TIER_MAX - 1; i >= 0; --i)
-               group_for_each_cache_rcu(ca, &c->cache_tiers[i], iter) {
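+       /*
+        * Tiers are walked from fastest to slowest: each tier's tiering
+        * controller is fed the combined size and dirty totals of all faster
+        * tiers, i.e. the data that could be migrated down into it.
+        */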
+       for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
+               bch_pd_controller_update(&c->tiers[i].pd,
+                               div_u64(faster_tiers_size *
+                                       c->tiering_percent, 100),
+                               faster_tiers_dirty,
+                               -1);
+
+               group_for_each_cache_rcu(ca, &c->tiers[i].devs, iter) {
                        struct bucket_stats_cache stats = bch_bucket_stats_read_cache(ca);
                        unsigned bucket_bits = ca->bucket_bits + 9;
 
+                       u64 size = (ca->mi.nbuckets -
+                                   ca->mi.first_bucket) << bucket_bits;
+                       u64 dirty = stats.buckets_dirty << bucket_bits;
+                       u64 free = __buckets_free_cache(ca, stats) << bucket_bits;
                        /*
                         * Bytes of internal fragmentation, which can be
                         * reclaimed by copy GC
@@ -149,41 +155,30 @@ static void pd_controllers_update(struct work_struct *work)
                                ((stats.sectors_dirty +
                                  stats.sectors_cached) << 9);
 
-                       u64 dev_size = (ca->mi.nbuckets -
-                                       ca->mi.first_bucket) << bucket_bits;
-
-                       u64 free = __buckets_free_cache(ca, stats) << bucket_bits;
-
                        if (fragmented < 0)
                                fragmented = 0;
 
                        bch_pd_controller_update(&ca->moving_gc_pd,
                                                 free, fragmented, -1);
 
-                       if (i == 0)
-                               tier0_can_free += fragmented;
-
-                       tier_size[i] += dev_size;
-                       tier_free[i] += free;
-                       tier_dirty[i] += stats.buckets_dirty << bucket_bits;
-               }
-       rcu_read_unlock();
-
-       if (tier_size[1]) {
-               u64 target = div_u64(tier_size[0] * c->tiering_percent, 100);
+                       faster_tiers_size               += size;
+                       faster_tiers_dirty              += dirty;
 
-               tier0_can_free = max_t(s64, 0, tier_dirty[0] - target);
+                       if (!c->fastest_tier ||
+                           c->fastest_tier == &c->tiers[i]) {
+                               fastest_tier_size       += size;
+                               fastest_tier_free       += free;
+                       }
 
-               bch_pd_controller_update(&c->tiering_pd,
-                                        target,
-                                        tier_dirty[0],
-                                        -1);
+                       copygc_can_free                 += fragmented;
+               }
        }
 
+       rcu_read_unlock();
+
        /*
         * Throttle foreground writes if tier 0 is running out of free buckets,
-        * and either tiering or copygc can free up space (but don't take both
-        * into account).
+        * and either tiering or copygc can free up space.
         *
         * Target will be small if there isn't any work to do - we don't want to
         * throttle foreground writes if we currently have all the free space
@@ -192,12 +187,15 @@ static void pd_controllers_update(struct work_struct *work)
         * Otherwise, if there's work to do, try to keep 20% of tier0 available
         * for foreground writes.
         */
+       if (c->fastest_tier)
+               copygc_can_free = U64_MAX;
+
        bch_pd_controller_update(&c->foreground_write_pd,
-                                min(tier0_can_free,
-                                    div_u64(tier_size[0] *
+                                min(copygc_can_free,
+                                    div_u64(fastest_tier_size *
                                             c->foreground_target_percent,
                                             100)),
-                                tier_free[0],
+                                fastest_tier_free,
                                 -1);
 
        schedule_delayed_work(&c->pd_controllers_update,
@@ -301,7 +299,8 @@ static int bch_prio_write(struct cache *ca)
                 * it getting gc'd from under us
                 */
                ca->prio_buckets[i] = r;
-               bch_mark_metadata_bucket(ca, ca->buckets + r, false);
+               bch_mark_metadata_bucket(ca, ca->buckets + r,
+                                        BUCKET_PRIOS, false);
                spin_unlock(&ca->prio_buckets_lock);
 
                SET_PSET_CSUM_TYPE(p, bch_meta_checksum_type(c));
@@ -334,6 +333,9 @@ static int bch_prio_write(struct cache *ca)
        do {
                unsigned u64s = jset_u64s(0);
 
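+               /* can't get a journal reservation until the journal is up: */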
+               if (!test_bit(JOURNAL_STARTED, &c->journal.flags))
+                       break;
+
                ret = bch_journal_res_get(j, &res, u64s, u64s);
                if (ret)
                        return ret;
@@ -815,8 +817,7 @@ static void bch_find_empty_buckets(struct cache_set *c, struct cache *ca)
                if (is_available_bucket(m) &&
                    !m.cached_sectors &&
                    !m.had_metadata &&
-                   (!m.wait_on_journal ||
-                    ((s16) last_seq_ondisk - (s16) m.journal_seq >= 0))) {
+                   !bucket_needs_journal_commit(m, last_seq_ondisk)) {
                        spin_lock(&ca->freelist_lock);
 
                        bch_mark_alloc_bucket(ca, g, true);
@@ -850,6 +851,8 @@ static int bch_allocator_thread(void *arg)
 
        set_freezable();
 
+       bch_find_empty_buckets(c, ca);
+
        while (1) {
                /*
                 * First, we pull buckets off of the free_inc list, possibly
@@ -894,7 +897,7 @@ static int bch_allocator_thread(void *arg)
                 * See if we have buckets we can reuse without invalidating them
                 * or forcing a journal commit:
                 */
-               bch_find_empty_buckets(c, ca);
+               //bch_find_empty_buckets(c, ca);
 
                if (fifo_used(&ca->free_inc) * 2 > ca->free_inc.size) {
                        up_read(&c->gc_lock);
@@ -967,7 +970,7 @@ out:
  *
  * Returns index of bucket on success, 0 on failure
  * */
-static size_t bch_bucket_alloc(struct cache *ca, enum alloc_reserve reserve)
+size_t bch_bucket_alloc(struct cache *ca, enum alloc_reserve reserve)
 {
        struct bucket *g;
        long r;
@@ -1018,21 +1021,21 @@ static void recalc_alloc_group_weights(struct cache_set *c,
        u64 available_buckets = 1; /* avoid a divide by zero... */
        unsigned i;
 
-       for (i = 0; i < devs->nr_devices; i++) {
+       for (i = 0; i < devs->nr; i++) {
                ca = devs->d[i].dev;
 
                devs->d[i].weight = buckets_free_cache(ca);
                available_buckets += devs->d[i].weight;
        }
 
-       for (i = 0; i < devs->nr_devices; i++) {
+       for (i = 0; i < devs->nr; i++) {
                const unsigned min_weight = U32_MAX >> 4;
                const unsigned max_weight = U32_MAX;
 
                devs->d[i].weight =
                        min_weight +
                        div64_u64(devs->d[i].weight *
-                                 devs->nr_devices *
+                                 devs->nr *
                                  (max_weight - min_weight),
                                  available_buckets);
                devs->d[i].weight = min_t(u64, devs->d[i].weight, max_weight);
@@ -1058,7 +1061,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c,
        rcu_read_lock();
        spin_lock(&devs->lock);
 
-       for (i = 0; i < devs->nr_devices; i++)
+       for (i = 0; i < devs->nr; i++)
                available += !test_bit(devs->d[i].dev->dev_idx,
                                       caches_used);
 
@@ -1076,7 +1079,7 @@ static enum bucket_alloc_ret bch_bucket_alloc_group(struct cache_set *c,
                }
 
                i++;
-               i %= devs->nr_devices;
+               i %= devs->nr;
 
                ret = FREELIST_EMPTY;
                if (i == fail_idx)
@@ -1136,20 +1139,25 @@ static enum bucket_alloc_ret __bch_bucket_alloc_set(struct cache_set *c,
                                                    enum alloc_reserve reserve,
                                                    long *caches_used)
 {
+       struct bch_tier *tier;
        /*
         * this should implement policy - for a given type of allocation, decide
         * which devices to allocate from:
         *
         * XXX: switch off wp->type and do something more intelligent here
         */
+       if (wp->group)
+               return bch_bucket_alloc_group(c, ob, reserve, nr_replicas,
+                                             wp->group, caches_used);
 
-       /* foreground writes: prefer tier 0: */
-       if (wp->group == &c->cache_all)
+       /* foreground writes: prefer fastest tier: */
+       tier = READ_ONCE(c->fastest_tier);
+       if (tier)
                bch_bucket_alloc_group(c, ob, reserve, nr_replicas,
-                                      &c->cache_tiers[0], caches_used);
+                                      &tier->devs, caches_used);
 
        return bch_bucket_alloc_group(c, ob, reserve, nr_replicas,
-                                     wp->group, caches_used);
+                                     &c->cache_all, caches_used);
 }
 
 static int bch_bucket_alloc_set(struct cache_set *c, struct write_point *wp,
@@ -1413,7 +1421,6 @@ struct open_bucket *bch_alloc_sectors_start(struct cache_set *c,
                ? 0 : BTREE_NODE_RESERVE;
        int ret;
 
-       BUG_ON(!wp->group);
        BUG_ON(!reserve);
        BUG_ON(!nr_replicas);
 retry:
@@ -1481,7 +1488,7 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e,
                                   unsigned nr_replicas, struct open_bucket *ob,
                                   unsigned sectors)
 {
-       struct bch_extent_ptr tmp, *ptr;
+       struct bch_extent_ptr tmp;
        struct cache *ca;
        bool has_data = false;
        unsigned i;
@@ -1501,6 +1508,8 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e,
        if (nr_replicas < ob->nr_ptrs)
                has_data = true;
 
+       rcu_read_lock();
+
        for (i = 0; i < nr_replicas; i++) {
                EBUG_ON(bch_extent_has_device(extent_i_to_s_c(e), ob->ptrs[i].dev));
 
@@ -1510,10 +1519,12 @@ void bch_alloc_sectors_append_ptrs(struct cache_set *c, struct bkey_i_extent *e,
                extent_ptr_append(e, tmp);
 
                ob->ptr_offset[i] += sectors;
+
+               if ((ca = PTR_CACHE(c, &ob->ptrs[i])))
+                       this_cpu_add(*ca->sectors_written, sectors);
        }
 
-       open_bucket_for_each_online_device(c, ob, ptr, ca)
-               this_cpu_add(*ca->sectors_written, sectors);
+       rcu_read_unlock();
 }
 
 /*
@@ -1586,9 +1597,9 @@ struct open_bucket *bch_alloc_sectors(struct cache_set *c,
 
 /* Startup/shutdown (ro/rw): */
 
-static void bch_recalc_capacity(struct cache_set *c)
+void bch_recalc_capacity(struct cache_set *c)
 {
-       struct cache_group *tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers);
+       struct bch_tier *fastest_tier = NULL, *slowest_tier = NULL, *tier;
        struct cache *ca;
        u64 total_capacity, capacity = 0, reserved_sectors = 0;
        unsigned long ra_pages = 0;
@@ -1604,16 +1615,29 @@ static void bch_recalc_capacity(struct cache_set *c)
 
        c->bdi.ra_pages = ra_pages;
 
+       /* Find fastest, slowest tiers with devices: */
+
+       for (tier = c->tiers;
+            tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) {
+               if (!tier->devs.nr)
+                       continue;
+               if (!fastest_tier)
+                       fastest_tier = tier;
+               slowest_tier = tier;
+       }
+
+       c->fastest_tier = fastest_tier != slowest_tier ? fastest_tier : NULL;
+
+       if (!fastest_tier)
+               goto set_capacity;
+
+       c->promote_write_point.group = &fastest_tier->devs;
+
        /*
         * Capacity of the cache set is the capacity of all the devices in the
         * slowest (highest) tier - we don't include lower tier devices.
         */
-       for (tier = c->cache_tiers + ARRAY_SIZE(c->cache_tiers) - 1;
-            tier > c->cache_tiers && !tier->nr_devices;
-            --tier)
-               ;
-
-       group_for_each_cache_rcu(ca, tier, i) {
+       group_for_each_cache_rcu(ca, &slowest_tier->devs, i) {
                size_t reserve = 0;
 
                /*
@@ -1649,8 +1673,8 @@ static void bch_recalc_capacity(struct cache_set *c)
                             ca->mi.first_bucket) <<
                        ca->bucket_bits;
        }
+set_capacity:
        rcu_read_unlock();
-
        total_capacity = capacity;
 
        capacity *= (100 - c->opts.gc_reserve_percent);
@@ -1727,7 +1751,7 @@ static bool bch_dev_has_open_write_point(struct cache *ca)
 void bch_dev_allocator_stop(struct cache *ca)
 {
        struct cache_set *c = ca->set;
-       struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
+       struct cache_group *tier = &c->tiers[ca->mi.tier].devs;
        struct task_struct *p;
        struct closure cl;
        unsigned i;
@@ -1808,7 +1832,7 @@ void bch_dev_allocator_stop(struct cache *ca)
 int bch_dev_allocator_start(struct cache *ca)
 {
        struct cache_set *c = ca->set;
-       struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
+       struct cache_group *tier = &c->tiers[ca->mi.tier].devs;
        struct task_struct *k;
 
        /*
@@ -1826,6 +1850,7 @@ int bch_dev_allocator_start(struct cache *ca)
 
        bch_dev_group_add(tier, ca);
        bch_dev_group_add(&c->cache_all, ca);
+       bch_dev_group_add(&c->journal.devs, ca);
 
        bch_recalc_capacity(c);
 
@@ -1838,7 +1863,7 @@ int bch_dev_allocator_start(struct cache *ca)
        return 0;
 }
 
-void bch_open_buckets_init(struct cache_set *c)
+void bch_fs_allocator_init(struct cache_set *c)
 {
        unsigned i;
 
@@ -1860,19 +1885,11 @@ void bch_open_buckets_init(struct cache_set *c)
 
        spin_lock_init(&c->cache_all.lock);
 
-       for (i = 0; i < ARRAY_SIZE(c->write_points); i++) {
-               c->write_points[i].throttle = true;
-               c->write_points[i].group = &c->cache_tiers[0];
-       }
-
-       for (i = 0; i < ARRAY_SIZE(c->cache_tiers); i++)
-               spin_lock_init(&c->cache_tiers[i].lock);
+       for (i = 0; i < ARRAY_SIZE(c->tiers); i++)
+               spin_lock_init(&c->tiers[i].devs.lock);
 
-       c->promote_write_point.group = &c->cache_tiers[0];
-
-       c->migration_write_point.group = &c->cache_all;
-
-       c->btree_write_point.group = &c->cache_all;
+       for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
+               c->write_points[i].throttle = true;
 
        c->pd_controllers_update_seconds = 5;
        INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
index 09139a59d268d703b82977700460efd46f595bf7..9573dd2cf240f75d4bf95fd2f7322cd99ad03150 100644 (file)
@@ -27,6 +27,8 @@ int bch_prio_read(struct cache *);
 
 void bch_recalc_min_prio(struct cache *, int);
 
+size_t bch_bucket_alloc(struct cache *, enum alloc_reserve);
+
 void bch_open_bucket_put(struct cache_set *, struct open_bucket *);
 
 struct open_bucket *bch_alloc_sectors_start(struct cache_set *,
@@ -58,7 +60,7 @@ static inline struct cache *cache_group_next_rcu(struct cache_group *devs,
 {
        struct cache *ret = NULL;
 
-       while (*iter < devs->nr_devices &&
+       while (*iter < devs->nr &&
               !(ret = rcu_dereference(devs->d[*iter].dev)))
                (*iter)++;
 
@@ -103,8 +105,9 @@ static inline struct cache *cache_group_next(struct cache_group *devs,
             ((_ca) = __open_bucket_next_online_device(_c, _ob, _ptr, _ca));\
             (_ptr)++)
 
+void bch_recalc_capacity(struct cache_set *);
 void bch_dev_allocator_stop(struct cache *);
 int bch_dev_allocator_start(struct cache *);
-void bch_open_buckets_init(struct cache_set *);
+void bch_fs_allocator_init(struct cache_set *);
 
 #endif /* _BCACHE_ALLOC_H */
index fbe8b75c8251750ab96137fc20e0b57f6cd830fa..f408bd9728d4b3e6e590f67fdd332b7abe605a8a 100644 (file)
@@ -51,7 +51,7 @@ static inline bool allocation_is_metadata(enum alloc_reserve id)
 
 struct cache_group {
        spinlock_t              lock;
-       unsigned                nr_devices;
+       unsigned                nr;
        unsigned                cur_device;
        struct {
                u64             weight;
index babc08dbe4d3ce990aba6a071761c32aa685581c..5b668c718ff11c396ba7fdb7e1ded0ddcc8f661b 100644 (file)
@@ -464,24 +464,10 @@ struct cache {
  * BCH_FS_UNREGISTERING means we're not just shutting down, we're detaching
  * all the backing devices first (their cached data gets invalidated, and they
  * won't automatically reattach).
- *
- * BCH_FS_STOPPING always gets set first when we're closing down a cache set;
- * we'll continue to run normally for awhile with BCH_FS_STOPPING set (i.e.
- * flushing dirty data).
- *
- * BCH_FS_RUNNING means all cache devices have been registered and journal
- * replay is complete.
  */
 enum {
-       /* Startup: */
        BCH_FS_INITIAL_GC_DONE,
-       BCH_FS_RUNNING,
-
-       /* Shutdown: */
        BCH_FS_DETACHING,
-       BCH_FS_STOPPING,
-       BCH_FS_RO,
-       BCH_FS_RO_COMPLETE,
        BCH_FS_EMERGENCY_RO,
        BCH_FS_WRITE_DISABLE_COMPLETE,
        BCH_FS_GC_STOPPING,
@@ -498,6 +484,21 @@ struct btree_debug {
        struct dentry           *failed;
 };
 
+struct bch_tier {
+       unsigned                idx;
+       struct task_struct      *migrate;
+       struct bch_pd_controller pd;
+
+       struct cache_group      devs;
+};
+
+enum bch_fs_state {
+       BCH_FS_STARTING         = 0,
+       BCH_FS_STOPPING,
+       BCH_FS_RO,
+       BCH_FS_RW,
+};
+
 struct cache_set {
        struct closure          cl;
 
@@ -506,7 +507,6 @@ struct cache_set {
        struct kobject          internal;
        struct kobject          opts_dir;
        struct kobject          time_stats;
-       struct completion       *stop_completion;
        unsigned long           flags;
 
        int                     minor;
@@ -514,6 +514,10 @@ struct cache_set {
        struct super_block      *vfs_sb;
        char                    name[40];
 
+       /* ro/rw, add/remove devices: */
+       struct mutex            state_lock;
+       enum bch_fs_state       state;
+
        /* Counts outstanding writes, for clean transition to read-only */
        struct percpu_ref       writes;
        struct work_struct      read_only_work;
@@ -640,7 +644,9 @@ struct cache_set {
         * allocate from:
         */
        struct cache_group      cache_all;
-       struct cache_group      cache_tiers[BCH_TIER_MAX];
+       struct bch_tier         tiers[BCH_TIER_MAX];
+       /* NULL if we only have devices in one tier: */
+       struct bch_tier         *fastest_tier;
 
        u64                     capacity; /* sectors */
 
@@ -753,10 +759,6 @@ struct cache_set {
        unsigned                writeback_pages_max;
        atomic_long_t           nr_inodes;
 
-       /* TIERING */
-       struct task_struct      *tiering_read;
-       struct bch_pd_controller tiering_pd;
-
        /* NOTIFICATIONS */
        struct mutex            uevent_lock;
        struct kobj_uevent_env  uevent_env;
@@ -828,6 +830,11 @@ struct cache_set {
 #undef BCH_TIME_STAT
 };
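+/* userspace shim: treat a refcount that has dropped to zero as dying */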
 
+static inline bool bch_fs_running(struct cache_set *c)
+{
+       return c->state == BCH_FS_RO || c->state == BCH_FS_RW;
+}
+
 static inline unsigned bucket_pages(const struct cache *ca)
 {
        return ca->mi.bucket_size / PAGE_SECTORS;
index 82b07f594a657cbedfe7ce3a4f456c19bf304d7e..ba2e9a8cd891c002d7706e01af4c3878c5d94697 100644 (file)
@@ -375,6 +375,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
        bool found;
        int ret;
 
+       lockdep_assert_held(&c->state_lock);
+
        bdevname(dc->disk_sb.bdev, buf);
 
        if (memcmp(&dc->disk_sb.sb->set_uuid,
@@ -387,11 +389,8 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
                return -EINVAL;
        }
 
-       if (!test_bit(BCH_FS_RUNNING, &c->flags))
-               return 0;
-
-       if (test_bit(BCH_FS_STOPPING, &c->flags)) {
-               pr_err("Can't attach %s: shutting down", buf);
+       if (!bch_fs_running(c)) {
+               pr_err("Can't attach %s: not running", buf);
                return -EINVAL;
        }
 
@@ -497,6 +496,7 @@ void bch_attach_backing_devs(struct cache_set *c)
        struct cached_dev *dc, *t;
 
        lockdep_assert_held(&bch_register_lock);
+       lockdep_assert_held(&c->state_lock);
 
        list_for_each_entry_safe(dc, t, &uncached_devices, list)
                bch_cached_dev_attach(dc, c);
@@ -742,7 +742,7 @@ int bch_blockdev_volumes_start(struct cache_set *c)
        struct bkey_s_c_inode_blockdev inode;
        int ret = 0;
 
-       if (test_bit(BCH_FS_STOPPING, &c->flags))
+       if (!bch_fs_running(c))
                return -EINVAL;
 
        for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, k) {
index 4d5efdbd3970dfe4faac855a16b739a6e11ddc14..4d0c6d4d4a5aa72f522ec7efa3d75ba60afd2a2b 100644 (file)
@@ -11,8 +11,9 @@
 
 #define DEF_BTREE_ID(kwd, val, name) name,
 
-const char *bch_btree_id_names[BTREE_ID_NR] = {
+const char * const bch_btree_ids[] = {
        DEFINE_BCH_BTREE_IDS()
+       NULL
 };
 
 #undef DEF_BTREE_ID
@@ -311,7 +312,7 @@ static unsigned long bch_mca_count(struct shrinker *shrink,
        return mca_can_free(c) * btree_pages(c);
 }
 
-void bch_btree_cache_free(struct cache_set *c)
+void bch_fs_btree_exit(struct cache_set *c)
 {
        struct btree *b;
        unsigned i;
@@ -358,7 +359,7 @@ void bch_btree_cache_free(struct cache_set *c)
                rhashtable_destroy(&c->btree_cache_table);
 }
 
-int bch_btree_cache_alloc(struct cache_set *c)
+int bch_fs_btree_init(struct cache_set *c)
 {
        unsigned i;
        int ret;
index c26489d129cd505d48dabe443b9d3ebf4a4a0ff3..4d67704b953ae3b43443d4e754612af045cdc2d9 100644 (file)
@@ -6,7 +6,7 @@
 
 struct btree_iter;
 
-extern const char *bch_btree_id_names[BTREE_ID_NR];
+extern const char * const bch_btree_ids[];
 
 void bch_recalc_btree_reserve(struct cache_set *);
 
@@ -22,8 +22,8 @@ struct btree *mca_alloc(struct cache_set *);
 struct btree *bch_btree_node_get(struct btree_iter *, const struct bkey_i *,
                                 unsigned, enum six_lock_type);
 
-void bch_btree_cache_free(struct cache_set *);
-int bch_btree_cache_alloc(struct cache_set *);
+void bch_fs_btree_exit(struct cache_set *);
+int bch_fs_btree_init(struct cache_set *);
 
 #define for_each_cached_btree(_b, _c, _tbl, _iter, _pos)               \
        for ((_tbl) = rht_dereference_rcu((_c)->btree_cache_table.tbl,  \
index 0eb7290c427cce3a3b811fa29d1a8df741adb6c6..b90807f7be317a69ca6c5d5e71fa126d86e17b67 100644 (file)
@@ -262,30 +262,72 @@ static void bch_mark_allocator_buckets(struct cache_set *c)
        }
 }
 
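+/* Mark the buckets spanned by the given sector range as @type metadata: */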
+static void mark_metadata_sectors(struct cache *ca, u64 start, u64 end,
+                                 enum bucket_data_type type)
+{
+       u64 b = start >> ca->bucket_bits;
+
+       do {
+               bch_mark_metadata_bucket(ca, ca->buckets + b, type, true);
+               b++;
+       } while (b < end >> ca->bucket_bits);
+}
+
 /*
  * Mark non btree metadata - prios, journal
  */
-static void bch_mark_metadata(struct cache_set *c)
+static void bch_mark_dev_metadata(struct cache_set *c, struct cache *ca)
 {
-       struct cache *ca;
-       unsigned i, j;
+       struct bch_sb_layout *layout = &ca->disk_sb.sb->layout;
+       unsigned i;
        u64 b;
 
-       for_each_cache(ca, c, i) {
-               for (j = 0; j < ca->journal.nr; j++) {
-                       b = ca->journal.buckets[j];
-                       bch_mark_metadata_bucket(ca, ca->buckets + b, true);
-               }
+       /* Mark superblocks: */
+       for (i = 0; i < layout->nr_superblocks; i++) {
+               u64 offset = le64_to_cpu(layout->sb_offset[i]);
+
+               if (offset == BCH_SB_SECTOR)
+                       mark_metadata_sectors(ca, 0, BCH_SB_SECTOR,
+                                             BUCKET_SB);
+
+               mark_metadata_sectors(ca, offset,
+                                     offset + (1 << layout->sb_max_size_bits),
+                                     BUCKET_SB);
+       }
 
-               spin_lock(&ca->prio_buckets_lock);
+       spin_lock(&c->journal.lock);
 
-               for (j = 0; j < prio_buckets(ca) * 2; j++) {
-                       b = ca->prio_buckets[j];
-                       bch_mark_metadata_bucket(ca, ca->buckets + b, true);
-               }
+       for (i = 0; i < ca->journal.nr; i++) {
+               b = ca->journal.buckets[i];
+               bch_mark_metadata_bucket(ca, ca->buckets + b,
+                                        BUCKET_JOURNAL, true);
+       }
+
+       spin_unlock(&c->journal.lock);
+
+       spin_lock(&ca->prio_buckets_lock);
 
-               spin_unlock(&ca->prio_buckets_lock);
+       for (i = 0; i < prio_buckets(ca) * 2; i++) {
+               b = ca->prio_buckets[i];
+               if (b)
+                       bch_mark_metadata_bucket(ca, ca->buckets + b,
+                                                BUCKET_PRIOS, true);
        }
+
+       spin_unlock(&ca->prio_buckets_lock);
+}
+
+static void bch_mark_metadata(struct cache_set *c)
+{
+       struct cache *ca;
+       unsigned i;
+
+       mutex_lock(&c->sb_lock);
+
+       for_each_cache(ca, c, i)
+               bch_mark_dev_metadata(c, ca);
+
+       mutex_unlock(&c->sb_lock);
 }
 
 /* Also see bch_pending_btree_node_free_insert_done() */
@@ -389,7 +431,7 @@ void bch_gc(struct cache_set *c)
                for_each_bucket(g, ca) {
                        bucket_cmpxchg(g, new, ({
                                new.owned_by_allocator  = 0;
-                               new.is_metadata         = 0;
+                               new.data_type           = 0;
                                new.cached_sectors      = 0;
                                new.dirty_sectors       = 0;
                        }));
@@ -750,9 +792,6 @@ void bch_coalesce(struct cache_set *c)
        u64 start_time;
        enum btree_id id;
 
-       if (btree_gc_coalesce_disabled(c))
-               return;
-
        if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
                return;
 
@@ -811,7 +850,8 @@ static int bch_gc_thread(void *arg)
                last_kick = atomic_read(&c->kick_gc);
 
                bch_gc(c);
-               bch_coalesce(c);
+               if (!btree_gc_coalesce_disabled(c))
+                       bch_coalesce(c);
 
                debug_check_no_locks_held();
        }
@@ -823,18 +863,24 @@ void bch_gc_thread_stop(struct cache_set *c)
 {
        set_bit(BCH_FS_GC_STOPPING, &c->flags);
 
-       if (!IS_ERR_OR_NULL(c->gc_thread))
+       if (c->gc_thread)
                kthread_stop(c->gc_thread);
+
+       c->gc_thread = NULL;
+       clear_bit(BCH_FS_GC_STOPPING, &c->flags);
 }
 
 int bch_gc_thread_start(struct cache_set *c)
 {
-       clear_bit(BCH_FS_GC_STOPPING, &c->flags);
+       struct task_struct *p;
 
-       c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc");
-       if (IS_ERR(c->gc_thread))
-               return PTR_ERR(c->gc_thread);
+       BUG_ON(c->gc_thread);
 
+       p = kthread_create(bch_gc_thread, c, "bcache_gc");
+       if (IS_ERR(p))
+               return PTR_ERR(p);
+
+       c->gc_thread = p;
        wake_up_process(c->gc_thread);
        return 0;
 }
@@ -883,12 +929,13 @@ int bch_initial_gc(struct cache_set *c, struct list_head *journal)
 {
        enum btree_id id;
 
-       if (journal) {
-               for (id = 0; id < BTREE_ID_NR; id++)
-                       bch_initial_gc_btree(c, id);
+       bch_mark_metadata(c);
 
+       for (id = 0; id < BTREE_ID_NR; id++)
+               bch_initial_gc_btree(c, id);
+
+       if (journal)
                bch_journal_mark(c, journal);
-       }
 
        /*
         * Skip past versions that might have possibly been used (as nonces),
@@ -897,8 +944,6 @@ int bch_initial_gc(struct cache_set *c, struct list_head *journal)
        if (c->sb.encryption_type)
                atomic64_add(1 << 16, &c->key_version);
 
-       bch_mark_metadata(c);
-
        gc_pos_set(c, gc_phase(GC_PHASE_DONE));
        set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
 
index 315cfbec8625a61363196d96464b566176837e58..ec4ee54a9b4ddac0e05c3a1ea560f02934cf358a 100644 (file)
@@ -66,6 +66,7 @@
 #include "alloc.h"
 #include "btree_gc.h"
 #include "buckets.h"
+#include "error.h"
 
 #include <linux/preempt.h>
 #include <trace/events/bcache.h>
@@ -102,6 +103,10 @@ static void bch_fs_stats_verify(struct cache_set *c) {}
 
 #endif
 
+/*
+ * Clear journal_seq_valid for buckets for which it's not needed, to prevent
+ * wraparound:
+ */
 void bch_bucket_seq_cleanup(struct cache_set *c)
 {
        u16 last_seq_ondisk = c->journal.last_seq_ondisk;
@@ -113,12 +118,11 @@ void bch_bucket_seq_cleanup(struct cache_set *c)
        for_each_cache(ca, c, i)
                for_each_bucket(g, ca) {
                        bucket_cmpxchg(g, m, ({
-                               if (!m.wait_on_journal ||
-                                   ((s16) last_seq_ondisk -
-                                    (s16) m.journal_seq < 0))
+                               if (!m.journal_seq_valid ||
+                                   bucket_needs_journal_commit(m, last_seq_ondisk))
                                        break;
 
-                               m.wait_on_journal = 0;
+                               m.journal_seq_valid = 0;
                        }));
                }
 }
@@ -186,17 +190,18 @@ bch_bucket_stats_read_cache_set(struct cache_set *c)
 
 static inline int is_meta_bucket(struct bucket_mark m)
 {
-       return !m.owned_by_allocator && m.is_metadata;
+       return m.data_type != BUCKET_DATA;
 }
 
 static inline int is_dirty_bucket(struct bucket_mark m)
 {
-       return !m.owned_by_allocator && !m.is_metadata && !!m.dirty_sectors;
+       return m.data_type == BUCKET_DATA && !!m.dirty_sectors;
 }
 
 static inline int is_cached_bucket(struct bucket_mark m)
 {
-       return !m.owned_by_allocator && !m.dirty_sectors && !!m.cached_sectors;
+       return m.data_type == BUCKET_DATA &&
+               !m.dirty_sectors && !!m.cached_sectors;
 }
 
 void bch_fs_stats_apply(struct cache_set *c,
@@ -236,29 +241,37 @@ void bch_fs_stats_apply(struct cache_set *c,
        memset(stats, 0, sizeof(*stats));
 }
 
+static bool bucket_became_unavailable(struct cache_set *c,
+                                     struct bucket_mark old,
+                                     struct bucket_mark new)
+{
+       return is_available_bucket(old) &&
+              !is_available_bucket(new) &&
+              c->gc_pos.phase == GC_PHASE_DONE;
+}
+
 static void bucket_stats_update(struct cache *ca,
                        struct bucket_mark old, struct bucket_mark new,
-                       bool may_make_unavailable,
                        struct bucket_stats_cache_set *bch_alloc_stats)
 {
        struct cache_set *c = ca->set;
        struct bucket_stats_cache *cache_stats;
 
-       BUG_ON(!may_make_unavailable &&
-              is_available_bucket(old) &&
-              !is_available_bucket(new) &&
-              c->gc_pos.phase == GC_PHASE_DONE);
+       bch_fs_inconsistent_on(old.data_type && new.data_type &&
+                       old.data_type != new.data_type, c,
+                       "different types of metadata in same bucket: %u, %u",
+                       old.data_type, new.data_type);
 
        if (bch_alloc_stats) {
                bch_alloc_stats->s[S_COMPRESSED][S_CACHED] +=
                        (int) new.cached_sectors - (int) old.cached_sectors;
 
                bch_alloc_stats->s[S_COMPRESSED]
-                       [old.is_metadata ? S_META : S_DIRTY] -=
+                       [is_meta_bucket(old) ? S_META : S_DIRTY] -=
                        old.dirty_sectors;
 
                bch_alloc_stats->s[S_COMPRESSED]
-                       [new.is_metadata ? S_META : S_DIRTY] +=
+                       [is_meta_bucket(new) ? S_META : S_DIRTY] +=
                        new.dirty_sectors;
        }
 
@@ -268,12 +281,12 @@ static void bucket_stats_update(struct cache *ca,
        cache_stats->sectors_cached +=
                (int) new.cached_sectors - (int) old.cached_sectors;
 
-       if (old.is_metadata)
+       if (is_meta_bucket(old))
                cache_stats->sectors_meta -= old.dirty_sectors;
        else
                cache_stats->sectors_dirty -= old.dirty_sectors;
 
-       if (new.is_metadata)
+       if (is_meta_bucket(new))
                cache_stats->sectors_meta += new.dirty_sectors;
        else
                cache_stats->sectors_dirty += new.dirty_sectors;
@@ -290,6 +303,15 @@ static void bucket_stats_update(struct cache *ca,
                bch_wake_allocator(ca);
 }
 
+#define bucket_data_cmpxchg(ca, g, new, expr)                  \
+({                                                             \
+       struct bucket_stats_cache_set _stats = { 0 };           \
+       struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
+                                                               \
+       bucket_stats_update(ca, _old, new, &_stats);            \
+       _old;                                                   \
+})
+
 void bch_invalidate_bucket(struct cache *ca, struct bucket *g)
 {
        struct bucket_stats_cache_set stats = { 0 };
@@ -297,16 +319,17 @@ void bch_invalidate_bucket(struct cache *ca, struct bucket *g)
 
        old = bucket_cmpxchg(g, new, ({
                new.owned_by_allocator  = 1;
-               new.is_metadata         = 0;
+               new.had_metadata        = 0;
+               new.data_type           = 0;
                new.cached_sectors      = 0;
                new.dirty_sectors       = 0;
                new.copygc              = 0;
                new.gen++;
        }));
 
-       BUG_ON(old.dirty_sectors);
+       bucket_stats_update(ca, old, new, &stats);
 
-       bucket_stats_update(ca, old, new, true, &stats);
+       BUG_ON(old.dirty_sectors);
 
        /*
         * Ick:
@@ -329,45 +352,45 @@ void bch_invalidate_bucket(struct cache *ca, struct bucket *g)
 
 void bch_mark_free_bucket(struct cache *ca, struct bucket *g)
 {
-       struct bucket_stats_cache_set stats = { 0 };
        struct bucket_mark old, new;
 
-       old = bucket_cmpxchg(g, new, ({
+       old = bucket_data_cmpxchg(ca, g, new, ({
                new.owned_by_allocator  = 0;
-               new.is_metadata         = 0;
+               new.data_type           = 0;
                new.cached_sectors      = 0;
                new.dirty_sectors       = 0;
        }));
 
-       bucket_stats_update(ca, old, new, false, &stats);
+       BUG_ON(bucket_became_unavailable(ca->set, old, new));
 }
 
 void bch_mark_alloc_bucket(struct cache *ca, struct bucket *g,
                           bool owned_by_allocator)
 {
-       struct bucket_stats_cache_set stats = { 0 };
-       struct bucket_mark old, new;
+       struct bucket_mark new;
 
-       old = bucket_cmpxchg(g, new, new.owned_by_allocator = owned_by_allocator);
-
-       bucket_stats_update(ca, old, new, true, &stats);
+       bucket_data_cmpxchg(ca, g, new, ({
+               new.owned_by_allocator = owned_by_allocator;
+       }));
 }
 
 void bch_mark_metadata_bucket(struct cache *ca, struct bucket *g,
+                             enum bucket_data_type type,
                              bool may_make_unavailable)
 {
-       struct bucket_stats_cache_set stats = { 0 };
        struct bucket_mark old, new;
 
-       old = bucket_cmpxchg(g, new, ({
-               new.is_metadata = 1;
+       BUG_ON(!type);
+
+       old = bucket_data_cmpxchg(ca, g, new, ({
+               new.data_type = type;
                new.had_metadata = 1;
        }));
 
        BUG_ON(old.cached_sectors);
        BUG_ON(old.dirty_sectors);
-
-       bucket_stats_update(ca, old, new, may_make_unavailable, &stats);
+       BUG_ON(!may_make_unavailable &&
+              bucket_became_unavailable(ca->set, old, new));
 }
 
 #define saturated_add(ca, dst, src, max)                       \
@@ -487,22 +510,26 @@ static void bch_mark_pointer(struct cache_set *c,
 
                if (!new.dirty_sectors &&
                    !new.cached_sectors) {
-                       new.is_metadata = false;
+                       new.data_type   = 0;
 
                        if (journal_seq) {
-                               new.wait_on_journal = true;
+                               new.journal_seq_valid = 1;
                                new.journal_seq = journal_seq;
                        }
                } else {
-                       new.is_metadata = (type == S_META);
+                       new.data_type = type == S_META
+                               ? BUCKET_BTREE : BUCKET_DATA;
                }
 
-               new.had_metadata |= new.is_metadata;
+               new.had_metadata |= is_meta_bucket(new);
        } while ((v = cmpxchg(&g->_mark.counter,
                              old.counter,
                              new.counter)) != old.counter);
 
-       bucket_stats_update(ca, old, new, may_make_unavailable, NULL);
+       bucket_stats_update(ca, old, new, NULL);
+
+       BUG_ON(!may_make_unavailable &&
+              bucket_became_unavailable(c, old, new));
 
        if (saturated &&
            atomic_long_add_return(saturated,
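
With is_metadata replaced by a 3-bit data_type, every mark update above still funnels through a single compare-and-swap on the packed bucket_mark word (bucket_cmpxchg, plus the new bucket_data_cmpxchg wrapper that also feeds bucket_stats_update). A standalone userspace sketch of that retry pattern, using C11 atomics in place of the kernel's cmpxchg() and a simplified mark layout; the field and function names here are illustrative, not the tree's own:

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Simplified stand-in for struct bucket_mark: everything packs into one u64. */
    union mark {
        struct {
            uint8_t  gen;
            uint8_t  data_type;
            uint16_t dirty_sectors;
            uint16_t cached_sectors;
            uint16_t journal_seq;
        };
        uint64_t counter;
    };

    struct bucket {
        _Atomic uint64_t mark;
    };

    /*
     * Retry loop equivalent to bucket_cmpxchg(): read the old mark, build the
     * new one, and publish it only if nobody raced with us; on failure the CAS
     * refreshes 'old' and we retry.  The old value is returned so the caller
     * can account the delta, as bucket_stats_update() does.
     */
    static union mark bucket_add_dirty(struct bucket *g, uint16_t sectors)
    {
        union mark old, new;

        old.counter = atomic_load(&g->mark);
        do {
            new = old;
            new.dirty_sectors += sectors;
        } while (!atomic_compare_exchange_weak(&g->mark, &old.counter, new.counter));

        return old;
    }

    int main(void)
    {
        struct bucket g = { .mark = 0 };
        union mark old = bucket_add_dirty(&g, 8);
        union mark cur = { .counter = atomic_load(&g.mark) };

        printf("dirty_sectors went from %u to %u\n",
               (unsigned) old.dirty_sectors, (unsigned) cur.dirty_sectors);
        return 0;
    }
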
index 9c6e438561fdb9b6995695b97c4fddcbbc0f0a4c..6d70103efb42a191b703972c7765dc170b2dd0e1 100644 (file)
--- a/libbcache/buckets.h
+++ b/libbcache/buckets.h
@@ -235,8 +235,16 @@ static inline u64 sectors_available(struct cache_set *c)
 static inline bool is_available_bucket(struct bucket_mark mark)
 {
        return (!mark.owned_by_allocator &&
-               !mark.is_metadata &&
-               !mark.dirty_sectors);
+               mark.data_type == BUCKET_DATA &&
+               !mark.dirty_sectors &&
+               !mark.nouse);
+}
+
+static inline bool bucket_needs_journal_commit(struct bucket_mark m,
+                                              u16 last_seq_ondisk)
+{
+       return m.journal_seq_valid &&
+               ((s16) m.journal_seq - (s16) last_seq_ondisk > 0);
 }
 
 void bch_bucket_seq_cleanup(struct cache_set *);
@@ -244,7 +252,8 @@ void bch_bucket_seq_cleanup(struct cache_set *);
 void bch_invalidate_bucket(struct cache *, struct bucket *);
 void bch_mark_free_bucket(struct cache *, struct bucket *);
 void bch_mark_alloc_bucket(struct cache *, struct bucket *, bool);
-void bch_mark_metadata_bucket(struct cache *, struct bucket *, bool);
+void bch_mark_metadata_bucket(struct cache *, struct bucket *,
+                             enum bucket_data_type, bool);
 
 void __bch_gc_mark_key(struct cache_set *, struct bkey_s_c, s64, bool,
                       struct bucket_stats_cache_set *);
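
The new bucket_needs_journal_commit() compares two 16-bit journal sequence numbers in a circular sequence space: casting both to s16 before subtracting keeps the test correct across wraparound, as long as the two values stay within 2^15 of each other. A small self-contained check of that idiom (illustrative only, assumes the usual two's-complement behaviour the kernel relies on):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /*
     * Mirrors bucket_needs_journal_commit(): true if the bucket's journal_seq
     * is newer than the last sequence number known to be on disk, treating the
     * u16 values as a circular sequence space.
     */
    static bool seq_newer(uint16_t journal_seq, uint16_t last_seq_ondisk)
    {
        return (int16_t) journal_seq - (int16_t) last_seq_ondisk > 0;
    }

    int main(void)
    {
        printf("%d\n", seq_newer(105, 100));   /* 1: not yet on disk */
        printf("%d\n", seq_newer(3, 65530));   /* 1: still newer, despite wraparound */
        printf("%d\n", seq_newer(100, 105));   /* 0: already committed */
        return 0;
    }
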
index 6bbdcd26fba4e63c1b24ec5be5de1d69f2709ba4..f42e09d8a0b9ee2291ea27861c1644fa8e3558fa 100644 (file)
--- a/libbcache/buckets_types.h
+++ b/libbcache/buckets_types.h
@@ -1,6 +1,14 @@
 #ifndef _BUCKETS_TYPES_H
 #define _BUCKETS_TYPES_H
 
+enum bucket_data_type {
+       BUCKET_DATA     = 0,
+       BUCKET_BTREE,
+       BUCKET_PRIOS,
+       BUCKET_JOURNAL,
+       BUCKET_SB,
+};
+
 struct bucket_mark {
        union {
        struct {
@@ -12,23 +20,30 @@ struct bucket_mark {
 
                /* generation copygc is going to move this bucket into */
                unsigned        copygc:1;
-               unsigned        wait_on_journal:1;
+
+               unsigned        journal_seq_valid:1;
 
                /*
-                * If this bucket ever had metadata in it, the allocator must
-                * increment its gen before we reuse it:
+                * If this bucket had metadata while at the current generation
+                * number, the allocator must increment its gen before we reuse
+                * it:
                 */
                unsigned        had_metadata:1;
 
                unsigned        owned_by_allocator:1;
-               unsigned        is_metadata:1;
 
-               u16             cached_sectors;
+               unsigned        data_type:3;
+
+               unsigned        nouse:1;
+
                u16             dirty_sectors;
+               u16             cached_sectors;
 
                /*
                 * low bits of journal sequence number when this bucket was most
-                * recently modified:
+                * recently modified: if journal_seq_valid is set, this bucket
+                * can't be reused until the journal sequence number written to
+                * disk is >= the bucket's journal sequence number:
                 */
                u16             journal_seq;
        };
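
Counting the bits of the fields shown in this hunk, plus an assumed 8-bit gen field above it, the mark still packs into 64 bits after widening data_type to 3 bits and adding nouse, which is what lets bucket_cmpxchg() update it atomically. A compile-time sanity check of that arithmetic; this is only bookkeeping, not the real declaration (the sector counts and journal_seq are plain u16 members in the tree, and u64 bit-fields rely on the GCC/Clang extension the kernel also uses):

    #include <assert.h>
    #include <stdint.h>

    struct bucket_mark_sketch {
        uint64_t gen:8;                 /* assumed width; sits above this hunk */
        uint64_t copygc:1;
        uint64_t journal_seq_valid:1;
        uint64_t had_metadata:1;
        uint64_t owned_by_allocator:1;
        uint64_t data_type:3;
        uint64_t nouse:1;
        uint64_t dirty_sectors:16;
        uint64_t cached_sectors:16;
        uint64_t journal_seq:16;
    };

    /* 8+1+1+1+1+3+1+16+16+16 = 64: one machine word, one cmpxchg. */
    static_assert(sizeof(struct bucket_mark_sketch) == sizeof(uint64_t),
                  "bucket mark must pack into one u64");
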
index b142d7b27a291eb622b919aef7fc9a7691dfc25d..049aa9108d47594f43f748d93e9d92dd626e94a2 100644 (file)
--- a/libbcache/chardev.c
+++ b/libbcache/chardev.c
@@ -107,7 +107,7 @@ static long bch_global_ioctl(unsigned cmd, void __user *arg)
 
 static long bch_ioctl_stop(struct cache_set *c)
 {
-       bch_fs_stop(c);
+       bch_fs_stop_async(c);
        return 0;
 }
 
index dae52d497b1a009ef9a5baf5a0e44d68fe52eb0e..92036db4d676339d56a3690d789f64a8efad2bf1 100644 (file)
--- a/libbcache/checksum.c
+++ b/libbcache/checksum.c
@@ -539,15 +539,12 @@ int bch_enable_encryption(struct cache_set *c, bool keyed)
        if (ret)
                goto err;
 
-       crypt = container_of_or_null(bch_fs_sb_field_resize(c, NULL,
-                                               sizeof(*crypt) / sizeof(u64)),
-                                    struct bch_sb_field_crypt, field);
+       crypt = bch_fs_sb_resize_crypt(c, sizeof(*crypt) / sizeof(u64));
        if (!crypt) {
                ret = -ENOMEM; /* XXX this technically could be -ENOSPC */
                goto err;
        }
 
-       crypt->field.type = BCH_SB_FIELD_crypt;
        crypt->key = key;
 
        /* write superblock */
@@ -560,7 +557,7 @@ err:
        return ret;
 }
 
-void bch_fs_encryption_free(struct cache_set *c)
+void bch_fs_encryption_exit(struct cache_set *c)
 {
        if (!IS_ERR_OR_NULL(c->poly1305))
                crypto_free_shash(c->poly1305);
index 137c9155078931b54e5df114e0fc65b62bf58be4..9d4da08d79fa9f3f110bc1e15914f1951d095ce8 100644 (file)
--- a/libbcache/checksum.h
+++ b/libbcache/checksum.h
@@ -43,7 +43,7 @@ void bch_encrypt_bio(struct cache_set *, unsigned,
 int bch_disable_encryption(struct cache_set *);
 int bch_enable_encryption(struct cache_set *, bool);
 
-void bch_fs_encryption_free(struct cache_set *);
+void bch_fs_encryption_exit(struct cache_set *);
 int bch_fs_encryption_init(struct cache_set *);
 
 static inline unsigned bch_data_checksum_type(struct cache_set *c)
index f81a81431d5dcefb69e25b71073f080a585bbf72..89da31e533e859e1cf4aad4ed1ab470f47b7f0f6 100644 (file)
--- a/libbcache/compress.c
+++ b/libbcache/compress.c
@@ -434,10 +434,10 @@ int bch_check_set_has_compressed_data(struct cache_set *c,
                break;
        }
 
-       return bch_compress_init(c);
+       return bch_fs_compress_init(c);
 }
 
-void bch_compress_free(struct cache_set *c)
+void bch_fs_compress_exit(struct cache_set *c)
 {
        vfree(c->zlib_workspace);
        mempool_exit(&c->lz4_workspace_pool);
@@ -450,15 +450,11 @@ void bch_compress_free(struct cache_set *c)
        max_t(size_t, zlib_inflate_workspacesize(),                     \
              zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL))
 
-int bch_compress_init(struct cache_set *c)
+int bch_fs_compress_init(struct cache_set *c)
 {
        unsigned order = get_order(BCH_ENCODED_EXTENT_MAX << 9);
        int ret, cpu;
 
-       if (!bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) &&
-           !bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
-               return 0;
-
        if (!c->bio_decompress_worker) {
                c->bio_decompress_worker = alloc_percpu(*c->bio_decompress_worker);
                if (!c->bio_decompress_worker)
@@ -474,6 +470,10 @@ int bch_compress_init(struct cache_set *c)
                }
        }
 
+       if (!bch_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) &&
+           !bch_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
+               return 0;
+
        if (!mempool_initialized(&c->compression_bounce[READ])) {
                ret = mempool_init_page_pool(&c->compression_bounce[READ],
                                             1, order);
index 485acd95249e7a81d0996e759a7ddd06a0bee0c0..4604b0659dd3beff768aa9fda041a184cf0d3bb9 100644 (file)
--- a/libbcache/compress.h
+++ b/libbcache/compress.h
@@ -9,7 +9,7 @@ void bch_bio_compress(struct cache_set *, struct bio *, size_t *,
                      struct bio *, size_t *, unsigned *);
 
 int bch_check_set_has_compressed_data(struct cache_set *, unsigned);
-void bch_compress_free(struct cache_set *);
-int bch_compress_init(struct cache_set *);
+void bch_fs_compress_exit(struct cache_set *);
+int bch_fs_compress_init(struct cache_set *);
 
 #endif /* _BCACHE_COMPRESS_H */
index d25c32aea29aba99b3cebd833b2a581b8d9ab06e..16cc72b9a84a679c296321e04a0e77453ae36ad9 100644 (file)
--- a/libbcache/debug.c
+++ b/libbcache/debug.c
@@ -409,13 +409,13 @@ static const struct file_operations bfloat_failed_debug_ops = {
        .read           = bch_read_bfloat_failed,
 };
 
-void bch_debug_exit_cache_set(struct cache_set *c)
+void bch_fs_debug_exit(struct cache_set *c)
 {
        if (!IS_ERR_OR_NULL(c->debug))
                debugfs_remove_recursive(c->debug);
 }
 
-void bch_debug_init_cache_set(struct cache_set *c)
+void bch_fs_debug_init(struct cache_set *c)
 {
        struct btree_debug *bd;
        char name[100];
@@ -432,18 +432,18 @@ void bch_debug_init_cache_set(struct cache_set *c)
             bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
             bd++) {
                bd->id = bd - c->btree_debug;
-               bd->btree = debugfs_create_file(bch_btree_id_names[bd->id],
+               bd->btree = debugfs_create_file(bch_btree_ids[bd->id],
                                                0400, c->debug, bd,
                                                &btree_debug_ops);
 
                snprintf(name, sizeof(name), "%s-formats",
-                        bch_btree_id_names[bd->id]);
+                        bch_btree_ids[bd->id]);
 
                bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd,
                                                       &btree_format_debug_ops);
 
                snprintf(name, sizeof(name), "%s-bfloat-failed",
-                        bch_btree_id_names[bd->id]);
+                        bch_btree_ids[bd->id]);
 
                bd->failed = debugfs_create_file(name, 0400, c->debug, bd,
                                                 &bfloat_failed_debug_ops);
index a3635e609436f32d2e70eb32075fab521ab0881d..d34a95a023c2c231674cd59519ac943ea30c6c2f 100644 (file)
--- a/libbcache/debug.h
+++ b/libbcache/debug.h
@@ -52,11 +52,11 @@ static inline void bch_btree_verify(struct cache_set *c, struct btree *b)
 }
 
 #ifdef CONFIG_DEBUG_FS
-void bch_debug_exit_cache_set(struct cache_set *);
-void bch_debug_init_cache_set(struct cache_set *);
+void bch_fs_debug_exit(struct cache_set *);
+void bch_fs_debug_init(struct cache_set *);
 #else
-static inline void bch_debug_exit_cache_set(struct cache_set *c) {}
-static inline void bch_debug_init_cache_set(struct cache_set *c) {}
+static inline void bch_fs_debug_exit(struct cache_set *c) {}
+static inline void bch_fs_debug_init(struct cache_set *c) {}
 #endif
 
 void bch_debug_exit(void);
index 9f39be1b6349f4c5b1dd38fd687aa91027007114..f4109da6ebebfe66cfec8332d7164588c7a9a604 100644 (file)
--- a/libbcache/error.c
+++ b/libbcache/error.c
@@ -14,7 +14,7 @@ void bch_inconsistent_error(struct cache_set *c)
        case BCH_ON_ERROR_RO:
                if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {
                        /* XXX do something better here? */
-                       bch_fs_stop(c);
+                       bch_fs_stop_async(c);
                        return;
                }
 
@@ -120,7 +120,7 @@ void bch_nonfatal_io_error_work(struct work_struct *work)
        } else {
                bch_notify_dev_error(ca, true);
 
-               mutex_lock(&bch_register_lock);
+               mutex_lock(&c->state_lock);
                dev = bch_dev_may_remove(ca);
                if (dev
                    ? bch_dev_read_only(ca)
@@ -129,7 +129,7 @@ void bch_nonfatal_io_error_work(struct work_struct *work)
                                "too many IO errors on %s, setting %s RO",
                                bdevname(ca->disk_sb.bdev, buf),
                                dev ? "device" : "filesystem");
-               mutex_unlock(&bch_register_lock);
+               mutex_unlock(&c->state_lock);
        }
 }
 
index 523f3f4870b6a02caf625eba72f2df24ffbe0ef8..c5e0e37584922728b9e88e1874d0406ea8baa150 100644 (file)
--- a/libbcache/extents.c
+++ b/libbcache/extents.c
@@ -547,7 +547,7 @@ static void btree_ptr_debugcheck(struct cache_set *c, struct btree *b,
                        do {
                                seq = read_seqcount_begin(&c->gc_pos_lock);
                                bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
-                                      !g->mark.is_metadata;
+                                      g->mark.data_type != BUCKET_BTREE;
                        } while (read_seqcount_retry(&c->gc_pos_lock, seq));
 
                        err = "inconsistent";
@@ -602,6 +602,7 @@ bch_btree_pick_ptr(struct cache_set *c, const struct btree *b)
        struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
        const union bch_extent_crc *crc;
        const struct bch_extent_ptr *ptr;
+       struct extent_pick_ptr pick = { .ca = NULL };
        struct cache *ca;
 
        rcu_read_lock();
@@ -621,15 +622,19 @@ bch_btree_pick_ptr(struct cache_set *c, const struct btree *b)
                                PTR_BUCKET_NR(ca, ptr)))
                        continue;
 
-               percpu_ref_get(&ca->ref);
-               rcu_read_unlock();
+               if (pick.ca && pick.ca->mi.tier < ca->mi.tier)
+                       continue;
 
-               return (struct extent_pick_ptr) { .ptr = *ptr, .ca = ca };
+               pick.ca         = ca;
+               pick.ptr        = *ptr;
        }
 
+       if (pick.ca)
+               percpu_ref_get(&pick.ca->ref);
+
        rcu_read_unlock();
 
-       return (struct extent_pick_ptr) { .ca = NULL, };
+       return pick;
 }
 
 const struct bkey_ops bch_bkey_btree_ops = {
@@ -1880,7 +1885,7 @@ static void bch_extent_debugcheck_extent(struct cache_set *c, struct btree *b,
                                if (stale)
                                        break;
 
-                               bad = (mark.is_metadata ||
+                               bad = (mark.data_type != BUCKET_DATA ||
                                       (gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
                                        !mark.owned_by_allocator &&
                                        !(ptr->cached
@@ -2193,17 +2198,21 @@ void bch_extent_pick_ptr_avoiding(struct cache_set *c, struct bkey_s_c k,
                rcu_read_lock();
                ret->ca = NULL;
 
-               extent_for_each_online_device_crc(c, e, crc, ptr, ca)
-                       if (!ptr_stale(ca, ptr)) {
-                               *ret = (struct extent_pick_ptr) {
-                                       .crc = crc_to_128(e.k, crc),
-                                       .ptr = *ptr,
-                                       .ca = ca,
-                               };
-
-                               if (ca != avoid)
-                                       break;
-                       }
+               extent_for_each_online_device_crc(c, e, crc, ptr, ca) {
+                       if (ptr_stale(ca, ptr))
+                               continue;
+
+                       if (ret->ca &&
+                           (ca == avoid ||
+                            ret->ca->mi.tier < ca->mi.tier))
+                               continue;
+
+                       *ret = (struct extent_pick_ptr) {
+                               .crc = crc_to_128(e.k, crc),
+                               .ptr = *ptr,
+                               .ca = ca,
+                       };
+               }
 
                if (ret->ca)
                        percpu_ref_get(&ret->ca->ref);
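
Both bch_btree_pick_ptr() and bch_extent_pick_ptr_avoiding() now scan every replica and keep the one on the lowest (fastest) tier, instead of returning the first usable pointer they find. A simplified, standalone version of that selection rule over a plain array; the types and names below are hypothetical stand-ins:

    #include <stddef.h>
    #include <stdio.h>

    struct dev  { unsigned tier; int stale; };
    struct pick { const struct dev *dev; size_t idx; };

    /*
     * Same rule as the reworked bch_btree_pick_ptr(): walk every replica, skip
     * stale ones, and keep the replica whose device sits on the lowest tier;
     * on a tie the last one seen wins.
     */
    static struct pick pick_lowest_tier(const struct dev *devs, size_t nr)
    {
        struct pick pick = { .dev = NULL };
        size_t i;

        for (i = 0; i < nr; i++) {
            if (devs[i].stale)
                continue;

            if (pick.dev && pick.dev->tier < devs[i].tier)
                continue;       /* current pick is already on a faster tier */

            pick.dev = &devs[i];
            pick.idx = i;
        }

        return pick;
    }

    int main(void)
    {
        struct dev devs[] = {
            { .tier = 1 },                  /* slow replica */
            { .tier = 0, .stale = 1 },      /* fast but stale */
            { .tier = 0 },                  /* fast and usable */
        };
        struct pick p = pick_lowest_tier(devs, 3);

        printf("picked replica %zu on tier %u\n", p.idx, p.dev->tier);
        return 0;
    }
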
index e9585fd5ebd3ced76d9a5917c37e33df2c9361bf..e2f1427f264a0fb2910fac4f58c00350db856ee3 100644 (file)
--- a/libbcache/fs-gc.c
+++ b/libbcache/fs-gc.c
@@ -545,9 +545,9 @@ struct nlink {
        u32     dir_count;
 };
 
-DECLARE_GENRADIX_TYPE(nlinks, struct nlink);
+typedef GENRADIX(struct nlink) nlink_table;
 
-static void inc_link(struct cache_set *c, struct nlinks *links,
+static void inc_link(struct cache_set *c, nlink_table *links,
                     u64 range_start, u64 *range_end,
                     u64 inum, bool dir)
 {
@@ -570,7 +570,7 @@ static void inc_link(struct cache_set *c, struct nlinks *links,
 }
 
 noinline_for_stack
-static int bch_gc_walk_dirents(struct cache_set *c, struct nlinks *links,
+static int bch_gc_walk_dirents(struct cache_set *c, nlink_table *links,
                               u64 range_start, u64 *range_end)
 {
        struct btree_iter iter;
@@ -776,7 +776,7 @@ fsck_err:
 noinline_for_stack
 static int bch_gc_walk_inodes(struct cache_set *c,
                              struct bch_inode_unpacked *lostfound_inode,
-                             struct nlinks *links,
+                             nlink_table *links,
                              u64 range_start, u64 range_end)
 {
        struct btree_iter iter;
@@ -850,7 +850,7 @@ noinline_for_stack
 static int check_inode_nlinks(struct cache_set *c,
                              struct bch_inode_unpacked *lostfound_inode)
 {
-       struct nlinks links;
+       nlink_table links;
        u64 this_iter_range_start, next_iter_range_start = 0;
        int ret = 0;
 
index ab0d9728bc0d8b51f5be9d1f81a37c449f2b4b49..ec70a3e39f75cae65ae278a2ed99056b7e165a27 100644 (file)
--- a/libbcache/fs.c
+++ b/libbcache/fs.c
@@ -1257,13 +1257,17 @@ static struct cache_set *bch_open_as_blockdevs(const char *_dev_name,
                if (!c)
                        goto err_unlock;
 
-               if (!test_bit(BCH_FS_RUNNING, &c->flags)) {
+               mutex_lock(&c->state_lock);
+
+               if (!bch_fs_running(c)) {
+                       mutex_unlock(&c->state_lock);
                        err = "incomplete cache set";
                        c = NULL;
                        goto err_unlock;
                }
 
                closure_get(&c->cl);
+               mutex_unlock(&c->state_lock);
                mutex_unlock(&bch_register_lock);
        }
 
@@ -1291,22 +1295,19 @@ static int bch_remount(struct super_block *sb, int *flags, char *data)
        if (ret)
                return ret;
 
-       mutex_lock(&bch_register_lock);
-
        if (opts.read_only >= 0 &&
            opts.read_only != c->opts.read_only) {
                const char *err = NULL;
 
                if (opts.read_only) {
-                       bch_fs_read_only_sync(c);
+                       bch_fs_read_only(c);
 
                        sb->s_flags |= MS_RDONLY;
                } else {
                        err = bch_fs_read_write(c);
                        if (err) {
                                bch_err(c, "error going rw: %s", err);
-                               ret = -EINVAL;
-                               goto unlock;
+                               return -EINVAL;
                        }
 
                        sb->s_flags &= ~MS_RDONLY;
@@ -1318,9 +1319,6 @@ static int bch_remount(struct super_block *sb, int *flags, char *data)
        if (opts.errors >= 0)
                c->opts.errors = opts.errors;
 
-unlock:
-       mutex_unlock(&bch_register_lock);
-
        return ret;
 }
 
@@ -1449,7 +1447,7 @@ static void bch_kill_sb(struct super_block *sb)
        generic_shutdown_super(sb);
 
        if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags))
-               bch_fs_stop_sync(c);
+               bch_fs_stop(c);
        else
                closure_put(&c->cl);
 }
@@ -1464,7 +1462,7 @@ static struct file_system_type bcache_fs_type = {
 
 MODULE_ALIAS_FS("bcache");
 
-void bch_fs_exit(void)
+void bch_vfs_exit(void)
 {
        unregister_filesystem(&bcache_fs_type);
        if (bch_dio_write_bioset)
@@ -1477,7 +1475,7 @@ void bch_fs_exit(void)
                kmem_cache_destroy(bch_inode_cache);
 }
 
-int __init bch_fs_init(void)
+int __init bch_vfs_init(void)
 {
        int ret = -ENOMEM;
 
@@ -1504,6 +1502,6 @@ int __init bch_fs_init(void)
 
        return 0;
 err:
-       bch_fs_exit();
+       bch_vfs_exit();
        return ret;
 }
index 933fb6ded96e4ff4dd4be274f9fff3911830ed9e..2a29b1329d386f66c476699be99be34486fb5222 100644 (file)
--- a/libbcache/fs.h
+++ b/libbcache/fs.h
@@ -52,13 +52,13 @@ int __must_check __bch_write_inode(struct cache_set *, struct bch_inode_info *,
 int __must_check bch_write_inode(struct cache_set *,
                                 struct bch_inode_info *);
 
-void bch_fs_exit(void);
-int bch_fs_init(void);
+void bch_vfs_exit(void);
+int bch_vfs_init(void);
 
 #else
 
-static inline void bch_fs_exit(void) {}
-static inline int bch_fs_init(void) { return 0; }
+static inline void bch_vfs_exit(void) {}
+static inline int bch_vfs_init(void) { return 0; }
 
 #endif
 
index be99a973f7c5c3c048fe6c1c82a0af38c2cd4965..a3df379499867de1f704d9e0f48740b5b58ed6e0 100644 (file)
--- a/libbcache/io.c
+++ b/libbcache/io.c
@@ -722,9 +722,7 @@ void bch_wake_delayed_writes(unsigned long data)
        spin_lock_irqsave(&c->foreground_write_pd_lock, flags);
 
        while ((op = c->write_wait_head)) {
-               if (!test_bit(BCH_FS_RO, &c->flags) &&
-                   !test_bit(BCH_FS_STOPPING, &c->flags) &&
-                   time_after(op->expires, jiffies)) {
+               if (time_after(op->expires, jiffies)) {
                        mod_timer(&c->foreground_write_wakeup, op->expires);
                        break;
                }
@@ -1068,9 +1066,7 @@ static void __bch_read_endio(struct cache_set *c, struct bch_read_bio *rbio)
                return;
        }
 
-       if (rbio->promote &&
-           !test_bit(BCH_FS_RO, &c->flags) &&
-           !test_bit(BCH_FS_STOPPING, &c->flags)) {
+       if (rbio->promote) {
                struct cache_promote_op *promote = rbio->promote;
                struct closure *cl = &promote->cl;
 
@@ -1133,13 +1129,26 @@ static void bch_read_endio(struct bio *bio)
                preempt_disable();
                d = this_cpu_ptr(c->bio_decompress_worker);
                llist_add(&rbio->list, &d->bio_list);
-               queue_work(system_unbound_wq, &d->work);
+               queue_work(system_highpri_wq, &d->work);
                preempt_enable();
        } else {
                __bch_read_endio(c, rbio);
        }
 }
 
+static bool should_promote(struct cache_set *c,
+                          struct extent_pick_ptr *pick, unsigned flags)
+{
+       if (!(flags & BCH_READ_PROMOTE))
+               return false;
+
+       if (percpu_ref_is_dying(&c->writes))
+               return false;
+
+       return c->fastest_tier &&
+               c->fastest_tier < c->tiers + pick->ca->mi.tier;
+}
+
 void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig,
                          struct bvec_iter iter, struct bkey_s_c k,
                          struct extent_pick_ptr *pick, unsigned flags)
@@ -1158,7 +1167,7 @@ void bch_read_extent_iter(struct cache_set *c, struct bch_read_bio *orig,
         * XXX: multiple promotes can race with each other, wastefully. Keep a
         * list of outstanding promotes?
         */
-       if ((flags & BCH_READ_PROMOTE) && pick->ca->mi.tier) {
+       if (should_promote(c, pick, flags)) {
                /*
                 * biovec needs to be big enough to hold decompressed data, if
                 * the bch_write_extent() has to decompress/recompress it:
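
The new should_promote() gates promotion on three things: the caller asked for it, the filesystem is still accepting writes (percpu_ref_is_dying(&c->writes) is false), and the copy that was read lives on a slower tier than the fastest one; the pointer comparison c->fastest_tier < c->tiers + pick->ca->mi.tier is effectively an index comparison into the tiers array. An index-based sketch of that condition, with simplified types and illustrative names:

    #include <stdbool.h>
    #include <stdio.h>

    /* Simplified stand-ins: tiers are plain indices here, 0 being the fastest. */
    struct fs {
        bool     writes_dying;  /* stands in for percpu_ref_is_dying(&c->writes) */
        bool     have_fastest;  /* stands in for c->fastest_tier != NULL */
        unsigned fastest_tier;  /* index of the fastest tier with devices */
    };

    #define READ_FLAG_PROMOTE 1u

    /* Promote a read into the fast tier only when asked to, while writes are
     * still allowed, and when the copy we read came from a slower tier. */
    static bool should_promote(const struct fs *c, unsigned dev_tier, unsigned flags)
    {
        if (!(flags & READ_FLAG_PROMOTE))
            return false;

        if (c->writes_dying)
            return false;

        return c->have_fastest && c->fastest_tier < dev_tier;
    }

    int main(void)
    {
        struct fs c = { .have_fastest = true, .fastest_tier = 0 };

        printf("%d\n", should_promote(&c, 1, READ_FLAG_PROMOTE));  /* 1: slow copy */
        printf("%d\n", should_promote(&c, 0, READ_FLAG_PROMOTE));  /* 0: already fast */
        return 0;
    }
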
index 99dd9f2685a7a83c491c478088f9ba5166f85e3e..b28383763fc3e2c608db351f31dfd5275d6b648a 100644 (file)
--- a/libbcache/journal.c
+++ b/libbcache/journal.c
@@ -545,8 +545,7 @@ static int journal_entry_validate(struct cache_set *c,
                return BCH_FSCK_UNKNOWN_VERSION;
        }
 
-       if (mustfix_fsck_err_on(bytes > bucket_sectors_left << 9 ||
-                       bytes > c->journal.entry_size_max, c,
+       if (mustfix_fsck_err_on(bytes > bucket_sectors_left << 9, c,
                        "journal entry too big (%zu bytes), sector %lluu",
                        bytes, sector)) {
                /* XXX: note we might have missing journal entries */
@@ -1406,13 +1405,7 @@ void bch_journal_start(struct cache_set *c)
 {
        struct journal *j = &c->journal;
        struct journal_seq_blacklist *bl;
-       struct cache *ca;
        u64 new_seq = 0;
-       unsigned i;
-
-       for_each_cache(ca, c, i)
-               if (is_journal_device(ca))
-                       bch_dev_group_add(&c->journal.devs, ca);
 
        list_for_each_entry(bl, &j->seq_blacklist, list)
                new_seq = max(new_seq, bl->seq);
@@ -1534,48 +1527,111 @@ err:
        return ret;
 }
 
-static int bch_set_nr_journal_buckets(struct cache *ca, unsigned nr)
+static int bch_set_nr_journal_buckets(struct cache_set *c, struct cache *ca,
+                                     unsigned nr, bool write_super)
 {
+       struct journal *j = &c->journal;
        struct journal_device *ja = &ca->journal;
-       struct bch_sb_field_journal *journal_buckets =
-               bch_sb_get_journal(ca->disk_sb.sb);
-       struct bch_sb_field *f;
-       u64 *p;
+       struct bch_sb_field_journal *journal_buckets;
+       struct disk_reservation disk_res = { 0, 0 };
+       struct closure cl;
+       u64 *new_bucket_seq = NULL, *new_buckets = NULL;
+       int ret = 0;
 
-       p = krealloc(ja->bucket_seq, nr * sizeof(u64),
-                    GFP_KERNEL|__GFP_ZERO);
-       if (!p)
-               return -ENOMEM;
+       closure_init_stack(&cl);
 
-       ja->bucket_seq = p;
+       mutex_lock(&c->sb_lock);
 
-       p = krealloc(ja->buckets, nr * sizeof(u64),
-                    GFP_KERNEL|__GFP_ZERO);
-       if (!p)
-               return -ENOMEM;
+       /* don't handle reducing nr of buckets yet: */
+       if (nr <= ja->nr)
+               goto err;
 
-       ja->buckets = p;
+       /*
+        * note: journal buckets aren't really counted as _sectors_ used yet, so
+        * we don't need the disk reservation to avoid the BUG_ON() in buckets.c
+        * when space used goes up without a reservation - but we do need the
+        * reservation to ensure we'll actually be able to allocate:
+        */
 
-       f = bch_dev_sb_field_resize(&ca->disk_sb, &journal_buckets->field, nr +
-                                   sizeof(*journal_buckets) / sizeof(u64));
-       if (!f)
-               return -ENOMEM;
-       f->type = BCH_SB_FIELD_journal;
+       ret = ENOSPC;
+       if (bch_disk_reservation_get(c, &disk_res,
+                       (nr - ja->nr) << ca->bucket_bits, 0))
+               goto err;
 
-       ja->nr = nr;
-       return 0;
+       ret = -ENOMEM;
+       new_buckets     = kzalloc(nr * sizeof(u64), GFP_KERNEL);
+       new_bucket_seq  = kzalloc(nr * sizeof(u64), GFP_KERNEL);
+       if (!new_buckets || !new_bucket_seq)
+               goto err;
+
+       journal_buckets = bch_sb_resize_journal(&ca->disk_sb,
+                               nr + sizeof(*journal_buckets) / sizeof(u64));
+       if (!journal_buckets)
+               goto err;
+
+       spin_lock(&j->lock);
+       memcpy(new_buckets,     ja->buckets,    ja->nr * sizeof(u64));
+       memcpy(new_bucket_seq,  ja->bucket_seq, ja->nr * sizeof(u64));
+       swap(new_buckets,       ja->buckets);
+       swap(new_bucket_seq,    ja->bucket_seq);
+
+       while (ja->nr < nr) {
+               /* must happen under journal lock, to avoid racing with gc: */
+               u64 b = bch_bucket_alloc(ca, RESERVE_NONE);
+               if (!b) {
+                       if (!closure_wait(&c->freelist_wait, &cl)) {
+                               spin_unlock(&j->lock);
+                               closure_sync(&cl);
+                               spin_lock(&j->lock);
+                       }
+                       continue;
+               }
+
+               bch_mark_metadata_bucket(ca, &ca->buckets[b],
+                                        BUCKET_JOURNAL, false);
+               bch_mark_alloc_bucket(ca, &ca->buckets[b], false);
+
+               memmove(ja->buckets + ja->last_idx + 1,
+                       ja->buckets + ja->last_idx,
+                       (ja->nr - ja->last_idx) * sizeof(u64));
+               memmove(ja->bucket_seq + ja->last_idx + 1,
+                       ja->bucket_seq + ja->last_idx,
+                       (ja->nr - ja->last_idx) * sizeof(u64));
+               memmove(journal_buckets->buckets + ja->last_idx + 1,
+                       journal_buckets->buckets + ja->last_idx,
+                       (ja->nr - ja->last_idx) * sizeof(u64));
+
+               ja->buckets[ja->last_idx] = b;
+               journal_buckets->buckets[ja->last_idx] = cpu_to_le64(b);
+
+               if (ja->last_idx < ja->nr) {
+                       if (ja->cur_idx >= ja->last_idx)
+                               ja->cur_idx++;
+                       ja->last_idx++;
+               }
+               ja->nr++;
+
+       }
+       spin_unlock(&j->lock);
+
+       BUG_ON(bch_validate_journal_layout(ca->disk_sb.sb, ca->mi));
+
+       if (write_super)
+               bch_write_super(c);
+
+       ret = 0;
+err:
+       mutex_unlock(&c->sb_lock);
+
+       kfree(new_bucket_seq);
+       kfree(new_buckets);
+       bch_disk_reservation_put(c, &disk_res);
+
+       return ret;
 }
 
 int bch_dev_journal_alloc(struct cache *ca)
 {
-       struct journal_device *ja = &ca->journal;
-       struct bch_sb_field_journal *journal_buckets;
-       int ret;
-       unsigned i;
-
-       if (ca->mi.tier != 0)
-               return 0;
-
        if (dynamic_fault("bcache:add:journal_alloc"))
                return -ENOMEM;
 
@@ -1583,26 +1639,12 @@ int bch_dev_journal_alloc(struct cache *ca)
         * clamp journal size to 1024 buckets or 512MB (in sectors), whichever
         * is smaller:
         */
-       ret = bch_set_nr_journal_buckets(ca,
+       return bch_set_nr_journal_buckets(ca->set, ca,
                        clamp_t(unsigned, ca->mi.nbuckets >> 8,
                                BCH_JOURNAL_BUCKETS_MIN,
                                min(1 << 10,
-                                   (1 << 20) / ca->mi.bucket_size)));
-       if (ret)
-               return ret;
-
-       journal_buckets = bch_sb_get_journal(ca->disk_sb.sb);
-
-       for (i = 0; i < ja->nr; i++) {
-               u64 bucket = ca->mi.first_bucket + i;
-
-               ja->buckets[i] = bucket;
-               journal_buckets->buckets[i] = cpu_to_le64(bucket);
-
-               bch_mark_metadata_bucket(ca, &ca->buckets[bucket], true);
-       }
-
-       return 0;
+                                   (1 << 20) / ca->mi.bucket_size)),
+                       false);
 }
 
 /* Journalling */
@@ -1726,14 +1768,12 @@ void bch_journal_pin_add_if_older(struct journal *j,
             fifo_entry_idx(&j->pin, pin->pin_list))) {
                if (journal_pin_active(pin))
                        __journal_pin_drop(j, pin);
-               __journal_pin_add(j, src_pin->pin_list,
-                                 pin, NULL);
+               __journal_pin_add(j, src_pin->pin_list, pin, flush_fn);
        }
 
        spin_unlock_irq(&j->pin_lock);
 }
 
-
 static struct journal_entry_pin *
 journal_get_next_pin(struct journal *j, u64 seq_to_flush)
 {
@@ -1766,6 +1806,29 @@ journal_get_next_pin(struct journal *j, u64 seq_to_flush)
        return ret;
 }
 
+static bool journal_has_pins(struct journal *j)
+{
+       bool ret;
+
+       spin_lock(&j->lock);
+       journal_reclaim_fast(j);
+       ret = fifo_used(&j->pin) > 1 ||
+               atomic_read(&fifo_peek_front(&j->pin).count) > 1;
+       spin_unlock(&j->lock);
+
+       return ret;
+}
+
+void bch_journal_flush_pins(struct journal *j)
+{
+       struct journal_entry_pin *pin;
+
+       while ((pin = journal_get_next_pin(j, U64_MAX)))
+               pin->flush(j, pin);
+
+       wait_event(j->wait, !journal_has_pins(j) || bch_journal_error(j));
+}
+
 static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
 {
        bool ret;
@@ -1895,8 +1958,10 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
        struct cache_set *c = container_of(j, struct cache_set, journal);
        struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
        struct bch_extent_ptr *ptr;
+       struct journal_device *ja;
        struct cache *ca;
-       unsigned iter, replicas, replicas_want =
+       bool swapped;
+       unsigned i, replicas, replicas_want =
                READ_ONCE(c->opts.metadata_replicas);
 
        spin_lock(&j->lock);
@@ -1921,12 +1986,27 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
 
        replicas = bch_extent_nr_ptrs(e.c);
 
+       spin_lock(&j->devs.lock);
+
+       /* Sort by tier: */
+       do {
+               swapped = false;
+
+               for (i = 0; i + 1 < j->devs.nr; i++)
+                       if (j->devs.d[i + 0].dev->mi.tier >
+                           j->devs.d[i + 1].dev->mi.tier) {
+                               swap(j->devs.d[i], j->devs.d[i + 1]);
+                               swapped = true;
+                       }
+       } while (swapped);
+
        /*
-        * Determine location of the next journal write:
-        * XXX: sort caches by free journal space
+        * Pick devices for next journal write:
+        * XXX: sort devices by free journal space?
         */
-       group_for_each_cache_rcu(ca, &j->devs, iter) {
-               struct journal_device *ja = &ca->journal;
+       for (i = 0; i < j->devs.nr; i++) {
+               ca = j->devs.d[i].dev;
+               ja = &ca->journal;
 
                if (replicas >= replicas_want)
                        break;
@@ -1954,7 +2034,7 @@ static int journal_write_alloc(struct journal *j, unsigned sectors)
 
                trace_bcache_journal_next_bucket(ca, ja->cur_idx, ja->last_idx);
        }
-
+       spin_unlock(&j->devs.lock);
        rcu_read_unlock();
 
        j->prev_buf_sectors = 0;
@@ -2468,50 +2548,6 @@ int bch_journal_flush(struct journal *j)
        return bch_journal_flush_seq(j, seq);
 }
 
-void bch_journal_free(struct journal *j)
-{
-       unsigned order = get_order(j->entry_size_max);
-
-       free_pages((unsigned long) j->buf[1].data, order);
-       free_pages((unsigned long) j->buf[0].data, order);
-       free_fifo(&j->pin);
-}
-
-int bch_journal_alloc(struct journal *j, unsigned entry_size_max)
-{
-       static struct lock_class_key res_key;
-       unsigned order = get_order(entry_size_max);
-
-       spin_lock_init(&j->lock);
-       spin_lock_init(&j->pin_lock);
-       init_waitqueue_head(&j->wait);
-       INIT_DELAYED_WORK(&j->write_work, journal_write_work);
-       INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work);
-       mutex_init(&j->blacklist_lock);
-       INIT_LIST_HEAD(&j->seq_blacklist);
-       spin_lock_init(&j->devs.lock);
-       mutex_init(&j->reclaim_lock);
-
-       lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
-
-       j->entry_size_max       = entry_size_max;
-       j->write_delay_ms       = 100;
-       j->reclaim_delay_ms     = 100;
-
-       bkey_extent_init(&j->key);
-
-       atomic64_set(&j->reservations.counter,
-               ((union journal_res_state)
-                { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
-
-       if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
-           !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) ||
-           !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order)))
-               return -ENOMEM;
-
-       return 0;
-}
-
 ssize_t bch_journal_print_debug(struct journal *j, char *buf)
 {
        union journal_res_state *s = &j->reservations;
@@ -2643,13 +2679,31 @@ int bch_journal_move(struct cache *ca)
        return ret;
 }
 
-void bch_journal_free_cache(struct cache *ca)
+void bch_fs_journal_stop(struct journal *j)
+{
+       if (!test_bit(JOURNAL_STARTED, &j->flags))
+               return;
+
+       /*
+        * Empty out the journal by first flushing everything pinning existing
+        * journal entries, then force a brand new empty journal entry to be
+        * written:
+        */
+       bch_journal_flush_pins(j);
+       bch_journal_flush_async(j, NULL);
+       bch_journal_meta(j);
+
+       cancel_delayed_work_sync(&j->write_work);
+       cancel_delayed_work_sync(&j->reclaim_work);
+}
+
+void bch_dev_journal_exit(struct cache *ca)
 {
        kfree(ca->journal.buckets);
        kfree(ca->journal.bucket_seq);
 }
 
-int bch_journal_init_cache(struct cache *ca)
+int bch_dev_journal_init(struct cache *ca)
 {
        struct journal_device *ja = &ca->journal;
        struct bch_sb_field_journal *journal_buckets =
@@ -2679,3 +2733,47 @@ int bch_journal_init_cache(struct cache *ca)
 
        return 0;
 }
+
+void bch_fs_journal_exit(struct journal *j)
+{
+       unsigned order = get_order(j->entry_size_max);
+
+       free_pages((unsigned long) j->buf[1].data, order);
+       free_pages((unsigned long) j->buf[0].data, order);
+       free_fifo(&j->pin);
+}
+
+int bch_fs_journal_init(struct journal *j, unsigned entry_size_max)
+{
+       static struct lock_class_key res_key;
+       unsigned order = get_order(entry_size_max);
+
+       spin_lock_init(&j->lock);
+       spin_lock_init(&j->pin_lock);
+       init_waitqueue_head(&j->wait);
+       INIT_DELAYED_WORK(&j->write_work, journal_write_work);
+       INIT_DELAYED_WORK(&j->reclaim_work, journal_reclaim_work);
+       mutex_init(&j->blacklist_lock);
+       INIT_LIST_HEAD(&j->seq_blacklist);
+       spin_lock_init(&j->devs.lock);
+       mutex_init(&j->reclaim_lock);
+
+       lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
+
+       j->entry_size_max       = entry_size_max;
+       j->write_delay_ms       = 100;
+       j->reclaim_delay_ms     = 100;
+
+       bkey_extent_init(&j->key);
+
+       atomic64_set(&j->reservations.counter,
+               ((union journal_res_state)
+                { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
+
+       if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
+           !(j->buf[0].data = (void *) __get_free_pages(GFP_KERNEL, order)) ||
+           !(j->buf[1].data = (void *) __get_free_pages(GFP_KERNEL, order)))
+               return -ENOMEM;
+
+       return 0;
+}
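
The new bch_set_nr_journal_buckets() grows the journal while the filesystem is running: each freshly allocated bucket is spliced in at ja->last_idx with memmove(), and cur_idx/last_idx are bumped so entries that are still pinned keep their meaning. A standalone sketch of just that array surgery; the real code also updates bucket_seq and the superblock's bucket list in lockstep, under the journal lock:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    struct journal_dev_sketch {
        uint64_t buckets[16];
        unsigned nr;        /* buckets currently in the ring */
        unsigned cur_idx;   /* bucket currently being written */
        unsigned last_idx;  /* oldest bucket not yet reclaimed */
    };

    /* Open a slot at last_idx, drop the new bucket there, and shift the
     * indices the same way bch_set_nr_journal_buckets() does. */
    static void insert_journal_bucket(struct journal_dev_sketch *ja, uint64_t b)
    {
        memmove(ja->buckets + ja->last_idx + 1,
                ja->buckets + ja->last_idx,
                (ja->nr - ja->last_idx) * sizeof(uint64_t));

        ja->buckets[ja->last_idx] = b;

        if (ja->last_idx < ja->nr) {
            if (ja->cur_idx >= ja->last_idx)
                ja->cur_idx++;
            ja->last_idx++;
        }
        ja->nr++;
    }

    int main(void)
    {
        struct journal_dev_sketch ja = {
            .buckets = { 10, 11, 12 }, .nr = 3, .cur_idx = 2, .last_idx = 1,
        };
        unsigned i;

        insert_journal_bucket(&ja, 99);

        for (i = 0; i < ja.nr; i++)
            printf("%llu ", (unsigned long long) ja.buckets[i]);
        printf("\ncur_idx=%u last_idx=%u\n", ja.cur_idx, ja.last_idx);
        /* prints: 10 99 11 12 / cur_idx=3 last_idx=2 */
        return 0;
    }
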
index 02a6e6766d3ece60f616dd8b0f24b5936270dc36..d3a1db0c41eb3c79ffaff8faa09ad2009a51dc07 100644 (file)
--- a/libbcache/journal.h
+++ b/libbcache/journal.h
 #include <linux/hash.h>
 
 #include "journal_types.h"
-//#include "super-io.h"
 
 /*
  * Only used for holding the journal entries we read in btree_journal_read()
@@ -136,6 +135,7 @@ void bch_journal_pin_add_if_older(struct journal *,
                                  struct journal_entry_pin *,
                                  struct journal_entry_pin *,
                                  journal_pin_flush_fn);
+void bch_journal_flush_pins(struct journal *);
 
 struct closure;
 struct cache_set;
@@ -330,11 +330,6 @@ static inline int bch_journal_error(struct journal *j)
                ? -EIO : 0;
 }
 
-static inline bool is_journal_device(struct cache *ca)
-{
-       return ca->mi.state == BCH_MEMBER_STATE_ACTIVE && ca->mi.tier == 0;
-}
-
 static inline bool journal_flushes_device(struct cache *ca)
 {
        return true;
@@ -356,9 +351,6 @@ static inline void bch_journal_set_replay_done(struct journal *j)
        spin_unlock(&j->lock);
 }
 
-void bch_journal_free(struct journal *);
-int bch_journal_alloc(struct journal *, unsigned);
-
 ssize_t bch_journal_print_debug(struct journal *, char *);
 
 int bch_dev_journal_alloc(struct cache *);
@@ -372,7 +364,10 @@ static inline unsigned bch_nr_journal_buckets(struct bch_sb_field_journal *j)
 
 int bch_journal_move(struct cache *);
 
-void bch_journal_free_cache(struct cache *);
-int bch_journal_init_cache(struct cache *);
+void bch_fs_journal_stop(struct journal *);
+void bch_dev_journal_exit(struct cache *);
+int bch_dev_journal_init(struct cache *);
+void bch_fs_journal_exit(struct journal *);
+int bch_fs_journal_init(struct journal *, unsigned);
 
 #endif /* _BCACHE_JOURNAL_H */
index e40dfbcadc906a378ff503c49af32e05cbbd2a63..27f5c63cc943784a8818adf2459f0498150a4217 100644 (file)
--- a/libbcache/movinggc.c
+++ b/libbcache/movinggc.c
@@ -191,7 +191,7 @@ static void bch_moving_gc(struct cache *ca)
                }
 
                if (g->mark.owned_by_allocator ||
-                   g->mark.is_metadata)
+                   g->mark.data_type != BUCKET_DATA)
                        continue;
 
                sectors_used = bucket_sectors_used(g);
@@ -258,18 +258,21 @@ static int bch_moving_gc_thread(void *arg)
        return 0;
 }
 
-void bch_moving_init_cache(struct cache *ca)
+void bch_moving_gc_stop(struct cache *ca)
 {
-       bch_pd_controller_init(&ca->moving_gc_pd);
-       ca->moving_gc_pd.d_term = 0;
+       ca->moving_gc_pd.rate.rate = UINT_MAX;
+       bch_ratelimit_reset(&ca->moving_gc_pd.rate);
+
+       if (ca->moving_gc_read)
+               kthread_stop(ca->moving_gc_read);
+       ca->moving_gc_read = NULL;
 }
 
-int bch_moving_gc_thread_start(struct cache *ca)
+int bch_moving_gc_start(struct cache *ca)
 {
        struct task_struct *t;
 
-       /* The moving gc read thread must be stopped */
-       BUG_ON(ca->moving_gc_read != NULL);
+       BUG_ON(ca->moving_gc_read);
 
        if (ca->set->opts.nochanges)
                return 0;
@@ -287,12 +290,8 @@ int bch_moving_gc_thread_start(struct cache *ca)
        return 0;
 }
 
-void bch_moving_gc_stop(struct cache *ca)
+void bch_dev_moving_gc_init(struct cache *ca)
 {
-       ca->moving_gc_pd.rate.rate = UINT_MAX;
-       bch_ratelimit_reset(&ca->moving_gc_pd.rate);
-
-       if (ca->moving_gc_read)
-               kthread_stop(ca->moving_gc_read);
-       ca->moving_gc_read = NULL;
+       bch_pd_controller_init(&ca->moving_gc_pd);
+       ca->moving_gc_pd.d_term = 0;
 }
index 5f15308593d42a1392f3c5dc0fce5021ad891e1b..e8ae95e5cfd15eeb5b9cf5d58d1edb5e5e5fa06b 100644 (file)
--- a/libbcache/movinggc.h
+++ b/libbcache/movinggc.h
@@ -23,8 +23,8 @@
 #define COPYGC_SECTORS_PER_ITER(ca)                                    \
        ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca))
 
-void bch_moving_init_cache(struct cache *);
 void bch_moving_gc_stop(struct cache *);
-int bch_moving_gc_thread_start(struct cache *);
+int bch_moving_gc_start(struct cache *);
+void bch_dev_moving_gc_init(struct cache *);
 
 #endif
index 95184db1a45c91184f8ccf41e1df11042d8937b8..9b10310d1e21f966036a1f4c862d1daf2ab46da2 100644 (file)
--- a/libbcache/opts.h
+++ b/libbcache/opts.h
@@ -86,11 +86,17 @@ enum opt_type {
        BCH_OPT(noreplay,               0444,   NO_SB_OPT,              \
                s8,  OPT_BOOL())                                        \
        BCH_OPT(norecovery,             0444,   NO_SB_OPT,              \
-               s8,  OPT_BOOL())
+               s8,  OPT_BOOL())                                        \
+       BCH_OPT(noexcl,                 0444,   NO_SB_OPT,              \
+               s8,  OPT_BOOL())                                        \
+       BCH_OPT(sb,                     0444,   NO_SB_OPT,              \
+               s64, OPT_UINT(0, S64_MAX))                              \
 
 #define BCH_OPTS()                                                     \
        BCH_OPT(read_only,              0444,   NO_SB_OPT,              \
                s8,  OPT_BOOL())                                        \
+       BCH_OPT(nostart,                0444,   NO_SB_OPT,              \
+               s8,  OPT_BOOL())                                        \
        BCH_VISIBLE_OPTS()
 
 struct bch_opts {
@@ -145,6 +151,8 @@ static inline void bch_opts_apply(struct bch_opts *dst, struct bch_opts src)
 #undef BCH_OPT
 }
 
+#define opt_defined(_opt)              ((_opt) >= 0)
+
 void bch_opt_set(struct bch_opts *, enum bch_opt_id, u64);
 struct bch_opts bch_sb_opts(struct bch_sb *);
 
index be27d3ee6c1cf171302a4de998d98dab00398a5b..f50a5ee8b104416f3e07a383a9b4038f18a81425 100644 (file)
--- a/libbcache/super-io.c
+++ b/libbcache/super-io.c
@@ -10,6 +10,7 @@
 #include "vstructs.h"
 
 #include <linux/backing-dev.h>
+#include <linux/sort.h>
 
 static inline void __bch_sb_layout_size_assert(void)
 {
@@ -17,7 +18,7 @@ static inline void __bch_sb_layout_size_assert(void)
 }
 
 struct bch_sb_field *bch_sb_field_get(struct bch_sb *sb,
-                                     enum bch_sb_field_types type)
+                                     enum bch_sb_field_type type)
 {
        struct bch_sb_field *f;
 
@@ -34,7 +35,7 @@ void bch_free_super(struct bcache_superblock *sb)
        if (sb->bio)
                bio_put(sb->bio);
        if (!IS_ERR_OR_NULL(sb->bdev))
-               blkdev_put(sb->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+               blkdev_put(sb->bdev, sb->mode);
 
        free_pages((unsigned long) sb->sb, sb->page_order);
        memset(sb, 0, sizeof(*sb));
@@ -74,7 +75,7 @@ static int __bch_super_realloc(struct bcache_superblock *sb, unsigned order)
        return 0;
 }
 
-int bch_dev_sb_realloc(struct bcache_superblock *sb, unsigned u64s)
+static int bch_sb_realloc(struct bcache_superblock *sb, unsigned u64s)
 {
        u64 new_bytes = __vstruct_bytes(struct bch_sb, u64s);
        u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
@@ -140,13 +141,29 @@ static struct bch_sb_field *__bch_sb_field_resize(struct bch_sb *sb,
        le32_add_cpu(&sb->u64s, u64s - old_u64s);
 
        return f;
+}
+
+struct bch_sb_field *bch_sb_field_resize(struct bcache_superblock *sb,
+                                        enum bch_sb_field_type type,
+                                        unsigned u64s)
+{
+       struct bch_sb_field *f = bch_sb_field_get(sb->sb, type);
+       ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+       ssize_t d = -old_u64s + u64s;
 
+       if (bch_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
+               return NULL;
+
+       f = __bch_sb_field_resize(sb->sb, f, u64s);
+       f->type = type;
+       return f;
 }
 
 struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *c,
-                                           struct bch_sb_field *f,
+                                           enum bch_sb_field_type type,
                                            unsigned u64s)
 {
+       struct bch_sb_field *f = bch_sb_field_get(c->disk_sb, type);
        ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
        ssize_t d = -old_u64s + u64s;
        struct cache *ca;
@@ -160,26 +177,15 @@ struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *c,
        for_each_cache(ca, c, i) {
                struct bcache_superblock *sb = &ca->disk_sb;
 
-               if (bch_dev_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
+               if (bch_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
                        percpu_ref_put(&ca->ref);
                        return NULL;
                }
        }
 
-       return __bch_sb_field_resize(c->disk_sb, f, u64s);
-}
-
-struct bch_sb_field *bch_dev_sb_field_resize(struct bcache_superblock *sb,
-                                            struct bch_sb_field *f,
-                                            unsigned u64s)
-{
-       ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
-       ssize_t d = -old_u64s + u64s;
-
-       if (bch_dev_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
-               return NULL;
-
-       return __bch_sb_field_resize(sb->sb, f, u64s);
+       f = __bch_sb_field_resize(c->disk_sb, f, u64s);
+       f->type = type;
+       return f;
 }
 
 static const char *validate_sb_layout(struct bch_sb_layout *layout)
@@ -203,9 +209,6 @@ static const char *validate_sb_layout(struct bch_sb_layout *layout)
 
        prev_offset = le64_to_cpu(layout->sb_offset[0]);
 
-       if (prev_offset != BCH_SB_SECTOR)
-               return "Invalid superblock layout: doesn't have default superblock location";
-
        for (i = 1; i < layout->nr_superblocks; i++) {
                offset = le64_to_cpu(layout->sb_offset[i]);
 
@@ -217,16 +220,70 @@ static const char *validate_sb_layout(struct bch_sb_layout *layout)
        return NULL;
 }
 
+static int u64_cmp(const void *_l, const void *_r)
+{
+       u64 l = *((const u64 *) _l), r = *((const u64 *) _r);
+
+       return l < r ? -1 : l > r ? 1 : 0;
+}
+
+const char *bch_validate_journal_layout(struct bch_sb *sb,
+                                       struct cache_member_cpu mi)
+{
+       struct bch_sb_field_journal *journal;
+       const char *err;
+       unsigned nr;
+       unsigned i;
+       u64 *b;
+
+       journal = bch_sb_get_journal(sb);
+       if (!journal)
+               return NULL;
+
+       nr = bch_nr_journal_buckets(journal);
+       if (!nr)
+               return NULL;
+
+       b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL);
+       if (!b)
+               return "cannot allocate memory";
+
+       for (i = 0; i < nr; i++)
+               b[i] = le64_to_cpu(journal->buckets[i]);
+
+       sort(b, nr, sizeof(u64), u64_cmp, NULL);
+
+       err = "journal bucket at sector 0";
+       if (!b[0])
+               goto err;
+
+       err = "journal bucket before first bucket";
+       if (b[0] < mi.first_bucket)
+               goto err;
+
+       err = "journal bucket past end of device";
+       if (b[nr - 1] >= mi.nbuckets)
+               goto err;
+
+       err = "duplicate journal buckets";
+       for (i = 0; i + 1 < nr; i++)
+               if (b[i] == b[i + 1])
+                       goto err;
+
+       err = NULL;
+err:
+       kfree(b);
+       return err;
+}
+
 const char *bch_validate_cache_super(struct bcache_superblock *disk_sb)
 {
        struct bch_sb *sb = disk_sb->sb;
        struct bch_sb_field *f;
        struct bch_sb_field_members *sb_mi;
-       struct bch_sb_field_journal *journal;
        struct cache_member_cpu mi;
        const char *err;
        u16 block_size;
-       unsigned i;
 
        switch (le64_to_cpu(sb->version)) {
        case BCACHE_SB_VERSION_CDEV_V4:
@@ -324,14 +381,6 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb)
 
        mi = cache_mi_to_cpu_mi(sb_mi->members + sb->dev_idx);
 
-       for (i = 0; i < sb->layout.nr_superblocks; i++) {
-               u64 offset = le64_to_cpu(sb->layout.sb_offset[i]);
-               u64 max_size = 1 << sb->layout.sb_max_size_bits;
-
-               if (offset + max_size > mi.first_bucket * mi.bucket_size)
-                       return "Invalid superblock: first bucket comes before end of super";
-       }
-
        if (mi.nbuckets > LONG_MAX)
                return "Too many buckets";
 
@@ -347,16 +396,9 @@ const char *bch_validate_cache_super(struct bcache_superblock *disk_sb)
            mi.bucket_size * mi.nbuckets)
                return "Invalid superblock: device too small";
 
-       /* Validate journal buckets: */
-       journal = bch_sb_get_journal(sb);
-       if (journal) {
-               for (i = 0; i < bch_nr_journal_buckets(journal); i++) {
-                       u64 b = le64_to_cpu(journal->buckets[i]);
-
-                       if (b <  mi.first_bucket || b >= mi.nbuckets)
-                               return "bad journal bucket";
-               }
-       }
+       err = bch_validate_journal_layout(sb, mi);
+       if (err)
+               return err;
 
        return NULL;
 }
@@ -382,19 +424,19 @@ static bool bch_is_open_cache(struct block_device *bdev)
 
 static bool bch_is_open(struct block_device *bdev)
 {
-       lockdep_assert_held(&bch_register_lock);
+       bool ret;
+
+       mutex_lock(&bch_register_lock);
+       ret = bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev);
+       mutex_unlock(&bch_register_lock);
 
-       return bch_is_open_cache(bdev) || bch_is_open_backing_dev(bdev);
+       return ret;
 }
 
-static const char *bch_blkdev_open(const char *path, void *holder,
-                                  struct bch_opts opts,
-                                  struct block_device **ret)
+static const char *bch_blkdev_open(const char *path, fmode_t mode,
+                                  void *holder, struct block_device **ret)
 {
        struct block_device *bdev;
-       fmode_t mode = opts.nochanges > 0
-               ? FMODE_READ
-               : FMODE_READ|FMODE_WRITE|FMODE_EXCL;
        const char *err;
 
        *ret = NULL;
@@ -548,7 +590,7 @@ int bch_sb_from_cache_set(struct cache_set *c, struct cache *ca)
        unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s;
        int ret;
 
-       ret = bch_dev_sb_realloc(&ca->disk_sb, u64s);
+       ret = bch_sb_realloc(&ca->disk_sb, u64s);
        if (ret)
                return ret;
 
@@ -567,7 +609,7 @@ static const char *read_one_super(struct bcache_superblock *sb, u64 offset)
 reread:
        bio_reset(sb->bio);
        sb->bio->bi_bdev = sb->bdev;
-       sb->bio->bi_iter.bi_sector = BCH_SB_SECTOR;
+       sb->bio->bi_iter.bi_sector = offset;
        sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order;
        bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
        bch_bio_map(sb->bio, sb->sb);
@@ -610,15 +652,21 @@ const char *bch_read_super(struct bcache_superblock *sb,
                           struct bch_opts opts,
                           const char *path)
 {
+       u64 offset = opt_defined(opts.sb) ? opts.sb : BCH_SB_SECTOR;
        struct bch_sb_layout layout;
        const char *err;
        unsigned i;
 
-       lockdep_assert_held(&bch_register_lock);
-
        memset(sb, 0, sizeof(*sb));
+       sb->mode = FMODE_READ;
+
+       if (!(opt_defined(opts.noexcl) && opts.noexcl))
+               sb->mode |= FMODE_EXCL;
 
-       err = bch_blkdev_open(path, &sb, opts, &sb->bdev);
+       if (!(opt_defined(opts.nochanges) && opts.nochanges))
+               sb->mode |= FMODE_WRITE;
+
+       err = bch_blkdev_open(path, sb->mode, sb, &sb->bdev);
        if (err)
                return err;
 
@@ -630,11 +678,16 @@ const char *bch_read_super(struct bcache_superblock *sb,
        if (bch_fs_init_fault("read_super"))
                goto err;
 
-       err = read_one_super(sb, BCH_SB_SECTOR);
+       err = read_one_super(sb, offset);
        if (!err)
                goto got_super;
 
-       pr_err("error reading default super: %s", err);
+       if (offset != BCH_SB_SECTOR) {
+               pr_err("error reading superblock: %s", err);
+               goto err;
+       }
+
+       pr_err("error reading default superblock: %s", err);
 
        /*
         * Error reading primary superblock - read location of backup
@@ -747,6 +800,9 @@ void bch_write_super(struct cache_set *c)
 
        lockdep_assert_held(&c->sb_lock);
 
+       if (c->opts.nochanges)
+               return;
+
        closure_init_stack(cl);
 
        le64_add_cpu(&c->disk_sb->seq, 1);
index 665de811fa8dd46670fc81135b6d3cd19dad9466..ae1e8b9dc304f3e675280d075dcaec211be66fee 100644 (file)
@@ -6,16 +6,35 @@
 
 #include <asm/byteorder.h>
 
-struct bch_sb_field *bch_sb_field_get(struct bch_sb *, enum bch_sb_field_types);
-
-#define BCH_SB_FIELD_TYPE(_name)                               \
-static inline struct bch_sb_field_##_name *                    \
-bch_sb_get_##_name(struct bch_sb *sb)                          \
-{                                                              \
-       struct bch_sb_field *f =                                \
-               bch_sb_field_get(sb, BCH_SB_FIELD_##_name);     \
-                                                               \
-       return container_of_or_null(f, struct bch_sb_field_##_name, field);\
+struct bch_sb_field *bch_sb_field_get(struct bch_sb *, enum bch_sb_field_type);
+struct bch_sb_field *bch_sb_field_resize(struct bcache_superblock *,
+                                        enum bch_sb_field_type, unsigned);
+struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *,
+                                        enum bch_sb_field_type, unsigned);
+
+#define field_to_type(_f, _name)                                       \
+       container_of_or_null(_f, struct bch_sb_field_##_name, field)
+
+#define BCH_SB_FIELD_TYPE(_name)                                       \
+static inline struct bch_sb_field_##_name *                            \
+bch_sb_get_##_name(struct bch_sb *sb)                                  \
+{                                                                      \
+       return field_to_type(bch_sb_field_get(sb,                       \
+                               BCH_SB_FIELD_##_name), _name);          \
+}                                                                      \
+                                                                       \
+static inline struct bch_sb_field_##_name *                            \
+bch_sb_resize_##_name(struct bcache_superblock *sb, unsigned u64s)     \
+{                                                                      \
+       return field_to_type(bch_sb_field_resize(sb,                    \
+                               BCH_SB_FIELD_##_name, u64s), _name);    \
+}                                                                      \
+                                                                       \
+static inline struct bch_sb_field_##_name *                            \
+bch_fs_sb_resize_##_name(struct cache_set *c, unsigned u64s)           \
+{                                                                      \
+       return field_to_type(bch_fs_sb_field_resize(c,                  \
+                               BCH_SB_FIELD_##_name, u64s), _name);    \
 }
 
 BCH_SB_FIELD_TYPE(journal);
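
The reworked BCH_SB_FIELD_TYPE() macro above stamps out a typed get/resize accessor pair for each superblock field type via container_of_or_null(). The stand-alone sketch below compiles on its own and shows only that pattern; the enum, struct names and the toy field_get() lookup are simplified stand-ins, not the bcachefs definitions.

#include <stddef.h>
#include <stdio.h>

enum field_type { FIELD_journal };

struct field            { int type; };
struct field_journal    { struct field field; int nr_buckets; };

/* toy lookup: return the field if it is of the requested type */
static struct field *field_get(struct field *f, enum field_type type)
{
        return f && f->type == type ? f : NULL;
}

#define container_of_or_null(ptr, type, member)                         \
        ((ptr) ? (type *) ((char *) (ptr) - offsetof(type, member)) : NULL)

#define field_to_type(_f, _name)                                        \
        container_of_or_null(_f, struct field_##_name, field)

/* one macro invocation generates a typed accessor per field type */
#define FIELD_TYPE(_name)                                               \
static struct field_##_name *get_##_name(struct field *f)              \
{                                                                       \
        return field_to_type(field_get(f, FIELD_##_name), _name);      \
}

FIELD_TYPE(journal)

int main(void)
{
        struct field_journal j = {
                .field          = { .type = FIELD_journal },
                .nr_buckets     = 8,
        };

        /* get_journal() was generated by FIELD_TYPE(journal) above */
        printf("%d\n", get_journal(&j.field)->nr_buckets);      /* prints 8 */
        return 0;
}
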
@@ -85,14 +104,11 @@ int bch_fs_mi_update(struct cache_set *, struct bch_member *, unsigned);
 int bch_sb_to_cache_set(struct cache_set *, struct bch_sb *);
 int bch_sb_from_cache_set(struct cache_set *, struct cache *);
 
-struct bch_sb_field *bch_fs_sb_field_resize(struct cache_set *,
-                               struct bch_sb_field *, unsigned);
-struct bch_sb_field *bch_dev_sb_field_resize(struct bcache_superblock *,
-                               struct bch_sb_field *, unsigned);
-
 void bch_free_super(struct bcache_superblock *);
 int bch_super_realloc(struct bcache_superblock *, unsigned);
 
+const char *bch_validate_journal_layout(struct bch_sb *,
+                                       struct cache_member_cpu);
 const char *bch_validate_cache_super(struct bcache_superblock *);
 
 const char *bch_read_super(struct bcache_superblock *,
index fab3480570baf1b43d0041512f3dad04cd1d33b3..5535639cf829509cb8a3d111a2530d735a6beaa8 100644 (file)
@@ -69,7 +69,7 @@ static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait);
 struct workqueue_struct *bcache_io_wq;
 struct crypto_shash *bch_sha256;
 
-static void bch_dev_stop(struct cache *);
+static void bch_dev_free(struct cache *);
 static int bch_dev_online(struct cache *);
 
 static int bch_congested_fn(void *data, int bdi_bits)
@@ -92,8 +92,11 @@ static int bch_congested_fn(void *data, int bdi_bits)
                        }
                }
        } else {
-               /* Writes only go to tier 0: */
-               group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) {
+               /* Writes prefer fastest tier: */
+               struct bch_tier *tier = READ_ONCE(c->fastest_tier);
+               struct cache_group *grp = tier ? &tier->devs : &c->cache_all;
+
+               group_for_each_cache_rcu(ca, grp, i) {
                        bdi = blk_get_backing_dev_info(ca->disk_sb.bdev);
 
                        if (bdi_congested(bdi, bdi_bits)) {
@@ -107,7 +110,7 @@ static int bch_congested_fn(void *data, int bdi_bits)
        return ret;
 }
 
-/* Cache set RO/RW: */
+/* Filesystem RO/RW: */
 
 /*
  * For startup/shutdown of RW stuff, the dependencies are:
@@ -129,9 +132,7 @@ static void __bch_fs_read_only(struct cache_set *c)
        struct cache *ca;
        unsigned i;
 
-       c->tiering_pd.rate.rate = UINT_MAX;
-       bch_ratelimit_reset(&c->tiering_pd.rate);
-       bch_tiering_read_stop(c);
+       bch_tiering_stop(c);
 
        for_each_cache(ca, c, i)
                bch_moving_gc_stop(ca);
@@ -143,20 +144,7 @@ static void __bch_fs_read_only(struct cache_set *c)
        for_each_cache(ca, c, i)
                bch_dev_allocator_stop(ca);
 
-       /*
-        * Write a journal entry after flushing the btree, so we don't end up
-        * replaying everything we just flushed:
-        */
-       if (test_bit(JOURNAL_STARTED, &c->journal.flags)) {
-               int ret;
-
-               bch_journal_flush_async(&c->journal, NULL);
-               ret = bch_journal_meta(&c->journal);
-               BUG_ON(ret && !bch_journal_error(&c->journal));
-       }
-
-       cancel_delayed_work_sync(&c->journal.write_work);
-       cancel_delayed_work_sync(&c->journal.reclaim_work);
+       bch_fs_journal_stop(&c->journal);
 }
 
 static void bch_writes_disabled(struct percpu_ref *writes)
@@ -167,12 +155,27 @@ static void bch_writes_disabled(struct percpu_ref *writes)
        wake_up(&bch_read_only_wait);
 }
 
-static void bch_fs_read_only_work(struct work_struct *work)
+void bch_fs_read_only(struct cache_set *c)
 {
-       struct cache_set *c =
-               container_of(work, struct cache_set, read_only_work);
+       mutex_lock(&c->state_lock);
+       if (c->state != BCH_FS_STARTING &&
+           c->state != BCH_FS_RW)
+               goto out;
+
+       if (test_bit(BCH_FS_ERROR, &c->flags))
+               goto out;
 
-       percpu_ref_put(&c->writes);
+       trace_fs_read_only(c);
+
+       /*
+        * Block new foreground-end write operations from starting - any new
+        * writes will return -EROFS:
+        *
+        * (This is really blocking new _allocations_, writes to previously
+        * allocated space can still happen until stopping the allocator in
+        * bch_dev_allocator_stop()).
+        */
+       percpu_ref_kill(&c->writes);
 
        del_timer(&c->foreground_write_wakeup);
        cancel_delayed_work(&c->pd_controllers_update);
@@ -180,98 +183,77 @@ static void bch_fs_read_only_work(struct work_struct *work)
        c->foreground_write_pd.rate.rate = UINT_MAX;
        bch_wake_delayed_writes((unsigned long) c);
 
-       if (!test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) {
-               /*
-                * If we're not doing an emergency shutdown, we want to wait on
-                * outstanding writes to complete so they don't see spurious
-                * errors due to shutting down the allocator:
-                */
-               wait_event(bch_read_only_wait,
-                          test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+       /*
+        * If we're not doing an emergency shutdown, we want to wait on
+        * outstanding writes to complete so they don't see spurious errors due
+        * to shutting down the allocator:
+        *
+        * If we are doing an emergency shutdown, outstanding writes may
+        * hang until we shut down the allocator, so we don't want to wait
+        * on outstanding writes before shutting everything down - but
+        * we do need to wait on them before returning and signalling
+        * that going RO is complete:
+        */
+       wait_event(bch_read_only_wait,
+                  test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) ||
+                  test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
 
-               __bch_fs_read_only(c);
+       __bch_fs_read_only(c);
 
-               if (!bch_journal_error(&c->journal) &&
-                   !test_bit(BCH_FS_ERROR, &c->flags)) {
-                       mutex_lock(&c->sb_lock);
-                       SET_BCH_SB_CLEAN(c->disk_sb, true);
-                       bch_write_super(c);
-                       mutex_unlock(&c->sb_lock);
-               }
-       } else {
-               /*
-                * If we are doing an emergency shutdown outstanding writes may
-                * hang until we shutdown the allocator so we don't want to wait
-                * on outstanding writes before shutting everything down - but
-                * we do need to wait on them before returning and signalling
-                * that going RO is complete:
-                */
-               __bch_fs_read_only(c);
+       wait_event(bch_read_only_wait,
+                  test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+
+       clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
 
-               wait_event(bch_read_only_wait,
-                          test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+       if (!bch_journal_error(&c->journal) &&
+           !test_bit(BCH_FS_ERROR, &c->flags)) {
+               mutex_lock(&c->sb_lock);
+               SET_BCH_SB_CLEAN(c->disk_sb, true);
+               bch_write_super(c);
+               mutex_unlock(&c->sb_lock);
        }
 
+       c->state = BCH_FS_RO;
        bch_notify_fs_read_only(c);
        trace_fs_read_only_done(c);
-
-       set_bit(BCH_FS_RO_COMPLETE, &c->flags);
-       wake_up(&bch_read_only_wait);
+out:
+       mutex_unlock(&c->state_lock);
 }
 
-bool bch_fs_read_only(struct cache_set *c)
+static void bch_fs_read_only_work(struct work_struct *work)
 {
-       if (test_and_set_bit(BCH_FS_RO, &c->flags))
-               return false;
-
-       trace_fs_read_only(c);
-
-       percpu_ref_get(&c->writes);
+       struct cache_set *c =
+               container_of(work, struct cache_set, read_only_work);
 
-       /*
-        * Block new foreground-end write operations from starting - any new
-        * writes will return -EROFS:
-        *
-        * (This is really blocking new _allocations_, writes to previously
-        * allocated space can still happen until stopping the allocator in
-        * bch_dev_allocator_stop()).
-        */
-       percpu_ref_kill(&c->writes);
+       bch_fs_read_only(c);
+}
 
-       queue_work(system_freezable_wq, &c->read_only_work);
-       return true;
+static void bch_fs_read_only_async(struct cache_set *c)
+{
+       queue_work(system_long_wq, &c->read_only_work);
 }
 
 bool bch_fs_emergency_read_only(struct cache_set *c)
 {
        bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags);
 
-       bch_fs_read_only(c);
+       bch_fs_read_only_async(c);
        bch_journal_halt(&c->journal);
 
        wake_up(&bch_read_only_wait);
        return ret;
 }
 
-void bch_fs_read_only_sync(struct cache_set *c)
-{
-       /* so we don't race with bch_fs_read_write() */
-       lockdep_assert_held(&bch_register_lock);
-
-       bch_fs_read_only(c);
-
-       wait_event(bch_read_only_wait,
-                  test_bit(BCH_FS_RO_COMPLETE, &c->flags) &&
-                  test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
-}
-
-static const char *__bch_fs_read_write(struct cache_set *c)
+const char *bch_fs_read_write(struct cache_set *c)
 {
        struct cache *ca;
-       const char *err;
+       const char *err = NULL;
        unsigned i;
 
-       lockdep_assert_held(&bch_register_lock);
+       mutex_lock(&c->state_lock);
+       if (c->state != BCH_FS_STARTING &&
+           c->state != BCH_FS_RO)
+               goto out;
 
        err = "error starting allocator thread";
        for_each_cache(ca, c, i)
@@ -285,67 +267,43 @@ static const char *__bch_fs_read_write(struct cache_set *c)
        if (bch_gc_thread_start(c))
                goto err;
 
-       for_each_cache(ca, c, i) {
-               if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE)
-                       continue;
-
-               err = "error starting moving GC thread";
-               if (bch_moving_gc_thread_start(ca)) {
+       err = "error starting moving GC thread";
+       for_each_cache(ca, c, i)
+               if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
+                   bch_moving_gc_start(ca)) {
                        percpu_ref_put(&ca->ref);
                        goto err;
                }
-       }
 
        err = "error starting tiering thread";
-       if (bch_tiering_read_start(c))
+       if (bch_tiering_start(c))
                goto err;
 
        schedule_delayed_work(&c->pd_controllers_update, 5 * HZ);
 
-       return NULL;
+       if (c->state != BCH_FS_STARTING)
+               percpu_ref_reinit(&c->writes);
+
+       c->state = BCH_FS_RW;
+       err = NULL;
+out:
+       mutex_unlock(&c->state_lock);
+       return err;
 err:
        __bch_fs_read_only(c);
-       return err;
-}
-
-const char *bch_fs_read_write(struct cache_set *c)
-{
-       const char *err;
-
-       lockdep_assert_held(&bch_register_lock);
-
-       if (!test_bit(BCH_FS_RO_COMPLETE, &c->flags))
-               return NULL;
-
-       err = __bch_fs_read_write(c);
-       if (err)
-               return err;
-
-       percpu_ref_reinit(&c->writes);
-
-       clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
-       clear_bit(BCH_FS_EMERGENCY_RO, &c->flags);
-       clear_bit(BCH_FS_RO_COMPLETE, &c->flags);
-       clear_bit(BCH_FS_RO, &c->flags);
-       return NULL;
+       goto out;
 }
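
The reworked bch_fs_read_only()/bch_fs_read_write() above replace the old RO/RO_COMPLETE flag bits with an explicit state enum guarded by state_lock, so each transition first checks which states it is legal from. A minimal sketch of that pattern, with hypothetical names and a pthread mutex standing in for the kernel mutex:

#include <pthread.h>
#include <stdio.h>

enum fs_state { FS_STARTING, FS_RO, FS_RW, FS_STOPPING };

struct fs {
        pthread_mutex_t state_lock;
        enum fs_state   state;
};

static void fs_read_only(struct fs *c)
{
        pthread_mutex_lock(&c->state_lock);

        /* only STARTING or RW may go read-only; anything else is a no-op */
        if (c->state == FS_STARTING || c->state == FS_RW) {
                /* ...stop allocators/tiering, flush and stop the journal... */
                c->state = FS_RO;
        }

        pthread_mutex_unlock(&c->state_lock);
}

static const char *fs_read_write(struct fs *c)
{
        const char *err = NULL;

        pthread_mutex_lock(&c->state_lock);

        if (c->state != FS_STARTING && c->state != FS_RO)
                goto out;

        /* ...start allocator, GC and tiering threads, re-arm write ref... */
        c->state = FS_RW;
out:
        pthread_mutex_unlock(&c->state_lock);
        return err;
}

int main(void)
{
        struct fs c = { PTHREAD_MUTEX_INITIALIZER, FS_STARTING };

        fs_read_write(&c);      /* STARTING -> RW */
        fs_read_only(&c);       /* RW -> RO */
        printf("%s\n", c.state == FS_RO ? "read-only" : "other");
        return 0;
}
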
 
-/* Cache set startup/shutdown: */
+/* Filesystem startup/shutdown: */
 
 static void bch_fs_free(struct cache_set *c)
 {
-       del_timer_sync(&c->foreground_write_wakeup);
-       cancel_delayed_work_sync(&c->pd_controllers_update);
-       cancel_work_sync(&c->read_only_work);
-       cancel_work_sync(&c->bio_submit_work);
-       cancel_work_sync(&c->read_retry_work);
-
-       bch_fs_encryption_free(c);
-       bch_btree_cache_free(c);
-       bch_journal_free(&c->journal);
+       bch_fs_encryption_exit(c);
+       bch_fs_btree_exit(c);
+       bch_fs_journal_exit(&c->journal);
        bch_io_clock_exit(&c->io_clock[WRITE]);
        bch_io_clock_exit(&c->io_clock[READ]);
-       bch_compress_free(c);
+       bch_fs_compress_exit(c);
        bch_fs_blockdev_exit(c);
        bdi_destroy(&c->bdi);
        lg_lock_free(&c->bucket_stats_lock);
@@ -372,6 +330,52 @@ static void bch_fs_free(struct cache_set *c)
        module_put(THIS_MODULE);
 }
 
+static void bch_fs_exit(struct cache_set *c)
+{
+       unsigned i;
+
+       del_timer_sync(&c->foreground_write_wakeup);
+       cancel_delayed_work_sync(&c->pd_controllers_update);
+       cancel_work_sync(&c->read_only_work);
+       cancel_work_sync(&c->bio_submit_work);
+       cancel_work_sync(&c->read_retry_work);
+
+       for (i = 0; i < c->sb.nr_devices; i++)
+               if (c->cache[i])
+                       bch_dev_free(c->cache[i]);
+
+       closure_debug_destroy(&c->cl);
+       kobject_put(&c->kobj);
+}
+
+static void bch_fs_offline(struct cache_set *c)
+{
+       struct cache *ca;
+       unsigned i;
+
+       mutex_lock(&bch_register_lock);
+       list_del(&c->list);
+       mutex_unlock(&bch_register_lock);
+
+       if (c->kobj.state_in_sysfs)
+               kobject_del(&c->kobj);
+
+       for_each_cache(ca, c, i)
+               if (ca->kobj.state_in_sysfs)
+                       kobject_del(&ca->kobj);
+
+       bch_fs_debug_exit(c);
+       bch_fs_chardev_exit(c);
+
+       bch_cache_accounting_destroy(&c->accounting);
+
+       kobject_put(&c->time_stats);
+       kobject_put(&c->opts_dir);
+       kobject_put(&c->internal);
+
+       __bch_fs_read_only(c);
+}
+
 /*
  * should be __bch_fs_stop4 - block devices are closed, now we can finally
  * free it
@@ -379,15 +383,9 @@ static void bch_fs_free(struct cache_set *c)
 void bch_fs_release(struct kobject *kobj)
 {
        struct cache_set *c = container_of(kobj, struct cache_set, kobj);
-       struct completion *stop_completion = c->stop_completion;
 
        bch_notify_fs_stopped(c);
-       bch_info(c, "stopped");
-
        bch_fs_free(c);
-
-       if (stop_completion)
-               complete(stop_completion);
 }
 
 /*
@@ -396,18 +394,8 @@ void bch_fs_release(struct kobject *kobj)
 static void __bch_fs_stop3(struct closure *cl)
 {
        struct cache_set *c = container_of(cl, struct cache_set, cl);
-       struct cache *ca;
-       unsigned i;
 
-       mutex_lock(&bch_register_lock);
-       for_each_cache(ca, c, i)
-               bch_dev_stop(ca);
-
-       list_del(&c->list);
-       mutex_unlock(&bch_register_lock);
-
-       closure_debug_destroy(&c->cl);
-       kobject_put(&c->kobj);
+       bch_fs_exit(c);
 }
 
 /*
@@ -418,28 +406,14 @@ static void __bch_fs_stop2(struct closure *cl)
 {
        struct cache_set *c = container_of(cl, struct cache_set, caching);
 
-       bch_debug_exit_cache_set(c);
-       bch_fs_chardev_exit(c);
-
-       if (c->kobj.state_in_sysfs)
-               kobject_del(&c->kobj);
-
-       bch_cache_accounting_destroy(&c->accounting);
-
-       kobject_put(&c->time_stats);
-       kobject_put(&c->opts_dir);
-       kobject_put(&c->internal);
-
-       mutex_lock(&bch_register_lock);
-       bch_fs_read_only_sync(c);
-       mutex_unlock(&bch_register_lock);
+       bch_fs_offline(c);
 
        closure_return(cl);
 }
 
 /*
- * First phase of the shutdown process that's kicked off by bch_fs_stop(); we
- * haven't waited for anything to stop yet, we're just punting to process
+ * First phase of the shutdown process that's kicked off by bch_fs_stop_async();
+ * we haven't waited for anything to stop yet, we're just punting to process
  * context to shut down block devices:
  */
 static void __bch_fs_stop1(struct closure *cl)
@@ -451,29 +425,42 @@ static void __bch_fs_stop1(struct closure *cl)
        continue_at(cl, __bch_fs_stop2, system_wq);
 }
 
-void bch_fs_stop(struct cache_set *c)
+void bch_fs_stop_async(struct cache_set *c)
 {
-       if (!test_and_set_bit(BCH_FS_STOPPING, &c->flags))
+       mutex_lock(&c->state_lock);
+       if (c->state != BCH_FS_STOPPING) {
+               c->state = BCH_FS_STOPPING;
                closure_queue(&c->caching);
+       }
+       mutex_unlock(&c->state_lock);
 }
 
-void bch_fs_stop_sync(struct cache_set *c)
+void bch_fs_stop(struct cache_set *c)
 {
-       DECLARE_COMPLETION_ONSTACK(complete);
+       mutex_lock(&c->state_lock);
+       BUG_ON(c->state == BCH_FS_STOPPING);
+       c->state = BCH_FS_STOPPING;
+       mutex_unlock(&c->state_lock);
+
+       bch_blockdevs_stop(c);
+
+       closure_sync(&c->caching);
+       closure_debug_destroy(&c->caching);
+
+       bch_fs_offline(c);
 
-       c->stop_completion = &complete;
-       bch_fs_stop(c);
        closure_put(&c->cl);
+       closure_sync(&c->cl);
 
-       /* Killable? */
-       wait_for_completion(&complete);
+       bch_fs_exit(c);
+       kobject_put(&c->kobj);
 }
 
 /* Stop, detaching from backing devices: */
 void bch_fs_detach(struct cache_set *c)
 {
        if (!test_and_set_bit(BCH_FS_DETACHING, &c->flags))
-               bch_fs_stop(c);
+               bch_fs_stop_async(c);
 }
 
 static unsigned bch_fs_nr_devices(struct cache_set *c)
@@ -520,6 +507,7 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 
        c->minor                = -1;
 
+       mutex_init(&c->state_lock);
        mutex_init(&c->sb_lock);
        INIT_RADIX_TREE(&c->devices, GFP_KERNEL);
        mutex_init(&c->btree_cache_lock);
@@ -534,8 +522,8 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        BCH_TIME_STATS()
 #undef BCH_TIME_STAT
 
-       bch_open_buckets_init(c);
-       bch_tiering_init_cache_set(c);
+       bch_fs_allocator_init(c);
+       bch_fs_tiering_init(c);
 
        INIT_LIST_HEAD(&c->list);
        INIT_LIST_HEAD(&c->cached_devs);
@@ -636,10 +624,10 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
            bch_fs_blockdev_init(c) ||
            bch_io_clock_init(&c->io_clock[READ]) ||
            bch_io_clock_init(&c->io_clock[WRITE]) ||
-           bch_journal_alloc(&c->journal, journal_entry_bytes) ||
-           bch_btree_cache_alloc(c) ||
+           bch_fs_journal_init(&c->journal, journal_entry_bytes) ||
+           bch_fs_btree_init(c) ||
            bch_fs_encryption_init(c) ||
-           bch_compress_init(c) ||
+           bch_fs_compress_init(c) ||
            bch_check_set_has_compressed_data(c, c->opts.compression))
                goto err;
 
@@ -664,6 +652,7 @@ static struct cache_set *bch_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        closure_init(&c->caching, &c->cl);
        set_closure_fn(&c->caching, __bch_fs_stop1, system_wq);
 
+       closure_get(&c->cl);
        continue_at_noreturn(&c->cl, __bch_fs_stop3, system_wq);
        return c;
 err:
@@ -671,7 +660,20 @@ err:
        return NULL;
 }
 
-static int bch_fs_online(struct cache_set *c)
+static struct cache_set *bch_fs_lookup(uuid_le uuid)
+{
+       struct cache_set *c;
+
+       lockdep_assert_held(&bch_register_lock);
+
+       list_for_each_entry(c, &bch_fs_list, list)
+               if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le)))
+                       return c;
+
+       return NULL;
+}
+
+static const char *__bch_fs_online(struct cache_set *c)
 {
        struct cache *ca;
        unsigned i;
@@ -680,31 +682,58 @@ static int bch_fs_online(struct cache_set *c)
        lockdep_assert_held(&bch_register_lock);
 
        if (!list_empty(&c->list))
-               return 0;
+               return NULL;
 
-       list_add(&c->list, &bch_fs_list);
+       if (bch_fs_lookup(c->sb.uuid))
+               return "filesystem UUID already open";
 
        ret = bch_fs_chardev_init(c);
        if (ret)
-               return ret;
+               return "error creating character device";
+
+       bch_fs_debug_init(c);
 
        if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ||
            kobject_add(&c->internal, &c->kobj, "internal") ||
            kobject_add(&c->opts_dir, &c->kobj, "options") ||
            kobject_add(&c->time_stats, &c->kobj, "time_stats") ||
            bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
-               return -1;
+               return "error creating sysfs objects";
 
        for_each_cache(ca, c, i)
                if (bch_dev_online(ca)) {
                        percpu_ref_put(&ca->ref);
-                       return -1;
+                       return "error creating sysfs objects";
                }
 
+       mutex_lock(&c->state_lock);
+
+       if (bch_blockdev_volumes_start(c)) {
+               mutex_unlock(&c->state_lock);
+               return "can't bring up blockdev volumes";
+       }
+
+       bch_attach_backing_devs(c);
+
+       mutex_unlock(&c->state_lock);
+
+       list_add(&c->list, &bch_fs_list);
+
        return 0;
 }
 
-static const char *bch_fs_start(struct cache_set *c)
+static const char *bch_fs_online(struct cache_set *c)
+{
+       const char *err;
+
+       mutex_lock(&bch_register_lock);
+       err = __bch_fs_online(c);
+       mutex_unlock(&bch_register_lock);
+
+       return err;
+}
+
+static const char *__bch_fs_start(struct cache_set *c)
 {
        const char *err = "cannot allocate memory";
        struct bch_sb_field_members *mi;
@@ -715,11 +744,7 @@ static const char *bch_fs_start(struct cache_set *c)
        struct jset *j;
        int ret = -EINVAL;
 
-       lockdep_assert_held(&bch_register_lock);
-       BUG_ON(test_bit(BCH_FS_RUNNING, &c->flags));
-
-       /* We don't want bch_fatal_error() to free underneath us */
-       closure_get(&c->caching);
+       BUG_ON(c->state != BCH_FS_STARTING);
 
        /*
         * Make sure that each cache object's mi is up to date before
@@ -826,6 +851,16 @@ static const char *bch_fs_start(struct cache_set *c)
 
                bch_notice(c, "initializing new filesystem");
 
+               bch_initial_gc(c, NULL);
+
+               err = "error starting allocator thread";
+               for_each_cache(ca, c, i)
+                       if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
+                           bch_dev_allocator_start(ca)) {
+                               percpu_ref_put(&ca->ref);
+                               goto err;
+                       }
+
                err = "unable to allocate journal buckets";
                for_each_cache(ca, c, i)
                        if (bch_dev_journal_alloc(ca)) {
@@ -833,8 +868,6 @@ static const char *bch_fs_start(struct cache_set *c)
                                goto err;
                        }
 
-               bch_initial_gc(c, NULL);
-
                /*
                 * journal_res_get() will crash if called before this has
                 * set up the journal.pin FIFO and journal.cur pointer:
@@ -842,14 +875,6 @@ static const char *bch_fs_start(struct cache_set *c)
                bch_journal_start(c);
                bch_journal_set_replay_done(&c->journal);
 
-               err = "error starting allocator thread";
-               for_each_cache(ca, c, i)
-                       if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE &&
-                           bch_dev_allocator_start(ca)) {
-                               percpu_ref_put(&ca->ref);
-                               goto err;
-                       }
-
                err = "cannot allocate new btree root";
                for (id = 0; id < BTREE_ID_NR; id++)
                        if (bch_btree_root_alloc(c, id, &cl)) {
@@ -877,10 +902,14 @@ static const char *bch_fs_start(struct cache_set *c)
                        goto err;
        }
 recovery_done:
+       err = "dynamic fault";
+       if (bch_fs_init_fault("fs_start"))
+               goto err;
+
        if (c->opts.read_only) {
-               bch_fs_read_only_sync(c);
+               bch_fs_read_only(c);
        } else {
-               err = __bch_fs_read_write(c);
+               err = bch_fs_read_write(c);
                if (err)
                        goto err;
        }
@@ -901,27 +930,9 @@ recovery_done:
        bch_write_super(c);
        mutex_unlock(&c->sb_lock);
 
-       err = "dynamic fault";
-       if (bch_fs_init_fault("fs_start"))
-               goto err;
-
-       err = "error creating kobject";
-       if (bch_fs_online(c))
-               goto err;
-
-       err = "can't bring up blockdev volumes";
-       if (bch_blockdev_volumes_start(c))
-               goto err;
-
-       bch_debug_init_cache_set(c);
-       set_bit(BCH_FS_RUNNING, &c->flags);
-       bch_attach_backing_devs(c);
-
-       bch_notify_fs_read_write(c);
        err = NULL;
 out:
        bch_journal_entries_free(&journal);
-       closure_put(&c->caching);
        return err;
 err:
        switch (ret) {
@@ -955,6 +966,11 @@ err:
        goto out;
 }
 
+const char *bch_fs_start(struct cache_set *c)
+{
+       return __bch_fs_start(c) ?: bch_fs_online(c);
+}
+
 static const char *bch_dev_may_add(struct bch_sb *sb, struct cache_set *c)
 {
        struct bch_sb_field_members *sb_mi;
@@ -999,7 +1015,7 @@ static const char *bch_dev_in_fs(struct bch_sb *sb, struct cache_set *c)
        return NULL;
 }
 
-/* Cache device */
+/* Device startup/shutdown, ro/rw: */
 
 bool bch_dev_read_only(struct cache *ca)
 {
@@ -1009,14 +1025,14 @@ bool bch_dev_read_only(struct cache *ca)
 
        bdevname(ca->disk_sb.bdev, buf);
 
-       lockdep_assert_held(&bch_register_lock);
+       lockdep_assert_held(&c->state_lock);
 
        if (ca->mi.state != BCH_MEMBER_STATE_ACTIVE)
                return false;
 
        if (!bch_dev_may_remove(ca)) {
                bch_err(c, "required member %s going RO, forcing fs RO", buf);
-               bch_fs_read_only_sync(c);
+               bch_fs_read_only(c);
        }
 
        trace_bcache_cache_read_only(ca);
@@ -1053,7 +1069,7 @@ bool bch_dev_read_only(struct cache *ca)
 
 static const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca)
 {
-       lockdep_assert_held(&bch_register_lock);
+       lockdep_assert_held(&c->state_lock);
 
        if (ca->mi.state == BCH_MEMBER_STATE_ACTIVE)
                return NULL;
@@ -1066,12 +1082,11 @@ static const char *__bch_dev_read_write(struct cache_set *c, struct cache *ca)
        if (bch_dev_allocator_start(ca))
                return "error starting allocator thread";
 
-       if (bch_moving_gc_thread_start(ca))
+       if (bch_moving_gc_start(ca))
                return "error starting moving GC thread";
 
-       bch_dev_group_add(&c->journal.devs, ca);
-
-       wake_up_process(c->tiering_read);
+       if (bch_tiering_start(c))
+               return "error starting tiering thread";
 
        bch_notify_dev_read_write(ca);
        trace_bcache_cache_read_write_done(ca);
@@ -1099,22 +1114,15 @@ const char *bch_dev_read_write(struct cache *ca)
        return NULL;
 }
 
-/*
- * bch_dev_stop has already returned, so we no longer hold the register
- * lock at the point this is called.
- */
-
 void bch_dev_release(struct kobject *kobj)
 {
        struct cache *ca = container_of(kobj, struct cache, kobj);
 
-       percpu_ref_exit(&ca->ref);
        kfree(ca);
 }
 
-static void bch_dev_free_work(struct work_struct *work)
+static void bch_dev_free(struct cache *ca)
 {
-       struct cache *ca = container_of(work, struct cache, free_work);
        struct cache_set *c = ca->set;
        unsigned i;
 
@@ -1131,15 +1139,7 @@ static void bch_dev_free_work(struct work_struct *work)
                kobject_del(&ca->kobj);
 
        bch_free_super(&ca->disk_sb);
-
-       /*
-        * bch_dev_stop can be called in the middle of initialization
-        * of the struct cache object.
-        * As such, not all the sub-structures may be initialized.
-        * However, they were zeroed when the object was allocated.
-        */
-
-       bch_journal_free_cache(ca);
+       bch_dev_journal_exit(ca);
        free_percpu(ca->sectors_written);
        bioset_exit(&ca->replica_set);
        free_percpu(ca->bucket_stats_percpu);
@@ -1155,12 +1155,20 @@ static void bch_dev_free_work(struct work_struct *work)
        for (i = 0; i < RESERVE_NR; i++)
                free_fifo(&ca->free[i]);
 
+       percpu_ref_exit(&ca->ref);
        kobject_put(&ca->kobj);
 
        if (c)
                kobject_put(&c->kobj);
 }
 
+static void bch_dev_free_work(struct work_struct *work)
+{
+       struct cache *ca = container_of(work, struct cache, free_work);
+
+       bch_dev_free(ca);
+}
+
 static void bch_dev_percpu_ref_release(struct percpu_ref *ref)
 {
        struct cache *ca = container_of(ref, struct cache, ref);
@@ -1193,12 +1201,10 @@ static void bch_dev_stop(struct cache *ca)
 {
        struct cache_set *c = ca->set;
 
-       lockdep_assert_held(&bch_register_lock);
+       lockdep_assert_held(&c->state_lock);
 
-       if (c) {
-               BUG_ON(rcu_access_pointer(c->cache[ca->dev_idx]) != ca);
-               rcu_assign_pointer(c->cache[ca->dev_idx], NULL);
-       }
+       BUG_ON(rcu_access_pointer(c->cache[ca->dev_idx]) != ca);
+       rcu_assign_pointer(c->cache[ca->dev_idx], NULL);
 
        call_rcu(&ca->free_rcu, bch_dev_free_rcu);
 }
@@ -1281,7 +1287,8 @@ static void bch_dev_remove_work(struct work_struct *work)
         */
        closure_get(&c->cl);
 
-       mutex_lock(&bch_register_lock);
+       mutex_lock(&c->state_lock);
+
        bch_dev_stop(ca);
 
        /*
@@ -1290,8 +1297,6 @@ static void bch_dev_remove_work(struct work_struct *work)
         */
        synchronize_rcu();
 
-       lockdep_assert_held(&bch_register_lock);
-
        /*
         * Free this device's slot in the bch_member array - all pointers to
         * this device must be gone:
@@ -1301,23 +1306,20 @@ static void bch_dev_remove_work(struct work_struct *work)
        memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid));
 
        bch_write_super(c);
-       mutex_unlock(&c->sb_lock);
 
-       mutex_unlock(&bch_register_lock);
+       mutex_unlock(&c->sb_lock);
+       mutex_unlock(&c->state_lock);
 
        closure_put(&c->cl);
 }
 
-bool bch_dev_remove(struct cache *ca, bool force)
+static bool __bch_dev_remove(struct cache_set *c, struct cache *ca, bool force)
 {
-       mutex_lock(&bch_register_lock);
-
        if (test_bit(BCH_DEV_REMOVING, &ca->flags))
                return false;
 
        if (!bch_dev_may_remove(ca)) {
-               bch_err(ca->set, "Can't remove last device in tier %u",
-                       ca->mi.tier);
+               bch_err(ca->set, "Can't remove last RW device");
                bch_notify_dev_remove_failed(ca);
                return false;
        }
@@ -1327,23 +1329,32 @@ bool bch_dev_remove(struct cache *ca, bool force)
 
        if (force)
                set_bit(BCH_DEV_FORCE_REMOVE, &ca->flags);
+
        set_bit(BCH_DEV_REMOVING, &ca->flags);
        bch_notify_dev_removing(ca);
 
-       mutex_unlock(&bch_register_lock);
-
        /* Migrate the data and finish removal asynchronously: */
 
        queue_work(system_long_wq, &ca->remove_work);
        return true;
 }
 
+bool bch_dev_remove(struct cache *ca, bool force)
+{
+       struct cache_set *c = ca->set;
+       bool ret;
+
+       mutex_lock(&c->state_lock);
+       ret = __bch_dev_remove(c, ca, force);
+       mutex_unlock(&c->state_lock);
+
+       return ret;
+}
+
 static int bch_dev_online(struct cache *ca)
 {
        char buf[12];
 
-       lockdep_assert_held(&bch_register_lock);
-
        sprintf(buf, "cache%u", ca->dev_idx);
 
        if (kobject_add(&ca->kobj,
@@ -1386,7 +1397,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
        kobject_init(&ca->kobj, &bch_dev_ktype);
 
        spin_lock_init(&ca->self.lock);
-       ca->self.nr_devices = 1;
+       ca->self.nr = 1;
        rcu_assign_pointer(ca->self.d[0].dev, ca);
        ca->dev_idx = sb->sb->dev_idx;
 
@@ -1395,10 +1406,11 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
        spin_lock_init(&ca->freelist_lock);
        spin_lock_init(&ca->prio_buckets_lock);
        mutex_init(&ca->heap_lock);
-       bch_moving_init_cache(ca);
+       bch_dev_moving_gc_init(ca);
 
        ca->disk_sb = *sb;
-       ca->disk_sb.bdev->bd_holder = ca;
+       if (sb->mode & FMODE_EXCL)
+               ca->disk_sb.bdev->bd_holder = ca;
        memset(sb, 0, sizeof(*sb));
 
        INIT_WORK(&ca->io_error_work, bch_nonfatal_io_error_work);
@@ -1444,7 +1456,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
            bioset_init(&ca->replica_set, 4,
                        offsetof(struct bch_write_bio, bio)) ||
            !(ca->sectors_written = alloc_percpu(*ca->sectors_written)) ||
-           bch_journal_init_cache(ca))
+           bch_dev_journal_init(ca))
                goto err;
 
        ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
@@ -1482,7 +1494,7 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
        err = "error creating kobject";
        if (c->kobj.state_in_sysfs &&
            bch_dev_online(ca))
-               goto err;
+               pr_warn("error creating sysfs objects");
 
        if (ret)
                *ret = ca;
@@ -1490,49 +1502,34 @@ static const char *bch_dev_alloc(struct bcache_superblock *sb,
                kobject_put(&ca->kobj);
        return NULL;
 err:
-       bch_dev_stop(ca);
+       bch_dev_free(ca);
        return err;
 }
 
-static struct cache_set *bch_fs_lookup(uuid_le uuid)
-{
-       struct cache_set *c;
-
-       lockdep_assert_held(&bch_register_lock);
-
-       list_for_each_entry(c, &bch_fs_list, list)
-               if (!memcmp(&c->disk_sb->uuid, &uuid, sizeof(uuid_le)))
-                       return c;
-
-       return NULL;
-}
-
 int bch_dev_add(struct cache_set *c, const char *path)
 {
        struct bcache_superblock sb;
        const char *err;
        struct cache *ca;
-       struct bch_sb_field *f;
        struct bch_sb_field_members *mi, *dev_mi;
        struct bch_member saved_mi;
        unsigned dev_idx, nr_devices, u64s;
        int ret = -EINVAL;
 
-       mutex_lock(&bch_register_lock);
-
        err = bch_read_super(&sb, c->opts, path);
        if (err)
-               goto err_unlock_register;
+               return -EINVAL;
 
        err = bch_validate_cache_super(&sb);
        if (err)
-               goto err_unlock_register;
-
-       mutex_lock(&c->sb_lock);
+               return -EINVAL;
 
        err = bch_dev_may_add(sb.sb, c);
        if (err)
-               goto err_unlock;
+               return -EINVAL;
+
+       mutex_lock(&c->state_lock);
+       mutex_lock(&c->sb_lock);
 
        /*
         * Preserve the old cache member information (esp. tier)
@@ -1571,17 +1568,14 @@ have_slot:
                sizeof(struct bch_member) * nr_devices) / sizeof(u64);
        err = "no space in superblock for member info";
 
-       f = bch_fs_sb_field_resize(c, &mi->field, u64s);
-       if (!f)
+       mi = bch_fs_sb_resize_members(c, u64s);
+       if (!mi)
                goto err_unlock;
 
-       mi = container_of(f, struct bch_sb_field_members, field);
-
-       f = bch_dev_sb_field_resize(&sb, &dev_mi->field, u64s);
-       if (!f)
+       dev_mi = bch_sb_resize_members(&sb, u64s);
+       if (!dev_mi)
                goto err_unlock;
 
-       dev_mi = container_of(f, struct bch_sb_field_members, field);
        memcpy(dev_mi, mi, u64s * sizeof(u64));
        dev_mi->members[dev_idx] = saved_mi;
 
@@ -1619,14 +1613,13 @@ have_slot:
 
        kobject_put(&ca->kobj);
        mutex_unlock(&c->sb_lock);
-       mutex_unlock(&bch_register_lock);
+       mutex_unlock(&c->state_lock);
        return 0;
 err_put:
        bch_dev_stop(ca);
 err_unlock:
        mutex_unlock(&c->sb_lock);
-err_unlock_register:
-       mutex_unlock(&bch_register_lock);
+       mutex_unlock(&c->state_lock);
        bch_free_super(&sb);
 
        bch_err(c, "Unable to add device: %s", err);
@@ -1639,11 +1632,8 @@ const char *bch_fs_open(char * const *devices, unsigned nr_devices,
        const char *err;
        struct cache_set *c = NULL;
        struct bcache_superblock *sb;
-       uuid_le uuid;
        unsigned i;
 
-       memset(&uuid, 0, sizeof(uuid_le));
-
        if (!nr_devices)
                return "need at least one device";
 
@@ -1655,60 +1645,49 @@ const char *bch_fs_open(char * const *devices, unsigned nr_devices,
        if (!sb)
                goto err;
 
-       /*
-        * bch_read_super() needs to happen under register_lock, so that the
-        * exclusive open is atomic with adding the new cache set to the list of
-        * cache sets:
-        */
-       mutex_lock(&bch_register_lock);
-
        for (i = 0; i < nr_devices; i++) {
                err = bch_read_super(&sb[i], opts, devices[i]);
                if (err)
-                       goto err_unlock;
+                       goto err;
 
                err = "attempting to register backing device";
                if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version)))
-                       goto err_unlock;
+                       goto err;
 
                err = bch_validate_cache_super(&sb[i]);
                if (err)
-                       goto err_unlock;
+                       goto err;
        }
 
-       err = "cache set already registered";
-       if (bch_fs_lookup(sb->sb->uuid))
-               goto err_unlock;
-
        err = "cannot allocate memory";
        c = bch_fs_alloc(sb[0].sb, opts);
        if (!c)
-               goto err_unlock;
+               goto err;
 
        for (i = 0; i < nr_devices; i++) {
                err = bch_dev_alloc(&sb[i], c, NULL);
                if (err)
-                       goto err_unlock;
+                       goto err;
        }
 
        err = "insufficient devices";
        if (bch_fs_nr_online_devices(c) != bch_fs_nr_devices(c))
-               goto err_unlock;
+               goto err;
 
-       err = bch_fs_start(c);
-       if (err)
-               goto err_unlock;
+       if (!c->opts.nostart) {
+               err = __bch_fs_start(c);
+               if (err)
+                       goto err;
+       }
 
-       err = "error creating kobject";
-       if (bch_fs_online(c))
-               goto err_unlock;
+       err = bch_fs_online(c);
+       if (err)
+               goto err;
 
-       if (ret) {
-               closure_get(&c->cl);
+       if (ret)
                *ret = c;
-       }
-
-       mutex_unlock(&bch_register_lock);
+       else
+               closure_put(&c->cl);
 
        err = NULL;
 out:
@@ -1717,20 +1696,18 @@ out:
        if (err)
                c = NULL;
        return err;
-err_unlock:
+err:
        if (c)
                bch_fs_stop(c);
-       mutex_unlock(&bch_register_lock);
-err:
+
        for (i = 0; i < nr_devices; i++)
                bch_free_super(&sb[i]);
        goto out;
 }
 
 static const char *__bch_fs_open_incremental(struct bcache_superblock *sb,
-                                 struct bch_opts opts)
+                                            struct bch_opts opts)
 {
-       char name[BDEVNAME_SIZE];
        const char *err;
        struct cache_set *c;
        bool allocated_cache_set = false;
@@ -1739,17 +1716,19 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb,
        if (err)
                return err;
 
-       bdevname(sb->bdev, name);
-
+       mutex_lock(&bch_register_lock);
        c = bch_fs_lookup(sb->sb->uuid);
        if (c) {
+               closure_get(&c->cl);
+
                err = bch_dev_in_fs(sb->sb, c);
                if (err)
-                       return err;
+                       goto err;
        } else {
                c = bch_fs_alloc(sb->sb, opts);
+               err = "cannot allocate memory";
                if (!c)
-                       return "cannot allocate memory";
+                       goto err;
 
                allocated_cache_set = true;
        }
@@ -1758,21 +1737,29 @@ static const char *__bch_fs_open_incremental(struct bcache_superblock *sb,
        if (err)
                goto err;
 
-       if (bch_fs_nr_online_devices(c) == bch_fs_nr_devices(c)) {
-               err = bch_fs_start(c);
+       if (bch_fs_nr_online_devices(c) == bch_fs_nr_devices(c) &&
+           !c->opts.nostart) {
+               err = __bch_fs_start(c);
                if (err)
                        goto err;
-       } else {
-               err = "error creating kobject";
-               if (bch_fs_online(c))
-                       goto err;
        }
 
-       bch_info(c, "started");
+       err = __bch_fs_online(c);
+       if (err)
+               goto err;
+
+       closure_put(&c->cl);
+       mutex_unlock(&bch_register_lock);
+
        return NULL;
 err:
+       mutex_unlock(&bch_register_lock);
+
        if (allocated_cache_set)
                bch_fs_stop(c);
+       else if (c)
+               closure_put(&c->cl);
+
        return err;
 }
 
@@ -1782,20 +1769,20 @@ const char *bch_fs_open_incremental(const char *path)
        struct bch_opts opts = bch_opts_empty();
        const char *err;
 
-       mutex_lock(&bch_register_lock);
-
        err = bch_read_super(&sb, opts, path);
        if (err)
-               goto err;
+               return err;
 
-       if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version)))
+       if (__SB_IS_BDEV(le64_to_cpu(sb.sb->version))) {
+               mutex_lock(&bch_register_lock);
                err = bch_backing_dev_register(&sb);
-       else
+               mutex_unlock(&bch_register_lock);
+       } else {
                err = __bch_fs_open_incremental(&sb, opts);
+       }
 
        bch_free_super(&sb);
-err:
-       mutex_unlock(&bch_register_lock);
+
        return err;
 }
 
@@ -1854,10 +1841,10 @@ static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
                        pr_info("Setting all devices read only:");
 
                list_for_each_entry(c, &bch_fs_list, list)
-                       bch_fs_read_only(c);
+                       bch_fs_read_only_async(c);
 
                list_for_each_entry(c, &bch_fs_list, list)
-                       bch_fs_read_only_sync(c);
+                       bch_fs_read_only(c);
 
                mutex_unlock(&bch_register_lock);
        }
@@ -1882,7 +1869,7 @@ kobj_attribute_write(reboot,              reboot_test);
 static void bcache_exit(void)
 {
        bch_debug_exit();
-       bch_fs_exit();
+       bch_vfs_exit();
        bch_blockdev_exit();
        bch_chardev_exit();
        if (bcache_kset)
@@ -1917,7 +1904,7 @@ static int __init bcache_init(void)
            sysfs_create_files(&bcache_kset->kobj, files) ||
            bch_chardev_init() ||
            bch_blockdev_init() ||
-           bch_fs_init() ||
+           bch_vfs_init() ||
            bch_debug_init())
                goto err;
 
index bcf7d9837aae04d99e9eb84bfd4a38e052223e49..bafd88e087e92128fd55ea5a192dc77566d56572 100644 (file)
@@ -57,27 +57,11 @@ static inline struct cache *bch_get_next_cache(struct cache_set *c,
 static inline bool bch_dev_may_remove(struct cache *ca)
 {
        struct cache_set *c = ca->set;
-       struct cache_group *tier = &c->cache_tiers[ca->mi.tier];
-
-       /*
-        * Right now, we can't remove the last device from a tier,
-        * - For tier 0, because all metadata lives in tier 0 and because
-        *   there is no way to have foreground writes go directly to tier 1.
-        * - For tier 1, because the code doesn't completely support an
-        *   empty tier 1.
-        */
-
-       /*
-        * Turning a device read-only removes it from the cache group,
-        * so there may only be one read-write device in a tier, and yet
-        * the device we are removing is in the same tier, so we have
-        * to check for identity.
-        * Removing the last RW device from a tier requires turning the
-        * whole cache set RO.
-        */
-
-       return tier->nr_devices != 1 ||
-               rcu_access_pointer(tier->d[0].dev) != ca;
+       struct cache_group *grp = &c->cache_all;
+
+       /* Can't remove the last RW device: */
+       return grp->nr != 1 ||
+               rcu_access_pointer(grp->d[0].dev) != ca;
 }
 
 void bch_dev_release(struct kobject *);
@@ -89,15 +73,15 @@ int bch_dev_add(struct cache_set *, const char *);
 
 void bch_fs_detach(struct cache_set *);
 
-bool bch_fs_read_only(struct cache_set *);
 bool bch_fs_emergency_read_only(struct cache_set *);
-void bch_fs_read_only_sync(struct cache_set *);
+void bch_fs_read_only(struct cache_set *);
 const char *bch_fs_read_write(struct cache_set *);
 
 void bch_fs_release(struct kobject *);
+void bch_fs_stop_async(struct cache_set *);
 void bch_fs_stop(struct cache_set *);
-void bch_fs_stop_sync(struct cache_set *);
 
+const char *bch_fs_start(struct cache_set *);
 const char *bch_fs_open(char * const *, unsigned, struct bch_opts,
                        struct cache_set **);
 const char *bch_fs_open_incremental(const char *path);
index 41eaf0dd50d28565868b71987e96c826df471dff..69c747de65fb9c1ec04efaa9d768fe2c8ce90807 100644 (file)
@@ -6,6 +6,7 @@ struct bcache_superblock {
        struct block_device     *bdev;
        struct bio              *bio;
        unsigned                page_order;
+       fmode_t                 mode;
 };
 
 #endif /* _BCACHE_SUPER_TYPES_H */
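
The new mode field records how the device was opened; the bch_read_super() hunk earlier derives it from the nochanges and noexcl options before calling bch_blkdev_open(). A small stand-alone sketch of that mapping, with plain flag macros standing in for FMODE_* and the opt_defined() handling omitted:

#include <stdbool.h>
#include <stdio.h>

#define MODE_READ       0x1
#define MODE_WRITE      0x2
#define MODE_EXCL       0x4

struct opts { bool nochanges; bool noexcl; };

static unsigned sb_open_mode(struct opts opts)
{
        unsigned mode = MODE_READ;

        if (!opts.noexcl)
                mode |= MODE_EXCL;      /* hold the device exclusively */
        if (!opts.nochanges)
                mode |= MODE_WRITE;     /* superblock writes allowed */

        return mode;
}

int main(void)
{
        struct opts ro = { .nochanges = true, .noexcl = false };

        printf("%#x\n", sb_open_mode(ro));      /* 0x5: READ|EXCL */
        return 0;
}
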
index 9f45a6b0f4ba72e8cdaaf5dc7aed9a8b6929dd16..48f9f1f67ed787957d9e5658edb513a203789e1d 100644 (file)
@@ -22,6 +22,7 @@
 #include "opts.h"
 #include "request.h"
 #include "super-io.h"
+#include "tier.h"
 #include "writeback.h"
 
 #include <linux/blkdev.h>
@@ -121,6 +122,8 @@ rw_attribute(cache_replacement_policy);
 rw_attribute(foreground_write_ratelimit_enabled);
 rw_attribute(copy_gc_enabled);
 sysfs_pd_controller_attribute(copy_gc);
+
+rw_attribute(tier);
 rw_attribute(tiering_enabled);
 rw_attribute(tiering_percent);
 sysfs_pd_controller_attribute(tiering);
@@ -134,7 +137,6 @@ rw_attribute(foreground_target_percent);
 rw_attribute(size);
 read_attribute(meta_replicas_have);
 read_attribute(data_replicas_have);
-read_attribute(tier);
 
 #define BCH_DEBUG_PARAM(name, description)                             \
        rw_attribute(name);
@@ -680,7 +682,8 @@ SHOW(bch_fs)
 
        sysfs_printf(tiering_enabled,           "%i", c->tiering_enabled);
        sysfs_print(tiering_percent,            c->tiering_percent);
-       sysfs_pd_controller_show(tiering,       &c->tiering_pd);
+
+       sysfs_pd_controller_show(tiering,       &c->tiers[1].pd); /* XXX */
 
        sysfs_printf(meta_replicas_have, "%u",  c->sb.meta_replicas_have);
        sysfs_printf(data_replicas_have, "%u",  c->sb.data_replicas_have);
@@ -694,7 +697,7 @@ SHOW(bch_fs)
        BCH_DEBUG_PARAMS()
 #undef BCH_DEBUG_PARAM
 
-       if (!test_bit(BCH_FS_RUNNING, &c->flags))
+       if (!bch_fs_running(c))
                return -EPERM;
 
        if (attr == &sysfs_bset_tree_stats)
@@ -723,7 +726,7 @@ STORE(__bch_fs)
        }
 
        if (attr == &sysfs_stop) {
-               bch_fs_stop(c);
+               bch_fs_stop_async(c);
                return size;
        }
 
@@ -773,25 +776,18 @@ STORE(__bch_fs)
                ssize_t ret = strtoul_safe(buf, c->tiering_enabled)
                        ?: (ssize_t) size;
 
-               if (c->tiering_read)
-                       wake_up_process(c->tiering_read);
+               bch_tiering_start(c); /* issue wakeups */
                return ret;
        }
 
        sysfs_pd_controller_store(foreground_write, &c->foreground_write_pd);
 
-       if (attr == &sysfs_journal_flush) {
-               bch_journal_meta_async(&c->journal, NULL);
-
-               return size;
-       }
-
        sysfs_strtoul(pd_controllers_update_seconds,
                      c->pd_controllers_update_seconds);
        sysfs_strtoul(foreground_target_percent, c->foreground_target_percent);
 
        sysfs_strtoul(tiering_percent,          c->tiering_percent);
-       sysfs_pd_controller_store(tiering,      &c->tiering_pd);
+       sysfs_pd_controller_store(tiering,      &c->tiers[1].pd); /* XXX */
 
        /* Debugging: */
 
@@ -799,11 +795,14 @@ STORE(__bch_fs)
        BCH_DEBUG_PARAMS()
 #undef BCH_DEBUG_PARAM
 
-       if (!test_bit(BCH_FS_RUNNING, &c->flags))
+       if (!bch_fs_running(c))
                return -EPERM;
 
-       if (test_bit(BCH_FS_STOPPING, &c->flags))
-               return -EINTR;
+       if (attr == &sysfs_journal_flush) {
+               bch_journal_meta_async(&c->journal, NULL);
+
+               return size;
+       }
 
        if (attr == &sysfs_blockdev_volume_create) {
                u64 v = strtoi_h_or_return(buf);
@@ -836,9 +835,9 @@ STORE(bch_fs)
 {
        struct cache_set *c = container_of(kobj, struct cache_set, kobj);
 
-       mutex_lock(&bch_register_lock);
+       mutex_lock(&c->state_lock);
        size = __bch_fs_store(kobj, attr, buf, size);
-       mutex_unlock(&bch_register_lock);
+       mutex_unlock(&c->state_lock);
 
        if (attr == &sysfs_add_device) {
                char *path = kstrdup(buf, GFP_KERNEL);
@@ -1273,6 +1272,31 @@ STORE(__bch_dev)
                mutex_unlock(&c->sb_lock);
        }
 
+       if (attr == &sysfs_tier) {
+               unsigned prev_tier;
+               unsigned v = strtoul_restrict_or_return(buf,
+                                       0, BCH_TIER_MAX - 1);
+
+               mutex_lock(&c->sb_lock);
+               prev_tier = ca->mi.tier;
+
+               if (v == ca->mi.tier) {
+                       mutex_unlock(&c->sb_lock);
+                       return size;
+               }
+
+               mi = &bch_sb_get_members(c->disk_sb)->members[ca->dev_idx];
+               SET_BCH_MEMBER_TIER(mi, v);
+               bch_write_super(c);
+
+               bch_dev_group_remove(&c->tiers[prev_tier].devs, ca);
+               bch_dev_group_add(&c->tiers[ca->mi.tier].devs, ca);
+               mutex_unlock(&c->sb_lock);
+
+               bch_recalc_capacity(c);
+               bch_tiering_start(c);
+       }
+
        if (attr == &sysfs_state_rw) {
                char name[BDEVNAME_SIZE];
                const char *err = NULL;
index 4686459433e9d12725b2dde1dd79150f96ca6243..0ab1770856e598c62d534a2ac1a246fa558f01d7 100644 (file)
@@ -16,8 +16,7 @@
 #include <trace/events/bcache.h>
 
 struct tiering_state {
-       struct cache_group      *tier;
-       unsigned                tier_idx;
+       struct bch_tier         *tier;
        unsigned                sectors;
        unsigned                stripe_size;
        unsigned                dev_idx;
@@ -42,7 +41,7 @@ static bool tiering_pred(struct cache_set *c,
                mi = cache_member_info_get(c);
                extent_for_each_ptr(e, ptr)
                        if (ptr->dev < mi->nr_devices &&
-                           mi->m[ptr->dev].tier >= s->tier_idx)
+                           mi->m[ptr->dev].tier >= s->tier->idx)
                                replicas++;
                cache_member_info_put();
 
@@ -69,15 +68,15 @@ static void tier_next_device(struct cache_set *c, struct tiering_state *s)
                s->sectors = 0;
                s->dev_idx++;
 
-               spin_lock(&s->tier->lock);
-               if (s->dev_idx >= s->tier->nr_devices)
+               spin_lock(&s->tier->devs.lock);
+               if (s->dev_idx >= s->tier->devs.nr)
                        s->dev_idx = 0;
 
-               if (s->tier->nr_devices) {
-                       s->ca = s->tier->d[s->dev_idx].dev;
+               if (s->tier->devs.nr) {
+                       s->ca = s->tier->devs.d[s->dev_idx].dev;
                        percpu_ref_get(&s->ca->ref);
                }
-               spin_unlock(&s->tier->lock);
+               spin_unlock(&s->tier->devs.lock);
        }
 }
 
@@ -103,13 +102,13 @@ static int issue_tiering_move(struct cache_set *c,
  * tiering_next_cache - issue a move to write an extent to the next cache
  * device in round robin order
  */
-static s64 read_tiering(struct cache_set *c, struct cache_group *tier)
+static s64 read_tiering(struct cache_set *c, struct bch_tier *tier)
 {
        struct moving_context ctxt;
        struct tiering_state s;
        struct btree_iter iter;
        struct bkey_s_c k;
-       unsigned nr_devices = READ_ONCE(tier->nr_devices);
+       unsigned nr_devices = READ_ONCE(tier->devs.nr);
        int ret;
 
        if (!nr_devices)
@@ -119,10 +118,9 @@ static s64 read_tiering(struct cache_set *c, struct cache_group *tier)
 
        memset(&s, 0, sizeof(s));
        s.tier          = tier;
-       s.tier_idx      = tier - c->cache_tiers;
        s.stripe_size   = 2048; /* 1 mb for now */
 
-       bch_move_ctxt_init(&ctxt, &c->tiering_pd.rate,
+       bch_move_ctxt_init(&ctxt, &tier->pd.rate,
                           nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE);
        bch_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN);
 
@@ -164,8 +162,8 @@ next:
 
 static int bch_tiering_thread(void *arg)
 {
-       struct cache_set *c = arg;
-       struct cache_group *tier = &c->cache_tiers[1];
+       struct bch_tier *tier = arg;
+       struct cache_set *c = container_of(tier, struct cache_set, tiers[tier->idx]);
        struct io_clock *clock = &c->io_clock[WRITE];
        struct cache *ca;
        u64 tier_capacity, available_sectors;
@@ -176,20 +174,20 @@ static int bch_tiering_thread(void *arg)
 
        while (!kthread_should_stop()) {
                if (kthread_wait_freezable(c->tiering_enabled &&
-                                          tier->nr_devices))
+                                          tier->devs.nr))
                        break;
 
                while (1) {
-                       struct cache_group *faster_tier;
+                       struct bch_tier *faster_tier;
 
                        last = atomic_long_read(&clock->now);
 
                        tier_capacity = available_sectors = 0;
                        rcu_read_lock();
-                       for (faster_tier = c->cache_tiers;
+                       for (faster_tier = c->tiers;
                             faster_tier != tier;
                             faster_tier++) {
-                               group_for_each_cache_rcu(ca, faster_tier, i) {
+                               group_for_each_cache_rcu(ca, &faster_tier->devs, i) {
                                        tier_capacity +=
                                                (ca->mi.nbuckets -
                                                 ca->mi.first_bucket) << ca->bucket_bits;
@@ -216,32 +214,73 @@ static int bch_tiering_thread(void *arg)
        return 0;
 }
 
-void bch_tiering_init_cache_set(struct cache_set *c)
+static void __bch_tiering_stop(struct bch_tier *tier)
 {
-       bch_pd_controller_init(&c->tiering_pd);
+       tier->pd.rate.rate = UINT_MAX;
+       bch_ratelimit_reset(&tier->pd.rate);
+
+       if (tier->migrate)
+               kthread_stop(tier->migrate);
+
+       tier->migrate = NULL;
+}
+
+void bch_tiering_stop(struct cache_set *c)
+{
+       struct bch_tier *tier;
+
+       for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++)
+               __bch_tiering_stop(tier);
+}
+
+static int __bch_tiering_start(struct bch_tier *tier)
+{
+       if (!tier->migrate) {
+               struct task_struct *p =
+                       kthread_create(bch_tiering_thread, tier,
+                                      "bch_tier[%u]", tier->idx);
+               if (IS_ERR(p))
+                       return PTR_ERR(p);
+
+               tier->migrate = p;
+       }
+
+       wake_up_process(tier->migrate);
+       return 0;
 }
 
-int bch_tiering_read_start(struct cache_set *c)
+int bch_tiering_start(struct cache_set *c)
 {
-       struct task_struct *t;
+       struct bch_tier *tier;
+       bool have_faster_tier = false;
 
        if (c->opts.nochanges)
                return 0;
 
-       t = kthread_create(bch_tiering_thread, c, "bch_tier_read");
-       if (IS_ERR(t))
-               return PTR_ERR(t);
+       for (tier = c->tiers; tier < c->tiers + ARRAY_SIZE(c->tiers); tier++) {
+               if (!tier->devs.nr)
+                       continue;
 
-       c->tiering_read = t;
-       wake_up_process(c->tiering_read);
+               if (have_faster_tier) {
+                       int ret = __bch_tiering_start(tier);
+                       if (ret)
+                               return ret;
+               } else {
+                       __bch_tiering_stop(tier);
+               }
+
+               have_faster_tier = true;
+       }
 
        return 0;
 }
 
-void bch_tiering_read_stop(struct cache_set *c)
+void bch_fs_tiering_init(struct cache_set *c)
 {
-       if (!IS_ERR_OR_NULL(c->tiering_read)) {
-               kthread_stop(c->tiering_read);
-               c->tiering_read = NULL;
+       unsigned i;
+
+       for (i = 0; i < ARRAY_SIZE(c->tiers); i++) {
+               c->tiers[i].idx = i;
+               bch_pd_controller_init(&c->tiers[i].pd);
        }
 }
diff --git a/libbcache/tier.h b/libbcache/tier.h
index 89c2bffde9575fe7264de0026e7af161e0b6ab24..b53e83d96b94685033f29efe28883f9bfa0d0625 100644 (file)
--- a/libbcache/tier.h
+++ b/libbcache/tier.h
@@ -1,8 +1,8 @@
 #ifndef _BCACHE_TIER_H
 #define _BCACHE_TIER_H
 
-void bch_tiering_init_cache_set(struct cache_set *);
-int bch_tiering_read_start(struct cache_set *);
-void bch_tiering_read_stop(struct cache_set *);
+void bch_tiering_stop(struct cache_set *);
+int bch_tiering_start(struct cache_set *);
+void bch_fs_tiering_init(struct cache_set *);
 
 #endif
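
The tier API now follows an init/start/stop split, with one migrate thread per populated slower tier instead of a single "bch_tier_read" thread. A minimal sketch of the intended call order; the wrapper function and its call sites are assumptions for illustration, not part of this commit:

/* Sketch only: where these are called from is an assumption, not taken from this diff. */
static int tiering_lifecycle_example(struct cache_set *c)
{
	int ret;

	/* at fs allocation: set c->tiers[i].idx and init each tier's pd controller */
	bch_fs_tiering_init(c);

	/* when going read-write: spawn a "bch_tier[%u]" thread for each populated slower tier */
	ret = bch_tiering_start(c);
	if (ret)
		return ret;

	/* on shutdown: reset rate limits and stop the migrate threads */
	bch_tiering_stop(c);
	return 0;
}
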
diff --git a/linux/blkdev.c b/linux/blkdev.c
index 0bae9b0d2c29bc91bbea7dbd4262dff122a9b806..93459d0b84c92dddf414a1de94f6a3620cece485 100644 (file)
--- a/linux/blkdev.c
+++ b/linux/blkdev.c
@@ -20,8 +20,14 @@ int submit_bio_wait(struct bio *bio)
        ssize_t ret;
        unsigned i;
 
-       if (bio->bi_opf & REQ_PREFLUSH)
-               fdatasync(bio->bi_bdev->bd_fd);
+       if (bio->bi_opf & REQ_PREFLUSH) {
+               ret = fdatasync(bio->bi_bdev->bd_fd);
+               if (ret) {
+                       fprintf(stderr, "fsync error: %s\n",
+                               strerror(errno));
+                       return -EIO;
+               }
+       }
 
        i = 0;
        bio_for_each_segment(bv, bio, iter)
@@ -49,10 +55,22 @@ int submit_bio_wait(struct bio *bio)
                BUG();
        }
 
-       if (bio->bi_opf & REQ_FUA)
-               fdatasync(bio->bi_bdev->bd_fd);
+       if (ret != bio->bi_iter.bi_size) {
+               fprintf(stderr, "IO error: %li (%s)\n",
+                       ret, strerror(errno));
+               return -EIO;
+       }
 
-       return ret == bio->bi_iter.bi_size ? 0 : -EIO;
+       if (bio->bi_opf & REQ_FUA) {
+               ret = fdatasync(bio->bi_bdev->bd_fd);
+               if (ret) {
+                       fprintf(stderr, "fsync error: %s\n",
+                               strerror(errno));
+                       return -EIO;
+               }
+       }
+
+       return 0;
 }
 
 void generic_make_request(struct bio *bio)
diff --git a/qcow2.c b/qcow2.c
index cbc8d4c4004ce24d35835ed86c1416c63ea46dd5..b7aa8c26e0ca037f2d66de6057613b23a64796fd 100644 (file)
--- a/qcow2.c
+++ b/qcow2.c
@@ -2,7 +2,6 @@
 #include <errno.h>
 #include <sys/types.h>
 #include <unistd.h>
-#include <linux/sort.h>
 
 #include "qcow2.h"
 #include "tools-util.h"
@@ -69,18 +68,7 @@ static void add_l2(struct qcow2_image *img, u64 src_blk, u64 dst_offset)
        img->l2_table[l2_index] = cpu_to_be64(dst_offset|QCOW_OFLAG_COPIED);
 }
 
-static int range_cmp(const void *_l, const void *_r)
-{
-       const struct range *l = _l, *r = _r;
-
-       if (l->start < r->start)
-               return -1;
-       if (l->start > r->start)
-               return  1;
-       return 0;
-}
-
-void qcow2_write_image(int infd, int outfd, sparse_data *data,
+void qcow2_write_image(int infd, int outfd, ranges *data,
                       unsigned block_size)
 {
        u64 image_size = get_size(NULL, infd);
@@ -98,30 +86,11 @@ void qcow2_write_image(int infd, int outfd, sparse_data *data,
        struct range *r;
        char *buf = xmalloc(block_size);
        u64 src_offset, dst_offset;
-       sparse_data m;
 
        assert(is_power_of_2(block_size));
 
-       sort(&darray_item(*data, 0),
-            darray_size(*data),
-            sizeof(darray_item(*data, 0)),
-            range_cmp, NULL);
-
-       /* Round to blocksize, merge contiguous ranges: */
-       darray_init(m);
-       darray_foreach(r, *data) {
-               struct range *l = m.size ?  &m.item[m.size - 1] : NULL;
-
-               r->start = round_down(r->start, block_size);
-               r->end  = round_up(r->end, block_size);
-
-               if (l && l->end >= r->start)
-                       l->end = max(l->end, r->end);
-               else
-                       darray_append(m, *r);
-       }
-       darray_free(*data);
-       *data = m;
+       ranges_roundup(data, block_size);
+       ranges_sort_merge(data);
 
        /* Write data: */
        darray_foreach(r, *data)
diff --git a/qcow2.h b/qcow2.h
index c6f0b6ba7eb69be5ae103e29ab705db6c79e0f9d..0943d55cdf71647b037857a27f80a4b7623173b8 100644 (file)
--- a/qcow2.h
+++ b/qcow2.h
@@ -2,23 +2,8 @@
 #define _QCOW2_H
 
 #include <linux/types.h>
-#include "ccan/darray/darray.h"
+#include "tools-util.h"
 
-struct range {
-       u64             start;
-       u64             end;
-};
-
-typedef darray(struct range) sparse_data;
-
-static inline void data_add(sparse_data *data, u64 offset, u64 size)
-{
-       darray_append(*data, (struct range) {
-               .start = offset,
-               .end = offset + size
-       });
-}
-
-void qcow2_write_image(int, int, sparse_data *, unsigned);
+void qcow2_write_image(int, int, ranges *, unsigned);
 
 #endif /* _QCOW2_H */
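
With struct range and the ranges darray moved into tools-util.h, a qcow2_write_image() caller just records raw byte ranges and lets the shared helpers round and merge them. A rough sketch of that usage; the function name, offsets and 4096-byte block size below are illustrative assumptions:

/* Sketch of a qcow2_write_image() caller; offsets and block size are made up. */
static void qcow2_dump_example(int infd, int outfd)
{
	unsigned block_size = 4096;
	ranges data;

	darray_init(data);

	/* record the byte ranges worth copying, e.g. 4k of metadata at offset 4096: */
	range_add(&data, 4096, 4096);

	/* qcow2_write_image() now does ranges_roundup() + ranges_sort_merge() itself: */
	qcow2_write_image(infd, outfd, &data, block_size);

	darray_free(data);
}
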
diff --git a/tools-util.c b/tools-util.c
index 0a95fbe917413608b51a270b585ad87df23a3d1e..07fb82d141ac9955d6eb82b52c1922c5aa48f9cb 100644 (file)
--- a/tools-util.c
+++ b/tools-util.c
@@ -1,4 +1,3 @@
-#include <alloca.h>
 #include <assert.h>
 #include <ctype.h>
 #include <errno.h>
@@ -19,6 +18,7 @@
 #include "ccan/crc/crc.h"
 
 #include "linux/bcache-ioctl.h"
+#include "linux/sort.h"
 #include "tools-util.h"
 #include "util.h"
 
@@ -59,20 +59,12 @@ struct units_buf __pr_units(u64 v, enum units units)
 
 char *read_file_str(int dirfd, const char *path)
 {
-       int fd = openat(dirfd, path, O_RDONLY);
+       int fd = xopenat(dirfd, path, O_RDONLY);
+       size_t len = xfstat(fd).st_size;
 
-       if (fd < 0)
-               die("Unable to open %s\n", path);
+       char *buf = malloc(len + 1);
 
-       struct stat statbuf;
-       if (fstat(fd, &statbuf) < 0)
-               die("fstat error\n");
-
-       char *buf = malloc(statbuf.st_size + 1);
-
-       int len = read(fd, buf, statbuf.st_size);
-       if (len < 0)
-               die("read error while reading from file %s\n", path);
+       xpread(fd, buf, len, 0);
 
        buf[len] = '\0';
        if (len && buf[len - 1] == '\n')
@@ -107,48 +99,33 @@ ssize_t read_string_list_or_die(const char *opt, const char * const list[],
 /* Returns size of file or block device: */
 u64 get_size(const char *path, int fd)
 {
-       struct stat statbuf;
-       u64 ret;
-
-       if (fstat(fd, &statbuf))
-               die("Error statting %s: %s", path, strerror(errno));
+       struct stat statbuf = xfstat(fd);
 
        if (!S_ISBLK(statbuf.st_mode))
                return statbuf.st_size;
 
-       if (ioctl(fd, BLKGETSIZE64, &ret))
-               die("Error getting block device size on %s: %s\n",
-                   path, strerror(errno));
-
+       u64 ret;
+       xioctl(fd, BLKGETSIZE64, &ret);
        return ret;
 }
 
 /* Returns blocksize in units of 512 byte sectors: */
 unsigned get_blocksize(const char *path, int fd)
 {
-       struct stat statbuf;
-       if (fstat(fd, &statbuf))
-               die("Error statting %s: %s", path, strerror(errno));
+       struct stat statbuf = xfstat(fd);
 
        if (!S_ISBLK(statbuf.st_mode))
                return statbuf.st_blksize >> 9;
 
        unsigned ret;
-       if (ioctl(fd, BLKPBSZGET, &ret))
-               die("Error getting blocksize on %s: %s\n",
-                   path, strerror(errno));
-
+       xioctl(fd, BLKPBSZGET, &ret);
        return ret >> 9;
 }
 
 /* Global control device: */
 int bcachectl_open(void)
 {
-       int fd = open("/dev/bcache-ctl", O_RDWR);
-       if (fd < 0)
-               die("Can't open bcache device: %s", strerror(errno));
-
-       return fd;
+       return xopen("/dev/bcache-ctl", O_RDWR);
 }
 
 /* Filesystem handles (ioctl, sysfs dir): */
@@ -162,47 +139,29 @@ struct bcache_handle bcache_fs_open(const char *path)
 
        if (!uuid_parse(path, tmp)) {
                /* It's a UUID, look it up in sysfs: */
-
-               char *sysfs = alloca(strlen(SYSFS_BASE) + strlen(path) + 1);
-               sprintf(sysfs, "%s%s", SYSFS_BASE, path);
-
-               ret.sysfs_fd = open(sysfs, O_RDONLY);
-               if (!ret.sysfs_fd)
-                       die("Unable to open %s\n", path);
+               char *sysfs = mprintf("%s%s", SYSFS_BASE, path);
+               ret.sysfs_fd = xopen(sysfs, O_RDONLY);
 
                char *minor = read_file_str(ret.sysfs_fd, "minor");
-               char *ctl = alloca(20 + strlen(minor));
+               char *ctl = mprintf("/dev/bcache%s-ctl", minor);
+               ret.ioctl_fd = xopen(ctl, O_RDWR);
 
-               sprintf(ctl, "/dev/bcache%s-ctl", minor);
+               free(sysfs);
                free(minor);
-
-               ret.ioctl_fd = open(ctl, O_RDWR);
-               if (ret.ioctl_fd < 0)
-                       die("Error opening control device: %s\n",
-                           strerror(errno));
+               free(ctl);
        } else {
                /* It's a path: */
-
-               ret.ioctl_fd = open(path, O_RDONLY);
-               if (ret.ioctl_fd < 0)
-                       die("Error opening %s: %s\n",
-                           path, strerror(errno));
+               ret.ioctl_fd = xopen(path, O_RDONLY);
 
                struct bch_ioctl_query_uuid uuid;
-               if (ioctl(ret.ioctl_fd, BCH_IOCTL_QUERY_UUID, &uuid))
-                       die("ioctl error (not a bcache fs?): %s\n",
-                           strerror(errno));
+               xioctl(ret.ioctl_fd, BCH_IOCTL_QUERY_UUID, &uuid);
 
                char uuid_str[40];
                uuid_unparse(uuid.uuid.b, uuid_str);
 
-               char *sysfs = alloca(strlen(SYSFS_BASE) + strlen(uuid_str) + 1);
-               sprintf(sysfs, "%s%s", SYSFS_BASE, uuid_str);
-
-               ret.sysfs_fd = open(sysfs, O_RDONLY);
-               if (ret.sysfs_fd < 0)
-                       die("Unable to open sysfs dir %s: %s\n",
-                           sysfs, strerror(errno));
+               char *sysfs = mprintf("%s%s", SYSFS_BASE, uuid_str);
+               ret.sysfs_fd = xopen(sysfs, O_RDONLY);
+               free(sysfs);
        }
 
        return ret;
@@ -225,3 +184,89 @@ bool ask_yn(void)
        free(buf);
        return ret;
 }
+
+static int range_cmp(const void *_l, const void *_r)
+{
+       const struct range *l = _l, *r = _r;
+
+       if (l->start < r->start)
+               return -1;
+       if (l->start > r->start)
+               return  1;
+       return 0;
+}
+
+void ranges_sort_merge(ranges *r)
+{
+       struct range *t, *i;
+       ranges tmp = { NULL };
+
+       sort(&darray_item(*r, 0), darray_size(*r),
+            sizeof(darray_item(*r, 0)), range_cmp, NULL);
+
+       /* Merge contiguous ranges: */
+       darray_foreach(i, *r) {
+               t = tmp.size ?  &tmp.item[tmp.size - 1] : NULL;
+
+               if (t && t->end >= i->start)
+                       t->end = max(t->end, i->end);
+               else
+                       darray_append(tmp, *i);
+       }
+
+       darray_free(*r);
+       *r = tmp;
+}
+
+void ranges_roundup(ranges *r, unsigned block_size)
+{
+       struct range *i;
+
+       darray_foreach(i, *r) {
+               i->start = round_down(i->start, block_size);
+               i->end  = round_up(i->end, block_size);
+       }
+}
+
+void ranges_rounddown(ranges *r, unsigned block_size)
+{
+       struct range *i;
+
+       darray_foreach(i, *r) {
+               i->start = round_up(i->start, block_size);
+               i->end  = round_down(i->end, block_size);
+               i->end  = max(i->end, i->start);
+       }
+}
+
+struct fiemap_extent fiemap_iter_next(struct fiemap_iter *iter)
+{
+       struct fiemap_extent e;
+
+       BUG_ON(iter->idx > iter->f.fm_mapped_extents);
+
+       if (iter->idx == iter->f.fm_mapped_extents) {
+               xioctl(iter->fd, FS_IOC_FIEMAP, &iter->f);
+
+               if (!iter->f.fm_mapped_extents)
+                       return (struct fiemap_extent) { .fe_length = 0 };
+
+               iter->idx = 0;
+       }
+
+       e = iter->f.fm_extents[iter->idx++];
+       BUG_ON(!e.fe_length);
+
+       iter->f.fm_start = e.fe_logical + e.fe_length;
+
+       return e;
+}
+
+const char *strcmp_prefix(const char *a, const char *a_prefix)
+{
+       while (*a_prefix && *a == *a_prefix) {
+               a++;
+               a_prefix++;
+       }
+       return *a_prefix ? NULL : a;
+}
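
The range helpers hoisted out of qcow2.c keep the old behaviour: round each range out to the block size, then sort and coalesce anything that now overlaps or touches. A small standalone illustration; the values are made up:

/* Illustration of ranges_roundup() + ranges_sort_merge(); the numbers are arbitrary. */
static void ranges_merge_example(void)
{
	ranges r;

	darray_init(r);

	range_add(&r, 100, 50);		/* [100, 150) */
	range_add(&r, 4200, 10);	/* [4200, 4210) */

	ranges_roundup(&r, 4096);	/* -> [0, 4096) and [4096, 8192) */
	ranges_sort_merge(&r);		/* now contiguous, so they coalesce into [0, 8192) */

	darray_free(r);
}
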
diff --git a/tools-util.h b/tools-util.h
index 09f00efe0b8e3e8f00e9b271cd6bc9a454cc67fc..1aac56ae04f9c5a0a49037f58d1614bb636e94c9 100644 (file)
--- a/tools-util.h
+++ b/tools-util.h
@@ -5,21 +5,31 @@
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <sys/stat.h>
 #include <sys/types.h>
 #include <unistd.h>
 
+#include <linux/bug.h>
 #include <linux/byteorder.h>
 #include <linux/kernel.h>
 #include <linux/log2.h>
 #include <linux/string.h>
 #include <linux/types.h>
+#include "ccan/darray/darray.h"
 
-#define die(arg, ...)                                  \
-do {                                                   \
-       fprintf(stderr, arg "\n", ##__VA_ARGS__);       \
-       exit(EXIT_FAILURE);                             \
+#define die(arg, ...)                                                  \
+do {                                                                   \
+       fprintf(stderr, arg "\n", ##__VA_ARGS__);                       \
+       exit(EXIT_FAILURE);                                             \
 } while (0)
 
+#define mprintf(...)                                                   \
+({                                                                     \
+       char *_str;                                                     \
+       asprintf(&_str, __VA_ARGS__);                                   \
+       _str;                                                           \
+})
+
 static inline void *xcalloc(size_t count, size_t size)
 {
        void *p = calloc(count, size);
@@ -57,6 +67,38 @@ static inline void xpwrite(int fd, const void *buf, size_t count, off_t offset)
                die("write error (ret %zi err %s)", r, strerror(errno));
 }
 
+#define xopenat(_dirfd, _path, ...)                                    \
+({                                                                     \
+       int _fd = openat((_dirfd), (_path), __VA_ARGS__);               \
+       if (_fd < 0)                                                    \
+               die("Error opening %s: %s", (_path), strerror(errno));  \
+       _fd;                                                            \
+})
+
+#define xopen(...)     xopenat(AT_FDCWD, __VA_ARGS__)
+
+static inline struct stat xfstatat(int dirfd, const char *path, int flags)
+{
+       struct stat stat;
+       if (fstatat(dirfd, path, &stat, flags))
+               die("stat error: %s", strerror(errno));
+       return stat;
+}
+
+static inline struct stat xfstat(int fd)
+{
+       struct stat stat;
+       if (fstat(fd, &stat))
+               die("stat error: %s", strerror(errno));
+       return stat;
+}
+
+#define xioctl(_fd, _nr, ...)                                          \
+do {                                                                   \
+       if (ioctl((_fd), (_nr), ##__VA_ARGS__))                         \
+               die(#_nr " ioctl error: %s", strerror(errno));          \
+} while (0)
+
 enum units {
        BYTES,
        SECTORS,
@@ -91,4 +133,74 @@ struct bcache_handle bcache_fs_open(const char *);
 
 bool ask_yn(void);
 
+struct range {
+       u64             start;
+       u64             end;
+};
+
+typedef darray(struct range) ranges;
+
+static inline void range_add(ranges *data, u64 offset, u64 size)
+{
+       darray_append(*data, (struct range) {
+               .start = offset,
+               .end = offset + size
+       });
+}
+
+void ranges_sort_merge(ranges *);
+void ranges_roundup(ranges *, unsigned);
+void ranges_rounddown(ranges *, unsigned);
+
+struct hole_iter {
+       ranges          r;
+       size_t          idx;
+       u64             end;
+};
+
+static inline struct range hole_iter_next(struct hole_iter *iter)
+{
+       struct range r = {
+               .start  = iter->idx ? iter->r.item[iter->idx - 1].end : 0,
+               .end    = iter->idx < iter->r.size
+                       ? iter->r.item[iter->idx].start : iter->end,
+       };
+
+       BUG_ON(r.start > r.end);
+
+       iter->idx++;
+       return r;
+}
+
+#define for_each_hole(_iter, _ranges, _end, _i)                                \
+       for (_iter = (struct hole_iter) { .r = _ranges, .end = _end };  \
+            (_iter.idx <= _iter.r.size &&                              \
+             (_i = hole_iter_next(&_iter), true));)
+
+#include <linux/fiemap.h>
+
+struct fiemap_iter {
+       struct fiemap           f;
+       struct fiemap_extent    fe[1024];
+       unsigned                idx;
+       int                     fd;
+};
+
+static inline void fiemap_iter_init(struct fiemap_iter *iter, int fd)
+{
+       memset(iter, 0, sizeof(*iter));
+
+       iter->f.fm_extent_count = ARRAY_SIZE(iter->fe);
+       iter->f.fm_length       = FIEMAP_MAX_OFFSET;
+       iter->fd                = fd;
+}
+
+struct fiemap_extent fiemap_iter_next(struct fiemap_iter *);
+
+#define fiemap_for_each(fd, iter, extent)                              \
+       for (fiemap_iter_init(&iter, fd);                               \
+            (extent = fiemap_iter_next(&iter)).fe_length;)
+
+const char *strcmp_prefix(const char *, const char *);
+
 #endif /* _TOOLS_UTIL_H */
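
Taken together, fiemap_for_each() and for_each_hole() let a caller walk a file's allocated extents and then the gaps between them. A hedged sketch of how the two iterators compose; this wrapper and its output are illustrative, not copied from cmd_migrate.c:

/* Sketch only: list a file's mapped extents, then the holes between them. */
static void file_layout_example(int fd, u64 file_size)
{
	struct fiemap_iter fiter;
	struct fiemap_extent e;
	struct hole_iter hiter;
	struct range hole;
	ranges extents;

	darray_init(extents);

	/* collect allocated extents via FS_IOC_FIEMAP (xioctl() dies on error): */
	fiemap_for_each(fd, fiter, e)
		range_add(&extents, e.fe_logical, e.fe_length);

	/* for_each_hole() expects sorted, non-overlapping ranges: */
	ranges_sort_merge(&extents);

	for_each_hole(hiter, extents, file_size, hole)
		printf("hole: %llu-%llu\n",
		       (unsigned long long) hole.start,
		       (unsigned long long) hole.end);

	darray_free(extents);
}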