Update bcachefs sources to e99d29e402 bcachefs: zstd support, compression refactoring
author    Kent Overstreet <kent.overstreet@gmail.com>
          Fri, 16 Feb 2018 20:36:33 +0000 (15:36 -0500)
committer Kent Overstreet <kent.overstreet@gmail.com>
          Fri, 16 Feb 2018 23:13:24 +0000 (18:13 -0500)
40 files changed:
.bcachefs_revision
Makefile
cmd_migrate.c
debian/control
libbcachefs/alloc.c
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/btree_cache.c
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_leaf.c
libbcachefs/checksum.c
libbcachefs/checksum.h
libbcachefs/compress.c
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/fs-io.c
libbcachefs/io.c
libbcachefs/io.h
libbcachefs/journal.c
libbcachefs/keylist.h
libbcachefs/migrate.c
libbcachefs/migrate.h
libbcachefs/move.c
libbcachefs/move.h
libbcachefs/movinggc.c
libbcachefs/opts.c
libbcachefs/opts.h
libbcachefs/quota.c
libbcachefs/quota.h
libbcachefs/siphash.c
libbcachefs/super-io.c
libbcachefs/super-io.h
libbcachefs/super.c
libbcachefs/super.h
libbcachefs/super_types.h
libbcachefs/sysfs.c
libbcachefs/tier.c
libbcachefs/util.h

index 274236e3701e179353092fb7a86ad3c3a247036a..76acdf9367a3954946f1902ad691f692c116fae4 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-d5e561b3cc023dd247d2b3d08b680709ec21b477
+e99d29e40210f6d9b7ec9e5b7aee1e48ae7655c5
index ef1eacf7b0069aeb5def81900a9ae9f871743e90..af7a206c37283409ecbe4481414237da011da741 100644
--- a/Makefile
+++ b/Makefile
@@ -9,6 +9,7 @@ CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall                            \
        -D_GNU_SOURCE                                           \
        -D_LGPL_SOURCE                                          \
        -DRCU_MEMBARRIER                                        \
+       -DZSTD_STATIC_LINKING_ONLY                              \
        -DNO_BCACHEFS_CHARDEV                                   \
        -DNO_BCACHEFS_FS                                        \
        -DNO_BCACHEFS_SYSFS                                     \
@@ -31,9 +32,15 @@ ifdef D
 endif
 
 PKGCONFIG_LIBS="blkid uuid liburcu libsodium zlib"
+PKGCONFIG_LIBS_STATIC="libzstd"
+
 CFLAGS+=`pkg-config --cflags   ${PKGCONFIG_LIBS}`
-LDLIBS+=`pkg-config --libs     ${PKGCONFIG_LIBS}`              \
-       -lm -lpthread -lrt -lscrypt -lkeyutils -laio
+LDLIBS+=`pkg-config --libs     ${PKGCONFIG_LIBS}`
+
+CFLAGS+=`pkg-config --static --cflags  ${PKGCONFIG_LIBS_STATIC}`
+LDLIBS+=`pkg-config --static --libs    ${PKGCONFIG_LIBS_STATIC}`
+
+LDLIBS+=-lm -lpthread -lrt -lscrypt -lkeyutils -laio
 
 ifeq ($(PREFIX),/usr)
        ROOT_SBINDIR=/sbin
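
The new -DZSTD_STATIC_LINKING_ONLY define and the static pkg-config stanza go together: compress.c (below) uses zstd's advanced API (ZSTD_parameters, ZSTD_getParams(), the workspace-bound init functions), which zstd.h only declares behind that define and which upstream zstd only guarantees for statically linked consumers. A minimal userspace sketch of the gate; the 128 KiB size is an arbitrary example:

    /* build: cc demo.c `pkg-config --static --cflags --libs libzstd` */
    #define ZSTD_STATIC_LINKING_ONLY        /* exposes the advanced API */
    #include <zstd.h>
    #include <stdio.h>

    int main(void)
    {
            /* ZSTD_getParams() is only declared under the define above: */
            ZSTD_parameters params = ZSTD_getParams(0, 128 << 10, 0);

            printf("windowLog %u\n", params.cParams.windowLog);
            return 0;
    }
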
index d676bb584ab19c503fedc3f08f9e3b3c39f11c22..4ba3538d046b05dea391f35eeaafeb34614a0496 100644
--- a/cmd_migrate.c
+++ b/cmd_migrate.c
@@ -344,8 +344,8 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
                        die("error reserving space in new filesystem: %s",
                            strerror(-ret));
 
-               bch2_check_mark_super(c, BCH_DATA_USER,
-                                     bch2_bkey_devs(extent_i_to_s_c(e).s_c));
+               bch2_mark_bkey_replicas(c, BCH_DATA_USER,
+                                       extent_i_to_s_c(e).s_c);
 
                ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &e->k_i,
                                        &res, NULL, NULL, 0);
index 07f2f2f3a952235f2909fbaaf6bcbc67b28c25bc..08673f4be2c5849746b323bcc4032ea619fbc296 100644
--- a/debian/control
+++ b/debian/control
@@ -5,7 +5,7 @@ Priority: optional
 Standards-Version: 3.9.5
 Build-Depends: debhelper (>= 9), pkg-config, libblkid-dev, uuid-dev,
        libscrypt-dev, libsodium-dev, libkeyutils-dev, liburcu-dev, zlib1g-dev,
-       libattr1-dev, libaio-dev
+       libattr1-dev, libaio-dev, libzstd-dev
 Homepage: http://bcache.evilpiepirate.org/
 
 Package: bcachefs-tools
index c195ffbdf3b30a0c47e0eec2c5da5517928c047c..339ffd02c45f5a7e8a21d60e37b4cf7470246eb2 100644
--- a/libbcachefs/alloc.c
+++ b/libbcachefs/alloc.c
@@ -1201,43 +1201,56 @@ out:
        return ob - c->open_buckets;
 }
 
-struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
-                                        struct write_point *wp,
-                                        struct bch_devs_mask *devs)
+static int __dev_alloc_cmp(struct bch_fs *c,
+                          struct write_point *wp,
+                          unsigned l, unsigned r)
 {
-       struct dev_alloc_list ret = { .nr = 0 };
-       struct bch_dev *ca, *ca2;
-       unsigned i, j;
+       struct bch_dev *ca_l = rcu_dereference(c->devs[l]);
+       struct bch_dev *ca_r = rcu_dereference(c->devs[r]);
 
-       for_each_member_device_rcu(ca, c, i, devs) {
-               for (j = 0; j < ret.nr; j++) {
-                       unsigned idx = ret.devs[j];
+       if (ca_l && ca_r && ca_l->mi.tier != ca_r->mi.tier)
+               return ((ca_l->mi.tier > ca_r->mi.tier) -
+                       (ca_l->mi.tier < ca_r->mi.tier));
 
-                       ca2 = rcu_dereference(c->devs[idx]);
-                       if (!ca2)
-                               break;
+       return ((wp->next_alloc[l] > wp->next_alloc[r]) -
+               (wp->next_alloc[l] < wp->next_alloc[r]));
+}
 
-                       if (ca->mi.tier < ca2->mi.tier)
-                               break;
+#define dev_alloc_cmp(l, r) __dev_alloc_cmp(c, wp, l, r)
 
-                       if (ca->mi.tier == ca2->mi.tier &&
-                           wp->next_alloc[i] < wp->next_alloc[idx])
-                               break;
-               }
+struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
+                                        struct write_point *wp,
+                                        struct bch_devs_mask *devs)
+{
+       struct dev_alloc_list ret = { .nr = 0 };
+       struct bch_dev *ca;
+       unsigned i;
 
-               array_insert_item(ret.devs, ret.nr, j, i);
-       }
+       for_each_member_device_rcu(ca, c, i, devs)
+               ret.devs[ret.nr++] = i;
 
+       bubble_sort(ret.devs, ret.nr, dev_alloc_cmp);
        return ret;
 }
 
 void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
                     struct write_point *wp)
 {
-       unsigned i;
+       u64 *v = wp->next_alloc + ca->dev_idx;
+       u64 free_space = dev_buckets_free(c, ca);
+       u64 free_space_inv = free_space
+               ? div64_u64(1ULL << 48, free_space)
+               : 1ULL << 48;
+       u64 scale = *v / 4;
+
+       if (*v + free_space_inv >= *v)
+               *v += free_space_inv;
+       else
+               *v = U64_MAX;
 
-       for (i = 0; i < ARRAY_SIZE(wp->next_alloc); i++)
-               wp->next_alloc[i] >>= 1;
+       for (v = wp->next_alloc;
+            v < wp->next_alloc + ARRAY_SIZE(wp->next_alloc); v++)
+               *v = *v < scale ? 0 : *v - scale;
 }
 
 static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
@@ -1249,7 +1262,6 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
 {
        enum bucket_alloc_ret ret = NO_DEVICES;
        struct dev_alloc_list devs_sorted;
-       u64 buckets_free;
        unsigned i;
 
        BUG_ON(nr_replicas > ARRAY_SIZE(wp->ptrs));
@@ -1281,13 +1293,6 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
                BUG_ON(wp->nr_ptrs >= ARRAY_SIZE(wp->ptrs));
                wp->ptrs[wp->nr_ptrs++] = c->open_buckets + ob;
 
-               buckets_free = dev_buckets_free(c, ca);
-               if (buckets_free)
-                       wp->next_alloc[ca->dev_idx] +=
-                               div64_u64(U64_MAX, buckets_free *
-                                         ca->mi.bucket_size);
-               else
-                       wp->next_alloc[ca->dev_idx] = U64_MAX;
                bch2_wp_rescale(c, ca, wp);
 
                __clear_bit(ca->dev_idx, devs->d);
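
The rewritten write-point code behaves like a stride scheduler: bch2_wp_alloc_list() sorts candidate devices by (tier, next_alloc counter) — the `(a > b) - (a < b)` idiom in __dev_alloc_cmp() yields -1/0/1 without overflow — and bch2_wp_rescale() advances the winner's counter by roughly 2^48 / free_space, then decays every counter by a quarter of the winner's value. Devices with more free space therefore get smaller strides and win proportionally more often. A standalone toy model of that behavior (device count and free-space figures invented):

    #include <stdint.h>
    #include <stdio.h>

    #define NR_DEVS 3

    static uint64_t next_alloc[NR_DEVS];
    static const uint64_t free_space[NR_DEVS] = { 1000, 500, 250 };

    static int pick_dev(void)       /* lowest counter wins */
    {
            int i, best = 0;

            for (i = 1; i < NR_DEVS; i++)
                    if (next_alloc[i] < next_alloc[best])
                            best = i;
            return best;
    }

    int main(void)
    {
            unsigned picks[NR_DEVS] = { 0 };
            int i, n;

            for (n = 0; n < 7000; n++) {
                    int d = pick_dev();
                    uint64_t inv = (1ULL << 48) / free_space[d];
                    uint64_t scale = next_alloc[d] / 4;

                    picks[d]++;

                    /* saturating add, as in bch2_wp_rescale(): */
                    next_alloc[d] = next_alloc[d] + inv >= next_alloc[d]
                            ? next_alloc[d] + inv : UINT64_MAX;

                    /* decay all counters by a quarter of the winner's: */
                    for (i = 0; i < NR_DEVS; i++)
                            next_alloc[i] -= next_alloc[i] < scale
                                    ? next_alloc[i] : scale;
            }

            /* expect picks roughly proportional to free space: */
            for (i = 0; i < NR_DEVS; i++)
                    printf("dev %d: %u picks\n", i, picks[i]);
            return 0;
    }
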
index cb9906c5bd22ba4aa96bf8f151840441228b844b..5a3e99b3b79a53ed770e4edc0b4c11a21832448b 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
 #include <linux/shrinker.h>
 #include <linux/types.h>
 #include <linux/workqueue.h>
+#include <linux/zstd.h>
 
 #include "bcachefs_format.h"
 #include "bset.h"
@@ -231,6 +232,12 @@ do {                                                                       \
                bch_info(c, fmt, ##__VA_ARGS__);                        \
 } while (0)
 
+#define pr_verbose_init(opts, fmt, ...)                                        \
+do {                                                                   \
+       if (opt_get(opts, verbose_init))                                \
+               pr_info(fmt, ##__VA_ARGS__);                            \
+} while (0)
+
 /* Parameters that are useful for debugging, but should always be compiled in: */
 #define BCH_DEBUG_PARAMS_ALWAYS()                                      \
        BCH_DEBUG_PARAM(key_merging_disabled,                           \
@@ -646,10 +653,10 @@ struct bch_fs {
        struct mutex            bio_bounce_pages_lock;
        mempool_t               bio_bounce_pages;
 
-       mempool_t               lz4_workspace_pool;
-       void                    *zlib_workspace;
-       struct mutex            zlib_workspace_lock;
        mempool_t               compression_bounce[2];
+       mempool_t               compress_workspace[BCH_COMPRESSION_NR];
+       mempool_t               decompress_workspace;
+       ZSTD_parameters         zstd_params;
 
        struct crypto_shash     *sha256;
        struct crypto_skcipher  *chacha20;
index 854e1c3db5aff235b05b1606f91858b953a6255e..5e406275d5f688defcc9733af3ac44825ae22b45 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -6,7 +6,6 @@
  */
 
 #include <asm/types.h>
-#include <linux/compiler.h>
 #include <asm/byteorder.h>
 #include <linux/uuid.h>
 
@@ -370,7 +369,8 @@ enum bch_compression_type {
        BCH_COMPRESSION_LZ4_OLD         = 1,
        BCH_COMPRESSION_GZIP            = 2,
        BCH_COMPRESSION_LZ4             = 3,
-       BCH_COMPRESSION_NR              = 4,
+       BCH_COMPRESSION_ZSTD            = 4,
+       BCH_COMPRESSION_NR              = 5,
 };
 
 enum bch_extent_entry_type {
@@ -1082,6 +1082,7 @@ LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ,    struct bch_sb, flags[1], 24, 28);
 enum bch_sb_features {
        BCH_FEATURE_LZ4                 = 0,
        BCH_FEATURE_GZIP                = 1,
+       BCH_FEATURE_ZSTD                = 2,
 };
 
 /* options: */
@@ -1109,11 +1110,17 @@ enum bch_str_hash_opts {
        BCH_STR_HASH_NR                 = 3,
 };
 
+#define BCH_COMPRESSION_TYPES()                \
+       x(NONE)                         \
+       x(LZ4)                          \
+       x(GZIP)                         \
+       x(ZSTD)
+
 enum bch_compression_opts {
-       BCH_COMPRESSION_OPT_NONE        = 0,
-       BCH_COMPRESSION_OPT_LZ4         = 1,
-       BCH_COMPRESSION_OPT_GZIP        = 2,
-       BCH_COMPRESSION_OPT_NR          = 3,
+#define x(t) BCH_COMPRESSION_OPT_##t,
+       BCH_COMPRESSION_TYPES()
+#undef x
+       BCH_COMPRESSION_OPT_NR
 };
 
 /*
@@ -1322,8 +1329,10 @@ struct btree_node {
        };
 } __attribute__((packed, aligned(8)));
 
-LE64_BITMASK(BTREE_NODE_ID,    struct btree_node, flags, 0, 4);
-LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8);
+LE64_BITMASK(BTREE_NODE_ID,    struct btree_node, flags,  0,  4);
+LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags,  4,  8);
+/* 8-32 unused */
+LE64_BITMASK(BTREE_NODE_SEQ,   struct btree_node, flags, 32, 64);
 
 struct btree_node_entry {
        struct bch_csum         csum;
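
BTREE_NODE_SEQ claims the previously unused bits 32..63 of the btree node flags word for a sequence number (bumped in btree_update_interior.c below each time a node is rewritten). LE64_BITMASK generates a getter/setter pair over the little-endian field; roughly the following, shown as a self-contained sketch where the one-field struct and the glibc endian helpers are stand-ins:

    #include <stdint.h>
    #include <stdio.h>
    #include <endian.h>     /* le64toh()/htole64(), glibc */

    struct btree_node { uint64_t flags; }; /* stand-in, not the real layout */

    static inline uint64_t BTREE_NODE_SEQ(const struct btree_node *k)
    {
            return (le64toh(k->flags) >> 32) & 0xffffffffULL;
    }

    static inline void SET_BTREE_NODE_SEQ(struct btree_node *k, uint64_t v)
    {
            uint64_t f = le64toh(k->flags);

            f = (f & 0xffffffffULL) | ((v & 0xffffffffULL) << 32);
            k->flags = htole64(f);
    }

    int main(void)
    {
            struct btree_node n = { 0 };

            SET_BTREE_NODE_SEQ(&n, BTREE_NODE_SEQ(&n) + 1);
            printf("seq %llu\n", (unsigned long long) BTREE_NODE_SEQ(&n));
            return 0;
    }
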
index 0bde449ec745fb73ff0d63ffe24e814e113d3ac8..7eae4d20a951239a8175c0ef599f0165b5c7ad47 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -373,19 +373,23 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
 {
        struct btree_cache *bc = &c->btree_cache;
        unsigned i;
-       int ret;
+       int ret = 0;
+
+       pr_verbose_init(c->opts, "");
 
        ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
        if (ret)
-               return ret;
+               goto out;
 
        bc->table_init_done = true;
 
        bch2_recalc_btree_reserve(c);
 
        for (i = 0; i < bc->reserve; i++)
-               if (!btree_node_mem_alloc(c, GFP_KERNEL))
-                       return -ENOMEM;
+               if (!btree_node_mem_alloc(c, GFP_KERNEL)) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
 
        list_splice_init(&bc->live, &bc->freeable);
 
@@ -393,12 +397,16 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
        mutex_init(&c->verify_lock);
 
        c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
-       if (!c->verify_ondisk)
-               return -ENOMEM;
+       if (!c->verify_ondisk) {
+               ret = -ENOMEM;
+               goto out;
+       }
 
        c->verify_data = btree_node_mem_alloc(c, GFP_KERNEL);
-       if (!c->verify_data)
-               return -ENOMEM;
+       if (!c->verify_data) {
+               ret = -ENOMEM;
+               goto out;
+       }
 
        list_del_init(&c->verify_data->list);
 #endif
@@ -408,8 +416,9 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
        bc->shrink.seeks                = 4;
        bc->shrink.batch                = btree_pages(c) * 2;
        register_shrinker(&bc->shrink);
-
-       return 0;
+out:
+       pr_verbose_init(c->opts, "ret %i", ret);
+       return ret;
 }
 
 void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
index 635086638ba841edbf32283e47aa62a7d3726dde..f2e9c10e4efecf1532721ce449eff3782199b14a 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -148,14 +148,13 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
 {
        enum bch_data_type data_type = type == BKEY_TYPE_BTREE
                ? BCH_DATA_BTREE : BCH_DATA_USER;
-       struct bch_devs_list devs = bch2_bkey_devs(k);
        int ret = 0;
 
        if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
-           fsck_err_on(!bch2_sb_has_replicas(c, data_type, devs), c,
+           fsck_err_on(!bch2_bkey_replicas_marked(c, data_type, k), c,
                        "superblock not marked as containing replicas (type %u)",
                        data_type)) {
-               ret = bch2_check_mark_super(c, data_type, devs);
+               ret = bch2_mark_bkey_replicas(c, data_type, k);
                if (ret)
                        return ret;
        }
index 9b4eff1c83d1eb4320a8b6a9074a7611ddafd5c3..d805fb41886be1edbdaebdb87167d05d6eeb4166 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1135,6 +1135,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
                unsigned sectors, whiteout_u64s = 0;
                struct nonce nonce;
                struct bch_csum csum;
+               bool first = !b->written;
 
                if (!b->written) {
                        i = &b->data->keys;
@@ -1194,10 +1195,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
                }
 
                if (ret) {
-                       btree_err_on(!b->written,
+                       btree_err_on(first,
                                     BTREE_ERR_FIXABLE, c, b, i,
                                     "first btree node bset has blacklisted journal seq");
-                       if (b->written)
+                       if (!first)
                                continue;
                }
 
index c45527a2fa866ea5f007bacf33ffa4323461aedf..0e0156d9016c733376082afaeabd463dfe6142e8 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -430,6 +430,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
        n->data->min_key        = b->data->min_key;
        n->data->max_key        = b->data->max_key;
        n->data->format         = format;
+       SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
 
        btree_node_set_format(n, format);
 
@@ -559,8 +560,8 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
                        goto err_free;
                }
 
-               ret = bch2_check_mark_super(c, BCH_DATA_BTREE,
-                                       bch2_bkey_devs(bkey_i_to_s_c(&b->key)));
+               ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
+                                             bkey_i_to_s_c(&b->key));
                if (ret)
                        goto err_free;
 
@@ -1225,6 +1226,7 @@ static struct btree *__btree_split_node(struct btree_update *as,
 
        n2->data->max_key       = n1->data->max_key;
        n2->data->format        = n1->format;
+       SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
        n2->key.k.p = n1->key.k.p;
 
        btree_node_set_format(n2, n2->data->format);
@@ -2019,8 +2021,8 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
                        goto err;
        }
 
-       ret = bch2_check_mark_super(c, BCH_DATA_BTREE,
-                                   bch2_extent_devs(extent_i_to_s_c(new_key)));
+       ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
+                                     extent_i_to_s_c(new_key).s_c);
        if (ret)
                goto err_free_update;
 
index 4b252b6d5e01d9cf28c1df019ec269820b5268ac..007aa5ef40910e89c6b764500e5d9e3055d0631b 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -272,15 +272,10 @@ static void multi_unlock_write(struct btree_insert *trans)
                        bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter);
 }
 
-static inline void btree_trans_sort(struct btree_insert *trans)
+static inline int btree_trans_cmp(struct btree_insert_entry l,
+                                 struct btree_insert_entry r)
 {
-       int i, end = trans->nr;
-
-       while (--end > 0)
-               for (i = 0; i < end; i++)
-                       if (btree_iter_cmp(trans->entries[i].iter,
-                                          trans->entries[i + 1].iter) > 0)
-                               swap(trans->entries[i], trans->entries[i + 1]);
+       return btree_iter_cmp(l.iter, r.iter);
 }
 
 /* Normal update interface: */
@@ -313,7 +308,7 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
                                         bkey_i_to_s_c(i->k)));
        }
 
-       btree_trans_sort(trans);
+       bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
 
        if (unlikely(!percpu_ref_tryget(&c->writes)))
                return -EROFS;
index 08755853883583902b5e35a2779d3b61691c0dbc..56bd99fd8b7144cf2f091499c7e266ae4923cb49 100644
--- a/libbcachefs/checksum.c
+++ b/libbcachefs/checksum.c
@@ -219,12 +219,16 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
                crypto_alloc_skcipher("chacha20", 0, 0);
        int ret;
 
-       if (!chacha20)
+       if (!chacha20) {
+               pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20));
                return PTR_ERR(chacha20);
+       }
 
        ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key));
-       if (ret)
+       if (ret) {
+               pr_err("crypto_skcipher_setkey() error: %i", ret);
                goto err;
+       }
 
        do_encrypt(chacha20, nonce, buf, len);
 err:
@@ -567,7 +571,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c,
 
        ret = bch2_request_key(c->disk_sb, &user_key);
        if (ret) {
-               bch_err(c, "error requesting encryption key");
+               bch_err(c, "error requesting encryption key: %i", ret);
                goto err;
        }
 
@@ -594,13 +598,19 @@ static int bch2_alloc_ciphers(struct bch_fs *c)
 {
        if (!c->chacha20)
                c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0);
-       if (IS_ERR(c->chacha20))
+       if (IS_ERR(c->chacha20)) {
+               bch_err(c, "error requesting chacha20 module: %li",
+                       PTR_ERR(c->chacha20));
                return PTR_ERR(c->chacha20);
+       }
 
        if (!c->poly1305)
                c->poly1305 = crypto_alloc_shash("poly1305", 0, 0);
-       if (IS_ERR(c->poly1305))
+       if (IS_ERR(c->poly1305)) {
+               bch_err(c, "error requesting poly1305 module: %li",
+                       PTR_ERR(c->poly1305));
                return PTR_ERR(c->poly1305);
+       }
 
        return 0;
 }
@@ -660,7 +670,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
        if (keyed) {
                ret = bch2_request_key(c->disk_sb, &user_key);
                if (ret) {
-                       bch_err(c, "error requesting encryption key");
+                       bch_err(c, "error requesting encryption key: %i", ret);
                        goto err;
                }
 
@@ -707,27 +717,35 @@ int bch2_fs_encryption_init(struct bch_fs *c)
 {
        struct bch_sb_field_crypt *crypt;
        struct bch_key key;
-       int ret;
+       int ret = 0;
+
+       pr_verbose_init(c->opts, "");
 
        c->sha256 = crypto_alloc_shash("sha256", 0, 0);
-       if (IS_ERR(c->sha256))
-               return PTR_ERR(c->sha256);
+       if (IS_ERR(c->sha256)) {
+               bch_err(c, "error requesting sha256 module");
+               ret = PTR_ERR(c->sha256);
+               goto out;
+       }
 
        crypt = bch2_sb_get_crypt(c->disk_sb);
        if (!crypt)
-               return 0;
+               goto out;
 
        ret = bch2_alloc_ciphers(c);
        if (ret)
-               return ret;
+               goto out;
 
        ret = bch2_decrypt_sb_key(c, crypt, &key);
        if (ret)
-               goto err;
+               goto out;
 
        ret = crypto_skcipher_setkey(c->chacha20,
                        (void *) &key.key, sizeof(key.key));
-err:
+       if (ret)
+               goto out;
+out:
        memzero_explicit(&key, sizeof(key));
+       pr_verbose_init(c->opts, "ret %i", ret);
        return ret;
 }
index b0c8a50e7c135000df5f0ce76a77feb63a95d65a..7862294bc03b55cb778483e396642c30f7e41d45 100644
--- a/libbcachefs/checksum.h
+++ b/libbcachefs/checksum.h
@@ -91,20 +91,11 @@ static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
        return bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
 }
 
-static inline enum bch_compression_type
-bch2_compression_opt_to_type(enum bch_compression_opts type)
-{
-       switch (type) {
-       case BCH_COMPRESSION_OPT_NONE:
-               return BCH_COMPRESSION_NONE;
-       case BCH_COMPRESSION_OPT_LZ4:
-               return BCH_COMPRESSION_LZ4;
-       case BCH_COMPRESSION_OPT_GZIP:
-               return BCH_COMPRESSION_GZIP;
-       default:
-            BUG();
-       }
-}
+static const unsigned bch2_compression_opt_to_type[] = {
+#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_##t,
+       BCH_COMPRESSION_TYPES()
+#undef x
+};
 
 static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
                                           unsigned type)
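
The switch statement is replaced by a lookup table generated from the BCH_COMPRESSION_TYPES() x-macro added in bcachefs_format.h, so the option enum and the option-to-on-disk-type mapping are expanded from one list and cannot drift apart when a codec (here, zstd) is added. A compilable illustration of the expansion; the numeric type codes are taken from the bch_compression_type enum above:

    #include <stdio.h>

    #define BCH_COMPRESSION_TYPES()         \
            x(NONE)                         \
            x(LZ4)                          \
            x(GZIP)                         \
            x(ZSTD)

    enum bch_compression_opts {
    #define x(t) BCH_COMPRESSION_OPT_##t,
            BCH_COMPRESSION_TYPES()
    #undef x
            BCH_COMPRESSION_OPT_NR
    };

    /* on-disk codes, per bcachefs_format.h: */
    enum { BCH_COMPRESSION_NONE = 0, BCH_COMPRESSION_GZIP = 2,
           BCH_COMPRESSION_LZ4 = 3, BCH_COMPRESSION_ZSTD = 4 };

    static const unsigned bch2_compression_opt_to_type[] = {
    #define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_##t,
            BCH_COMPRESSION_TYPES()
    #undef x
    };

    int main(void)
    {
            printf("%u\n",
                   bch2_compression_opt_to_type[BCH_COMPRESSION_OPT_ZSTD]);
            return 0;
    }
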
index 64079981d357d22805923849d6806fa69c7bed62..7726cfd8cfacaa06889281a7e2193c5422dc0ecf 100644
--- a/libbcachefs/compress.c
+++ b/libbcachefs/compress.c
@@ -8,6 +8,7 @@
 #include "lz4.h"
 #include <linux/lz4.h>
 #include <linux/zlib.h>
+#include <linux/zstd.h>
 
 /* Bounce buffer: */
 struct bbuf {
@@ -151,6 +152,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
        struct bbuf src_data = { NULL };
        size_t src_len = src->bi_iter.bi_size;
        size_t dst_len = crc.uncompressed_size << 9;
+       void *workspace;
        int ret;
 
        src_data = bio_map_or_bounce(c, src, READ);
@@ -159,57 +161,64 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
        case BCH_COMPRESSION_LZ4_OLD:
                ret = bch2_lz4_decompress(src_data.b, &src_len,
                                     dst_data, dst_len);
-               if (ret) {
-                       ret = -EIO;
+               if (ret)
                        goto err;
-               }
                break;
        case BCH_COMPRESSION_LZ4:
                ret = LZ4_decompress_safe_partial(src_data.b, dst_data,
                                                  src_len, dst_len, dst_len);
-               if (ret != dst_len) {
-                       ret = -EIO;
+               if (ret != dst_len)
                        goto err;
-               }
                break;
        case BCH_COMPRESSION_GZIP: {
-               void *workspace;
-               z_stream strm;
-
-               workspace = kmalloc(zlib_inflate_workspacesize(),
-                                   GFP_NOIO|__GFP_NOWARN);
-               if (!workspace) {
-                       mutex_lock(&c->zlib_workspace_lock);
-                       workspace = c->zlib_workspace;
-               }
+               z_stream strm = {
+                       .next_in        = src_data.b,
+                       .avail_in       = src_len,
+                       .next_out       = dst_data,
+                       .avail_out      = dst_len,
+               };
+
+               workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
 
-               strm.next_in    = src_data.b;
-               strm.avail_in   = src_len;
-               strm.next_out   = dst_data;
-               strm.avail_out  = dst_len;
                zlib_set_workspace(&strm, workspace);
                zlib_inflateInit2(&strm, -MAX_WBITS);
-
                ret = zlib_inflate(&strm, Z_FINISH);
 
-               if (workspace == c->zlib_workspace)
-                       mutex_unlock(&c->zlib_workspace_lock);
-               else
-                       kfree(workspace);
+               mempool_free(workspace, &c->decompress_workspace);
 
-               if (ret != Z_STREAM_END) {
-                       ret = -EIO;
+               if (ret != Z_STREAM_END)
+                       goto err;
+               break;
+       }
+       case BCH_COMPRESSION_ZSTD: {
+               ZSTD_DCtx *ctx;
+               size_t len;
+
+               workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
+               ctx = ZSTD_initDCtx(workspace, ZSTD_DCtxWorkspaceBound());
+
+               src_len = le32_to_cpup(src_data.b);
+
+               len = ZSTD_decompressDCtx(ctx,
+                               dst_data,       dst_len,
+                               src_data.b + 4, src_len);
+
+               mempool_free(workspace, &c->decompress_workspace);
+
+               if (len != dst_len)
                        goto err;
-               }
                break;
        }
        default:
                BUG();
        }
        ret = 0;
-err:
+out:
        bio_unmap_or_unbounce(c, src_data);
        return ret;
+err:
+       ret = -EIO;
+       goto out;
 }
 
 int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
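
The zstd case stores each compressed extent as a 4-byte little-endian length header followed by the zstd frame: extents are padded to block granularity, and ZSTD_decompressDCtx() needs the exact compressed size, so it is recorded explicitly. A userspace round-trip of that framing with plain libzstd (link with -lzstd; the kernel code above uses the workspace API instead):

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>
    #include <endian.h>
    #include <zstd.h>

    static size_t frame_compress(void *dst, size_t dst_len,
                                 const void *src, size_t src_len)
    {
            size_t len = ZSTD_compress((char *) dst + 4, dst_len - 4,
                                       src, src_len, 0);
            uint32_t hdr;

            if (ZSTD_isError(len))
                    return 0;               /* treat as incompressible */

            hdr = htole32((uint32_t) len);  /* the 4-byte length prefix */
            memcpy(dst, &hdr, 4);
            return len + 4;
    }

    static size_t frame_decompress(void *dst, size_t dst_len, const void *src)
    {
            uint32_t hdr;

            memcpy(&hdr, src, 4);           /* cf. le32_to_cpup() above */
            return ZSTD_decompress(dst, dst_len, (const char *) src + 4,
                                   le32toh(hdr));
    }

    int main(void)
    {
            char src[4096] = "hello bcachefs", dst[4096], out[4096];
            size_t c = frame_compress(dst, sizeof(dst), src, sizeof(src));
            size_t d = c ? frame_decompress(out, sizeof(out), dst) : 0;

            printf("compressed %zu, decompressed %zu\n", c, d);
            return 0;
    }
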
@@ -282,113 +291,129 @@ err:
        return ret;
 }
 
+static int attempt_compress(struct bch_fs *c,
+                           void *workspace,
+                           void *dst, size_t dst_len,
+                           void *src, size_t src_len,
+                           unsigned compression_type)
+{
+       switch (compression_type) {
+       case BCH_COMPRESSION_LZ4: {
+               int len = src_len;
+               int ret = LZ4_compress_destSize(
+                               src,            dst,
+                               &len,           dst_len,
+                               workspace);
+
+               if (len < src_len)
+                       return -len;
+
+               return ret;
+       }
+       case BCH_COMPRESSION_GZIP: {
+               z_stream strm = {
+                       .next_in        = src,
+                       .avail_in       = src_len,
+                       .next_out       = dst,
+                       .avail_out      = dst_len,
+               };
+
+               zlib_set_workspace(&strm, workspace);
+               zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+                                 Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
+                                 Z_DEFAULT_STRATEGY);
+
+               if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END)
+                       return 0;
+
+               if (zlib_deflateEnd(&strm) != Z_OK)
+                       return 0;
+
+               return strm.total_out;
+       }
+       case BCH_COMPRESSION_ZSTD: {
+               ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace,
+                       ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams));
+
+               size_t len = ZSTD_compressCCtx(ctx,
+                               dst + 4,        dst_len - 4,
+                               src,            src_len,
+                               c->zstd_params);
+               if (ZSTD_isError(len))
+                       return 0;
+
+               *((__le32 *) dst) = cpu_to_le32(len);
+               return len + 4;
+       }
+       default:
+               BUG();
+       }
+}
+
 static unsigned __bio_compress(struct bch_fs *c,
                               struct bio *dst, size_t *dst_len,
                               struct bio *src, size_t *src_len,
                               unsigned compression_type)
 {
        struct bbuf src_data = { NULL }, dst_data = { NULL };
+       void *workspace;
        unsigned pad;
        int ret = 0;
 
        /* If it's only one block, don't bother trying to compress: */
        if (bio_sectors(src) <= c->opts.block_size)
-               goto err;
+               return 0;
 
        dst_data = bio_map_or_bounce(c, dst, WRITE);
        src_data = bio_map_or_bounce(c, src, READ);
 
-       switch (compression_type) {
-       case BCH_COMPRESSION_LZ4_OLD:
-               compression_type = BCH_COMPRESSION_LZ4;
+       workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOIO);
 
-       case BCH_COMPRESSION_LZ4: {
-               void *workspace;
-               int len = src->bi_iter.bi_size;
-
-               workspace = mempool_alloc(&c->lz4_workspace_pool, GFP_NOIO);
-
-               while (1) {
-                       if (len <= block_bytes(c)) {
-                               ret = 0;
-                               break;
-                       }
-
-                       ret = LZ4_compress_destSize(
-                                       src_data.b,     dst_data.b,
-                                       &len,           dst->bi_iter.bi_size,
-                                       workspace);
-                       if (ret >= len) {
-                               /* uncompressible: */
-                               ret = 0;
-                               break;
-                       }
-
-                       if (!(len & (block_bytes(c) - 1)))
-                               break;
-                       len = round_down(len, block_bytes(c));
-               }
-               mempool_free(workspace, &c->lz4_workspace_pool);
+       *src_len = src->bi_iter.bi_size;
+       *dst_len = dst->bi_iter.bi_size;
 
-               if (!ret)
-                       goto err;
-
-               *src_len = len;
-               *dst_len = ret;
-               ret = 0;
-               break;
-       }
-       case BCH_COMPRESSION_GZIP: {
-               void *workspace;
-               z_stream strm;
-
-               workspace = kmalloc(zlib_deflate_workspacesize(MAX_WBITS,
-                                                              DEF_MEM_LEVEL),
-                                   GFP_NOIO|__GFP_NOWARN);
-               if (!workspace) {
-                       mutex_lock(&c->zlib_workspace_lock);
-                       workspace = c->zlib_workspace;
+       /*
+        * XXX: this algorithm sucks when the compression code doesn't tell us
+        * how much would fit, like LZ4 does:
+        */
+       while (1) {
+               if (*src_len <= block_bytes(c)) {
+                       ret = -1;
+                       break;
                }
 
-               strm.next_in    = src_data.b;
-               strm.avail_in   = min(src->bi_iter.bi_size,
-                                     dst->bi_iter.bi_size);
-               strm.next_out   = dst_data.b;
-               strm.avail_out  = dst->bi_iter.bi_size;
-               zlib_set_workspace(&strm, workspace);
-               zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
-                                 Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
-                                 Z_DEFAULT_STRATEGY);
-
-               ret = zlib_deflate(&strm, Z_FINISH);
-               if (ret != Z_STREAM_END) {
-                       ret = -EIO;
-                       goto zlib_err;
+               ret = attempt_compress(c, workspace,
+                                      dst_data.b,      *dst_len,
+                                      src_data.b,      *src_len,
+                                      compression_type);
+               if (ret > 0) {
+                       *dst_len = ret;
+                       ret = 0;
+                       break;
                }
 
-               ret = zlib_deflateEnd(&strm);
-               if (ret != Z_OK) {
-                       ret = -EIO;
-                       goto zlib_err;
+               /* Didn't fit: should we retry with a smaller amount?  */
+               if (*src_len <= *dst_len) {
+                       ret = -1;
+                       break;
                }
 
-               ret = 0;
-zlib_err:
-               if (workspace == c->zlib_workspace)
-                       mutex_unlock(&c->zlib_workspace_lock);
+               /*
+                * If ret is negative, it's a hint as to how much data would fit
+                */
+               BUG_ON(-ret >= *src_len);
+
+               if (ret < 0)
+                       *src_len = -ret;
                else
-                       kfree(workspace);
+                       *src_len -= (*src_len - *dst_len) / 2;
+               *src_len = round_down(*src_len, block_bytes(c));
+       }
 
-               if (ret)
-                       goto err;
+       mempool_free(workspace, &c->compress_workspace[compression_type]);
 
-               *dst_len = strm.total_out;
-               *src_len = strm.total_in;
-               break;
-       }
-       default:
-               BUG();
-       }
+       if (ret)
+               goto err;
 
        /* Didn't get smaller: */
        if (round_up(*dst_len, block_bytes(c)) >= *src_len)
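
attempt_compress() folds the three codecs behind one convention: a positive return is the compressed size, a negative return -n hints that only the first n input bytes would have fit, and 0 means "didn't fit, no hint". __bio_compress() then shrinks src_len (to the hint, or halfway toward dst_len) and retries until the result fits or the input drops to a single block. A toy model of that control flow; attempt() here is a fake compressor invented for illustration:

    #include <stdio.h>

    #define BLOCK 512

    /* fake codec: output is 3/4 of input; hints at an input that would fit */
    static int attempt(int src_len, int dst_len)
    {
            int out = src_len * 3 / 4;

            return out <= dst_len ? out : -(dst_len / 3 * 4);
    }

    int main(void)
    {
            int src_len = 8 * BLOCK, dst_len = 4 * BLOCK, ret;

            while (1) {
                    if (src_len <= BLOCK) { ret = -1; break; }

                    ret = attempt(src_len, dst_len);
                    if (ret > 0) { dst_len = ret; ret = 0; break; }

                    if (src_len <= dst_len) { ret = -1; break; }

                    /* use the hint, else halve the gap; round to blocks: */
                    src_len = ret < 0 ? -ret
                            : src_len - (src_len - dst_len) / 2;
                    src_len = src_len / BLOCK * BLOCK;
            }
            printf("ret %d: src_len %d dst_len %d\n", ret, src_len, dst_len);
            return 0;
    }
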
@@ -429,6 +454,9 @@ unsigned bch2_bio_compress(struct bch_fs *c,
        /* Don't generate a bigger output than input: */
        dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
 
+       if (compression_type == BCH_COMPRESSION_LZ4_OLD)
+               compression_type = BCH_COMPRESSION_LZ4;
+
        compression_type =
                __bio_compress(c, dst, dst_len, src, src_len, compression_type);
 
@@ -437,81 +465,147 @@ unsigned bch2_bio_compress(struct bch_fs *c,
        return compression_type;
 }
 
+#define BCH_FEATURE_NONE       0
+
+static const unsigned bch2_compression_opt_to_feature[] = {
+#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t,
+       BCH_COMPRESSION_TYPES()
+#undef x
+};
+
+#undef BCH_FEATURE_NONE
+
 /* doesn't write superblock: */
 int bch2_check_set_has_compressed_data(struct bch_fs *c,
                                      unsigned compression_type)
 {
-       switch (compression_type) {
-       case BCH_COMPRESSION_OPT_NONE:
-               return 0;
-       case BCH_COMPRESSION_OPT_LZ4:
-               if (bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4))
-                       return 0;
+       unsigned f;
+       int ret = 0;
 
-               bch2_sb_set_feature(c->disk_sb, BCH_FEATURE_LZ4);
-               break;
-       case BCH_COMPRESSION_OPT_GZIP:
-               if (bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
-                       return 0;
+       pr_verbose_init(c->opts, "");
 
-               bch2_sb_set_feature(c->disk_sb, BCH_FEATURE_GZIP);
-               break;
-       default:
-               BUG();
-       }
+       BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature));
+
+       if (!compression_type)
+               goto out;
 
-       return bch2_fs_compress_init(c);
+       f = bch2_compression_opt_to_feature[compression_type];
+       if (bch2_sb_test_feature(c->disk_sb, f))
+               goto out;
+
+       bch2_sb_set_feature(c->disk_sb, f);
+       ret = bch2_fs_compress_init(c);
+out:
+       pr_verbose_init(c->opts, "ret %i", ret);
+       return ret;
 }
 
 void bch2_fs_compress_exit(struct bch_fs *c)
 {
-       vfree(c->zlib_workspace);
-       mempool_exit(&c->lz4_workspace_pool);
+       unsigned i;
+
+       mempool_exit(&c->decompress_workspace);
+       for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++)
+               mempool_exit(&c->compress_workspace[i]);
        mempool_exit(&c->compression_bounce[WRITE]);
        mempool_exit(&c->compression_bounce[READ]);
 }
 
-#define COMPRESSION_WORKSPACE_SIZE                                     \
-       max_t(size_t, zlib_inflate_workspacesize(),                     \
-             zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL))
+static void *mempool_kvpmalloc(gfp_t gfp_mask, void *pool_data)
+{
+       size_t size = (size_t)pool_data;
+       return kvpmalloc(size, gfp_mask);
+}
+
+void mempool_kvpfree(void *element, void *pool_data)
+{
+       size_t size = (size_t)pool_data;
+       kvpfree(element, size);
+}
+
+static int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
+{
+       return !mempool_initialized(pool)
+               ? mempool_init(pool, min_nr, mempool_kvpmalloc,
+                              mempool_kvpfree, (void *) size)
+               : 0;
+}
 
 int bch2_fs_compress_init(struct bch_fs *c)
 {
-       unsigned order = get_order(c->sb.encoded_extent_max << 9);
-       int ret;
+       size_t max_extent = c->sb.encoded_extent_max << 9;
+       size_t order = get_order(max_extent);
+       size_t decompress_workspace_size = 0;
+       bool decompress_workspace_needed;
+       ZSTD_parameters params = ZSTD_getParams(0, max_extent, 0);
+       struct {
+               unsigned        feature;
+               unsigned        type;
+               size_t          compress_workspace;
+               size_t          decompress_workspace;
+       } compression_types[] = {
+               { BCH_FEATURE_LZ4, BCH_COMPRESSION_LZ4, LZ4_MEM_COMPRESS, 0 },
+               { BCH_FEATURE_GZIP, BCH_COMPRESSION_GZIP,
+                       zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
+                       zlib_inflate_workspacesize(), },
+               { BCH_FEATURE_ZSTD, BCH_COMPRESSION_ZSTD,
+                       ZSTD_CCtxWorkspaceBound(params.cParams),
+                       ZSTD_DCtxWorkspaceBound() },
+       }, *i;
+       int ret = 0;
 
-       if (!bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) &&
-           !bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
-               return 0;
+       pr_verbose_init(c->opts, "");
+
+       c->zstd_params = params;
+
+       for (i = compression_types;
+            i < compression_types + ARRAY_SIZE(compression_types);
+            i++)
+               if (bch2_sb_test_feature(c->disk_sb, i->feature))
+                       goto have_compressed;
+
+       goto out;
+have_compressed:
 
        if (!mempool_initialized(&c->compression_bounce[READ])) {
                ret = mempool_init_page_pool(&c->compression_bounce[READ],
                                             1, order);
                if (ret)
-                       return ret;
+                       goto out;
        }
 
        if (!mempool_initialized(&c->compression_bounce[WRITE])) {
                ret = mempool_init_page_pool(&c->compression_bounce[WRITE],
                                             1, order);
                if (ret)
-                       return ret;
+                       goto out;
        }
 
-       if (!mempool_initialized(&c->lz4_workspace_pool) &&
-           bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4)) {
-               ret = mempool_init_kmalloc_pool(&c->lz4_workspace_pool,
-                                               1, LZ4_MEM_COMPRESS);
-               if (ret)
-                       return ret;
-       }
+       for (i = compression_types;
+            i < compression_types + ARRAY_SIZE(compression_types);
+            i++) {
+               decompress_workspace_size =
+                       max(decompress_workspace_size, i->decompress_workspace);
 
-       if (!c->zlib_workspace &&
-           bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) {
-               c->zlib_workspace = vmalloc(COMPRESSION_WORKSPACE_SIZE);
-               if (!c->zlib_workspace)
-                       return -ENOMEM;
+               if (!bch2_sb_test_feature(c->disk_sb, i->feature))
+                       continue;
+
+               if (i->decompress_workspace)
+                       decompress_workspace_needed = true;
+
+               ret = mempool_init_kvpmalloc_pool(
+                               &c->compress_workspace[i->type],
+                               1, i->compress_workspace);
+               if (ret)
+                       goto out;
        }
 
-       return 0;
+       ret = mempool_init_kmalloc_pool(
+                       &c->decompress_workspace,
+                       1, decompress_workspace_size);
+       if (ret)
+               goto out;
+out:
+       pr_verbose_init(c->opts, "ret %i", ret);
+       return ret;
 }
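
Workspace management also changes shape: instead of one mutex-guarded zlib buffer with a kmalloc fallback, every enabled codec gets a mempool of worst-case compression workspaces (kvpmalloc-backed, so the large zstd workspace may fall back to vmalloc), and one shared decompress pool is sized for the largest decompressor. The userspace analogue of the ZSTD_CCtxWorkspaceBound()/ZSTD_initCCtx() pairing is libzstd's static-allocation API, sketched here under the same ZSTD_STATIC_LINKING_ONLY assumption; size and level are illustrative:

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            size_t max_extent = 128 << 10; /* stand-in for encoded_extent_max */
            ZSTD_parameters params = ZSTD_getParams(0, max_extent, 0);

            /* worst-case workspace for these parameters, allocated once: */
            size_t bound = ZSTD_estimateCCtxSize_usingCParams(params.cParams);
            void *workspace = malloc(bound);
            ZSTD_CCtx *ctx = workspace
                    ? ZSTD_initStaticCCtx(workspace, bound) : NULL;

            printf("cctx workspace: %zu bytes, %s\n", bound,
                   ctx ? "init ok" : "init failed");
            free(workspace);
            return 0;
    }
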
index f5dccfad15d6572d4470e859a023e9d4a59cf4de..ce1f8ba230356b532f273074e95bb215bf823cb9 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -694,7 +694,7 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
                        goto err;
        }
 
-       if (!bch2_sb_has_replicas(c, BCH_DATA_BTREE, bch2_extent_devs(e))) {
+       if (!bch2_bkey_replicas_marked(c, BCH_DATA_BTREE, e.s_c)) {
                bch2_bkey_val_to_text(c, btree_node_type(b),
                                     buf, sizeof(buf), k);
                bch2_fs_bug(c,
@@ -1834,7 +1834,7 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
        }
 
        if (!bkey_extent_is_cached(e.k) &&
-           !bch2_sb_has_replicas(c, BCH_DATA_USER, bch2_extent_devs(e))) {
+           !bch2_bkey_replicas_marked(c, BCH_DATA_USER, e.s_c)) {
                bch2_bkey_val_to_text(c, btree_node_type(b),
                                     buf, sizeof(buf), e.s_c);
                bch2_fs_bug(c,
@@ -2013,17 +2013,18 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
 }
 
 void bch2_extent_mark_replicas_cached(struct bch_fs *c,
-                                     struct bkey_s_extent e)
+                                     struct bkey_s_extent e,
+                                     unsigned nr_desired_replicas)
 {
        struct bch_extent_ptr *ptr;
        unsigned tier = 0, nr_cached = 0;
        unsigned nr_good = bch2_extent_nr_good_ptrs(c, e.c);
        bool have_higher_tier;
 
-       if (nr_good <= c->opts.data_replicas)
+       if (nr_good <= nr_desired_replicas)
                return;
 
-       nr_cached = nr_good - c->opts.data_replicas;
+       nr_cached = nr_good - nr_desired_replicas;
 
        do {
                have_higher_tier = false;
index e8f54f2e9acb89ff17a795ef11f853e713268520..75579273fae8ea159bf3eea7570a545752cb9050 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -38,7 +38,8 @@ bch2_insert_fixup_extent(struct btree_insert *,
                        struct btree_insert_entry *);
 
 bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
-void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent);
+void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent,
+                                     unsigned);
 
 const struct bch_extent_ptr *
 bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
@@ -430,6 +431,18 @@ static inline struct bch_devs_list bch2_extent_dirty_devs(struct bkey_s_c_extent
        return ret;
 }
 
+static inline struct bch_devs_list bch2_extent_cached_devs(struct bkey_s_c_extent e)
+{
+       struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+       const struct bch_extent_ptr *ptr;
+
+       extent_for_each_ptr(e, ptr)
+               if (ptr->cached)
+                       ret.devs[ret.nr++] = ptr->dev;
+
+       return ret;
+}
+
 static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
 {
        switch (k.k->type) {
@@ -441,6 +454,28 @@ static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
        }
 }
 
+static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
+{
+       switch (k.k->type) {
+       case BCH_EXTENT:
+       case BCH_EXTENT_CACHED:
+               return bch2_extent_dirty_devs(bkey_s_c_to_extent(k));
+       default:
+               return (struct bch_devs_list) { .nr = 0 };
+       }
+}
+
+static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
+{
+       switch (k.k->type) {
+       case BCH_EXTENT:
+       case BCH_EXTENT_CACHED:
+               return bch2_extent_cached_devs(bkey_s_c_to_extent(k));
+       default:
+               return (struct bch_devs_list) { .nr = 0 };
+       }
+}
+
 bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent,
                                 struct bch_extent_crc_unpacked);
 bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked);
index 1bffddf6ce519dcad1d8ad34b97715265ffafd81..00475b99dff83a50ab221c737321c3a77dca61da 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -452,14 +452,18 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
 
                        ret = bch2_btree_insert_at(wop->c, &wop->res,
                                        &hook.hook, op_journal_seq(wop),
-                                       BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC,
+                                       BTREE_INSERT_NOFAIL|
+                                       BTREE_INSERT_ATOMIC|
+                                       BTREE_INSERT_USE_RESERVE,
                                        BTREE_INSERT_ENTRY(&extent_iter, k),
                                        BTREE_INSERT_ENTRY_EXTRA_RES(&inode_iter,
                                                        &hook.inode_p.inode.k_i, 2));
                } else {
                        ret = bch2_btree_insert_at(wop->c, &wop->res,
                                        &hook.hook, op_journal_seq(wop),
-                                       BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC,
+                                       BTREE_INSERT_NOFAIL|
+                                       BTREE_INSERT_ATOMIC|
+                                       BTREE_INSERT_USE_RESERVE,
                                        BTREE_INSERT_ENTRY(&extent_iter, k));
                }
 
@@ -502,7 +506,7 @@ static inline void bch2_fswrite_op_init(struct bchfs_write_op *op,
 
        bch2_write_op_init(&op->op, c);
        op->op.csum_type        = bch2_data_checksum_type(c, opts.data_checksum);
-       op->op.compression_type = bch2_compression_opt_to_type(opts.compression);
+       op->op.compression_type = bch2_compression_opt_to_type[opts.compression];
        op->op.devs             = c->fastest_devs;
        op->op.index_update_fn  = bchfs_write_index_update;
        op_journal_seq_set(&op->op, &inode->ei_journal_seq);
@@ -2692,6 +2696,10 @@ void bch2_fs_fsio_exit(struct bch_fs *c)
 
 int bch2_fs_fsio_init(struct bch_fs *c)
 {
+       int ret = 0;
+
+       pr_verbose_init(c->opts, "");
+
        if (bioset_init(&c->writepage_bioset,
                        4, offsetof(struct bch_writepage_io, op.op.wbio.bio),
                        BIOSET_NEED_BVECS) ||
@@ -2701,9 +2709,10 @@ int bch2_fs_fsio_init(struct bch_fs *c)
            bioset_init(&c->dio_write_bioset,
                        4, offsetof(struct dio_write, iop.op.wbio.bio),
                        BIOSET_NEED_BVECS))
-               return -ENOMEM;
+               ret = -ENOMEM;
 
-       return 0;
+       pr_verbose_init(c->opts, "ret %i", ret);
+       return ret;
 }
 
 #endif /* NO_BCACHEFS_FS */
index 7cddbccd1938ef4daeaaa2bda3d53e54ccf28faf..13495d487a686b5b4db16e632ea98a773adce5ef 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -209,17 +209,6 @@ static void bch2_write_done(struct closure *cl)
        closure_return(cl);
 }
 
-static u64 keylist_sectors(struct keylist *keys)
-{
-       struct bkey_i *k;
-       u64 ret = 0;
-
-       for_each_keylist_key(keys, k)
-               ret += k->k.size;
-
-       return ret;
-}
-
 int bch2_write_index_default(struct bch_write_op *op)
 {
        struct keylist *keys = &op->insert_keys;
@@ -232,7 +221,8 @@ int bch2_write_index_default(struct bch_write_op *op)
 
        ret = bch2_btree_insert_list_at(&iter, keys, &op->res,
                                       NULL, op_journal_seq(op),
-                                      BTREE_INSERT_NOFAIL);
+                                      BTREE_INSERT_NOFAIL|
+                                      BTREE_INSERT_USE_RESERVE);
        bch2_btree_iter_unlock(&iter);
 
        return ret;
@@ -268,8 +258,7 @@ static void bch2_write_index(struct closure *cl)
                }
 
                if (!(op->flags & BCH_WRITE_NOMARK_REPLICAS)) {
-                       ret = bch2_check_mark_super(c, BCH_DATA_USER,
-                                                   bch2_extent_devs(e.c));
+                       ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, e.s_c);
                        if (ret)
                                goto err;
                }
@@ -910,18 +899,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
        swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
        rbio->promote = NULL;
 
-       bch2_write_op_init(&op->write.op, c);
-       op->write.op.csum_type = bch2_data_checksum_type(c, rbio->opts.data_checksum);
-       op->write.op.compression_type =
-               bch2_compression_opt_to_type(rbio->opts.compression);
-
-       op->write.move_dev      = -1;
-       op->write.op.devs       = c->fastest_devs;
-       op->write.op.write_point = writepoint_hashed((unsigned long) current);
-       op->write.op.flags      |= BCH_WRITE_ALLOC_NOWAIT;
-       op->write.op.flags      |= BCH_WRITE_CACHED;
-
-       bch2_migrate_write_init(&op->write, rbio);
+       bch2_migrate_read_done(&op->write, rbio);
 
        closure_init(cl, NULL);
        closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
@@ -932,13 +910,16 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
  * XXX: multiple promotes can race with each other, wastefully. Keep a list of
  * outstanding promotes?
  */
-static struct promote_op *promote_alloc(struct bch_read_bio *rbio)
+static struct promote_op *promote_alloc(struct bch_read_bio *rbio,
+                                       struct bkey_s_c k)
 {
+       struct bch_fs *c = rbio->c;
        struct promote_op *op;
        struct bio *bio;
        /* data might have to be decompressed in the write path: */
        unsigned pages = DIV_ROUND_UP(rbio->pick.crc.uncompressed_size,
                                      PAGE_SECTORS);
+       int ret;
 
        BUG_ON(!rbio->bounce);
        BUG_ON(pages < rbio->bio.bi_vcnt);
@@ -954,6 +935,14 @@ static struct promote_op *promote_alloc(struct bch_read_bio *rbio)
        memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
               sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
 
+       ret = bch2_migrate_write_init(c, &op->write, c->fastest_devs,
+                                     writepoint_hashed((unsigned long) current),
+                                     rbio->opts,
+                                     DATA_PROMOTE,
+                                     (struct data_opts) { 0 },
+                                     k);
+       BUG_ON(ret);
+
        return op;
 }
 
@@ -1407,7 +1396,7 @@ noclone:
        rbio->pick              = *pick;
        rbio->pos               = pos;
        rbio->version           = e.k->version;
-       rbio->promote           = promote ? promote_alloc(rbio) : NULL;
+       rbio->promote           = promote ? promote_alloc(rbio, e.s_c) : NULL;
        INIT_WORK(&rbio->work, NULL);
 
        bio_set_dev(&rbio->bio, pick->ca->disk_sb.bdev);
index 71eee4f6896b8ccfee8f3749663017cc9b562b83..4208fd4385bf058dc63fb2eb6de2c12620254c72 100644
--- a/libbcachefs/io.h
+++ b/libbcachefs/io.h
@@ -70,7 +70,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c)
        op->error               = 0;
        op->csum_type           = bch2_data_checksum_type(c, c->opts.data_checksum);
        op->compression_type    =
-               bch2_compression_opt_to_type(c->opts.compression);
+               bch2_compression_opt_to_type[c->opts.compression];
        op->nr_replicas         = 0;
        op->nr_replicas_required = c->opts.data_replicas_required;
        op->alloc_reserve       = RESERVE_NONE;
index a1e45625704afd42509f8f3a20c8132554b5dbf0..8ce1745233e53e1ce3f8eb3a843c195436e35dc6 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -1046,12 +1046,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 
                if (!degraded &&
                    (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
-                    fsck_err_on(!bch2_sb_has_replicas(c, BCH_DATA_JOURNAL,
+                    fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL,
                                                       i->devs), c,
                                 "superblock not marked as containing replicas (type %u)",
                                 BCH_DATA_JOURNAL))) {
-                       ret = bch2_check_mark_super(c, BCH_DATA_JOURNAL,
-                                                   i->devs);
+                       ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs);
                        if (ret)
                                return ret;
                }
@@ -2232,7 +2231,7 @@ static void journal_write_done(struct closure *cl)
                goto err;
        }
 
-       if (bch2_check_mark_super(c, BCH_DATA_JOURNAL, devs))
+       if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs))
                goto err;
 out:
        __bch2_time_stats_update(j->write_time, j->write_start_time);
@@ -2851,7 +2850,7 @@ int bch2_journal_flush_device(struct journal *j, int dev_idx)
                seq++;
 
                spin_unlock(&j->lock);
-               ret = bch2_check_mark_super(c, BCH_DATA_JOURNAL, devs);
+               ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
                spin_lock(&j->lock);
        }
        spin_unlock(&j->lock);
@@ -2946,7 +2945,11 @@ void bch2_fs_journal_exit(struct journal *j)
 
 int bch2_fs_journal_init(struct journal *j)
 {
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
        static struct lock_class_key res_key;
+       int ret = 0;
+
+       pr_verbose_init(c->opts, "");
 
        spin_lock_init(&j->lock);
        spin_lock_init(&j->err_lock);
@@ -2972,12 +2975,15 @@ int bch2_fs_journal_init(struct journal *j)
 
        if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
            !(j->buf[0].data = kvpmalloc(j->buf[0].size, GFP_KERNEL)) ||
-           !(j->buf[1].data = kvpmalloc(j->buf[1].size, GFP_KERNEL)))
-               return -ENOMEM;
+           !(j->buf[1].data = kvpmalloc(j->buf[1].size, GFP_KERNEL))) {
+               ret = -ENOMEM;
+               goto out;
+       }
 
        j->pin.front = j->pin.back = 1;
-
-       return 0;
+out:
+       pr_verbose_init(c->opts, "ret %i", ret);
+       return ret;
 }
 
 /* debug: */
index b7c8a861e537e4ec2b247c27c079f7d68c6378f4..a8c8883ba0719ae736bcb17370de00d7d18322df 100644
--- a/libbcachefs/keylist.h
+++ b/libbcachefs/keylist.h
@@ -58,6 +58,17 @@ static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
 #define keylist_single(k)                                      \
        ((struct keylist) { .keys = k, .top = bkey_next(k) })
 
+static inline u64 keylist_sectors(struct keylist *keys)
+{
+       struct bkey_i *k;
+       u64 ret = 0;
+
+       for_each_keylist_key(keys, k)
+               ret += k->k.size;
+
+       return ret;
+}
+
 #ifdef CONFIG_BCACHEFS_DEBUG
 void bch2_verify_keylist_sorted(struct keylist *);
 #else
index 9c2920cff61cb874b08b23af2e80d63ad878e644..9200ed9f591cf880e373c2b54e8d50e96d2962a5 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
 #include "move.h"
 #include "super-io.h"
 
-static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
-{
-       struct bch_dev *ca = arg;
-
-       return bch2_extent_has_device(e, ca->dev_idx);
-}
-
-#define MAX_DATA_OFF_ITER      10
-
-static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca,
-                                   int flags)
-{
-       struct btree_iter iter;
-       struct bkey_s_c k;
-       struct bch_move_stats stats;
-       unsigned pass = 0;
-       int ret = 0;
-
-       if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_USER)))
-               return 0;
-
-       /*
-        * XXX: we should be able to do this in one pass, but bch2_move_data()
-        * can spuriously fail to move an extent due to racing with other move
-        * operations
-        */
-       do {
-               memset(&stats, 0, sizeof(stats));
-
-               ret = bch2_move_data(c, NULL,
-                                    SECTORS_IN_FLIGHT_PER_DEVICE,
-                                    NULL,
-                                    writepoint_hashed((unsigned long) current),
-                                    0,
-                                    ca->dev_idx,
-                                    POS_MIN, POS_MAX,
-                                    migrate_pred, ca,
-                                    &stats);
-               if (ret) {
-                       bch_err(c, "error migrating data: %i", ret);
-                       return ret;
-               }
-       } while (atomic64_read(&stats.keys_moved) && pass++ < MAX_DATA_OFF_ITER);
-
-       if (atomic64_read(&stats.keys_moved)) {
-               bch_err(c, "unable to migrate all data in %d iterations",
-                       MAX_DATA_OFF_ITER);
-               return -1;
-       }
-
-       mutex_lock(&c->replicas_gc_lock);
-       bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
-
-       for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH, k) {
-               ret = bch2_check_mark_super(c, BCH_DATA_USER, bch2_bkey_devs(k));
-               if (ret) {
-                       bch_err(c, "error migrating data %i from check_mark_super()", ret);
-                       break;
-               }
-       }
-
-       bch2_replicas_gc_end(c, ret);
-       mutex_unlock(&c->replicas_gc_lock);
-       return ret;
-}
-
-static int bch2_dev_metadata_migrate(struct bch_fs *c, struct bch_dev *ca,
-                                    int flags)
-{
-       struct btree_iter iter;
-       struct btree *b;
-       int ret = 0;
-       unsigned id;
-
-       if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_BTREE)))
-               return 0;
-
-       mutex_lock(&c->replicas_gc_lock);
-       bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
-
-       for (id = 0; id < BTREE_ID_NR; id++) {
-               for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
-                       struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
-
-                       if (!bch2_extent_has_device(e, ca->dev_idx))
-                               continue;
-
-                       ret = bch2_btree_node_rewrite(c, &iter, b->data->keys.seq, 0);
-                       if (ret) {
-                               bch2_btree_iter_unlock(&iter);
-                               goto err;
-                       }
-               }
-               ret = bch2_btree_iter_unlock(&iter);
-               if (ret)
-                       goto err;
-       }
-err:
-       bch2_replicas_gc_end(c, ret);
-       mutex_unlock(&c->replicas_gc_lock);
-       return ret;
-}
-
-int bch2_dev_data_migrate(struct bch_fs *c, struct bch_dev *ca, int flags)
-{
-       BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW &&
-              bch2_dev_is_online(ca));
-
-       return bch2_dev_usrdata_migrate(c, ca, flags) ?:
-               bch2_dev_metadata_migrate(c, ca, flags);
-}
-
 static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
                         unsigned dev_idx, int flags, bool metadata)
 {
@@ -152,7 +40,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
        int ret = 0;
 
        mutex_lock(&c->replicas_gc_lock);
-       bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
+       bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED));
 
        bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
                             POS_MIN, BTREE_ITER_PREFETCH);
@@ -161,8 +49,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
               !(ret = btree_iter_err(k))) {
                if (!bkey_extent_is_data(k.k) ||
                    !bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) {
-                       ret = bch2_check_mark_super(c, BCH_DATA_USER,
-                                                   bch2_bkey_devs(k));
+                       ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k);
                        if (ret)
                                break;
                        bch2_btree_iter_next(&iter);
@@ -183,8 +70,8 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
                 */
                bch2_extent_normalize(c, e.s);
 
-               ret = bch2_check_mark_super(c, BCH_DATA_USER,
-                               bch2_bkey_devs(bkey_i_to_s_c(&tmp.key)));
+               ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER,
+                                             bkey_i_to_s_c(&tmp.key));
                if (ret)
                        break;
 
@@ -240,8 +127,8 @@ retry:
                                                    dev_idx)) {
                                bch2_btree_iter_set_locks_want(&iter, 0);
 
-                               ret = bch2_check_mark_super(c, BCH_DATA_BTREE,
-                                               bch2_bkey_devs(bkey_i_to_s_c(&b->key)));
+                               ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
+                                                             bkey_i_to_s_c(&b->key));
                                if (ret)
                                        goto err;
                        } else {
index 6db7b9111bf27806474920cba1cabb11ae37bda8..de2faab24e1159eddcb8172be21b4c570bd0980e 100644 (file)
@@ -1,7 +1,6 @@
 #ifndef _BCACHEFS_MIGRATE_H
 #define _BCACHEFS_MIGRATE_H
 
-int bch2_dev_data_migrate(struct bch_fs *, struct bch_dev *, int);
 int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
 
 #endif /* _BCACHEFS_MIGRATE_H */
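
With bch2_dev_data_migrate() gone, evacuating a device now goes through the generic data-job path: bch2_data_job() drives bch2_move_data() with migrate_pred (see move.c below). A hedged sketch of a caller — the .op field name and the argument order are assumed from the BCH_DATA_OP_MIGRATE case, not visible in this diff:

	struct bch_move_stats stats;
	struct bch_ioctl_data op = {
		.op		= BCH_DATA_OP_MIGRATE,
		.start		= POS_MIN,
		.end		= POS_MAX,
		.migrate.dev	= ca->dev_idx,
	};

	memset(&stats, 0, sizeof(stats));
	ret = bch2_data_job(c, &stats, op);
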
index e5a46ba6d03fbf1236eb7b4a19834af94ce173c9..a176484ae91d00d8c01936fe1169a91fe3fbb5b7 100644 (file)
@@ -58,6 +58,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                BKEY_PADDED(k) _new, _insert;
                struct bch_extent_ptr *ptr;
                struct bch_extent_crc_unpacked crc;
+               unsigned nr_dirty;
                bool did_work = false;
 
                if (btree_iter_err(k)) {
@@ -71,6 +72,11 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                                             m->ptr, m->offset))
                        goto nomatch;
 
+               if (m->data_cmd == DATA_REWRITE &&
+                   !bch2_extent_has_device(bkey_s_c_to_extent(k),
+                                           m->data_opts.rewrite_dev))
+                       goto nomatch;
+
                bkey_reassemble(&_insert.k, k);
                insert = bkey_i_to_extent(&_insert.k);
 
@@ -81,11 +87,12 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                bch2_cut_back(new->k.p, &insert->k);
                bch2_cut_back(insert->k.p, &new->k);
 
-               if (m->move_dev >= 0 &&
-                   (ptr = (struct bch_extent_ptr *)
-                    bch2_extent_has_device(extent_i_to_s_c(insert),
-                                           m->move_dev)))
+               if (m->data_cmd == DATA_REWRITE) {
+                       ptr = (struct bch_extent_ptr *)
+                               bch2_extent_has_device(extent_i_to_s_c(insert),
+                                                      m->data_opts.rewrite_dev);
                        bch2_extent_drop_ptr(extent_i_to_s(insert), ptr);
+               }
 
                extent_for_each_ptr_crc(extent_i_to_s(new), ptr, crc) {
                        if (bch2_extent_has_device(extent_i_to_s_c(insert), ptr->dev)) {
@@ -108,10 +115,35 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                bch2_extent_narrow_crcs(insert,
                                (struct bch_extent_crc_unpacked) { 0 });
                bch2_extent_normalize(c, extent_i_to_s(insert).s);
-               bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert));
+               bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert),
+                                                c->opts.data_replicas);
+
+               /*
+                * It's possible we race, and for whatever reason the extent now
+                * has fewer replicas than when we last looked at it - meaning
+                * we need to get a disk reservation here:
+                */
+               nr_dirty = bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i));
+               if (m->nr_ptrs_reserved < nr_dirty) {
+                       unsigned sectors = (nr_dirty - m->nr_ptrs_reserved) *
+                                       keylist_sectors(keys);
+
+                       /*
+                        * can't call bch2_disk_reservation_add() with btree
+                        * locks held, at least not without a song and dance
+                        */
+                       bch2_btree_iter_unlock(&iter);
+
+                       ret = bch2_disk_reservation_add(c, &op->res, sectors, 0);
+                       if (ret)
+                               goto out;
+
+                       m->nr_ptrs_reserved = nr_dirty;
+                       goto next;
+               }
 
-               ret = bch2_check_mark_super(c, BCH_DATA_USER,
-                               bch2_extent_devs(extent_i_to_s_c(insert)));
+               ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER,
+                                             extent_i_to_s_c(insert).s_c);
                if (ret)
                        break;
 
@@ -119,7 +151,8 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                                NULL, op_journal_seq(op),
                                BTREE_INSERT_ATOMIC|
                                BTREE_INSERT_NOFAIL|
-                               m->btree_insert_flags,
+                               BTREE_INSERT_USE_RESERVE|
+                               m->data_opts.btree_insert_flags,
                                BTREE_INSERT_ENTRY(&iter, &insert->k_i));
                if (!ret)
                        atomic_long_inc(&c->extent_migrate_done);
@@ -150,8 +183,7 @@ out:
        return ret;
 }
 
-void bch2_migrate_write_init(struct migrate_write *m,
-                            struct bch_read_bio *rbio)
+void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio)
 {
        /* write bio must own pages: */
        BUG_ON(!m->op.wbio.bio.bi_vcnt);
@@ -162,16 +194,39 @@ void bch2_migrate_write_init(struct migrate_write *m,
        m->op.pos       = rbio->pos;
        m->op.version   = rbio->version;
        m->op.crc       = rbio->pick.crc;
+       m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
 
        if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) {
                m->op.nonce     = m->op.crc.nonce + m->op.crc.offset;
                m->op.csum_type = m->op.crc.csum_type;
        }
 
-       if (m->move_dev >= 0)
-               bch2_dev_list_drop_dev(&m->op.devs_have, m->move_dev);
+       if (m->data_cmd == DATA_REWRITE)
+               bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev);
+}
+
+int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
+                           struct bch_devs_mask *devs,
+                           struct write_point_specifier wp,
+                           struct bch_io_opts io_opts,
+                           enum data_cmd data_cmd,
+                           struct data_opts data_opts,
+                           struct bkey_s_c k)
+{
+       int ret;
 
-       if (m->btree_insert_flags & BTREE_INSERT_USE_RESERVE)
+       m->data_cmd     = data_cmd;
+       m->data_opts    = data_opts;
+       m->nr_ptrs_reserved = bch2_extent_nr_dirty_ptrs(k);
+
+       bch2_write_op_init(&m->op, c);
+       m->op.csum_type = bch2_data_checksum_type(c, io_opts.data_checksum);
+       m->op.compression_type =
+               bch2_compression_opt_to_type[io_opts.compression];
+       m->op.devs      = devs;
+       m->op.write_point = wp;
+
+       if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE)
                m->op.alloc_reserve = RESERVE_MOVINGGC;
 
        m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS|
@@ -180,10 +235,35 @@ void bch2_migrate_write_init(struct migrate_write *m,
                BCH_WRITE_DATA_ENCODED|
                BCH_WRITE_NOMARK_REPLICAS;
 
-       m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
        m->op.nr_replicas       = 1;
        m->op.nr_replicas_required = 1;
        m->op.index_update_fn   = bch2_migrate_index_update;
+
+       switch (data_cmd) {
+       case DATA_ADD_REPLICAS:
+               if (m->nr_ptrs_reserved < c->opts.data_replicas) {
+                       m->op.nr_replicas = c->opts.data_replicas - m->nr_ptrs_reserved;
+
+                       ret = bch2_disk_reservation_get(c, &m->op.res,
+                                                       k.k->size,
+                                                       m->op.nr_replicas, 0);
+                       if (ret)
+                               return ret;
+
+                       m->nr_ptrs_reserved = c->opts.data_replicas;
+               }
+               break;
+       case DATA_REWRITE:
+               break;
+       case DATA_PROMOTE:
+               m->op.flags     |= BCH_WRITE_ALLOC_NOWAIT;
+               m->op.flags     |= BCH_WRITE_CACHED;
+               break;
+       default:
+               BUG();
+       }
+
+       return 0;
 }
 
 static void move_free(struct closure *cl)
@@ -210,7 +290,7 @@ static void move_write(struct closure *cl)
        struct moving_io *io = container_of(cl, struct moving_io, cl);
 
        if (likely(!io->rbio.bio.bi_status)) {
-               bch2_migrate_write_init(&io->write, &io->rbio);
+               bch2_migrate_read_done(&io->write, &io->rbio);
                closure_call(&io->write.op.cl, bch2_write, NULL, cl);
        }
 
@@ -238,19 +318,19 @@ static void move_read_endio(struct bio *bio)
 }
 
 static int bch2_move_extent(struct bch_fs *c,
-                         struct moving_context *ctxt,
-                         struct bch_devs_mask *devs,
-                         struct write_point_specifier wp,
-                         int btree_insert_flags,
-                         int move_device,
-                         struct bch_io_opts opts,
-                         struct bkey_s_c_extent e)
+                           struct moving_context *ctxt,
+                           struct bch_devs_mask *devs,
+                           struct write_point_specifier wp,
+                           struct bch_io_opts io_opts,
+                           struct bkey_s_c_extent e,
+                           enum data_cmd data_cmd,
+                           struct data_opts data_opts)
 {
        struct extent_pick_ptr pick;
        struct moving_io *io;
        const struct bch_extent_ptr *ptr;
        struct bch_extent_crc_unpacked crc;
-       unsigned sectors = e.k->size, pages, nr_good;
+       unsigned sectors = e.k->size, pages;
        int ret = -ENOMEM;
 
        bch2_extent_pick_ptr(c, e.s_c, NULL, &pick);
@@ -279,7 +359,7 @@ static int bch2_move_extent(struct bch_fs *c,
        if (bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL))
                goto err_free;
 
-       io->rbio.opts = opts;
+       io->rbio.opts = io_opts;
        bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
        bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
        io->rbio.bio.bi_iter.bi_size = sectors << 9;
@@ -288,27 +368,10 @@ static int bch2_move_extent(struct bch_fs *c,
        io->rbio.bio.bi_iter.bi_sector  = bkey_start_offset(e.k);
        io->rbio.bio.bi_end_io          = move_read_endio;
 
-       io->write.btree_insert_flags = btree_insert_flags;
-       io->write.move_dev      = move_device;
-
-       bch2_write_op_init(&io->write.op, c);
-       io->write.op.csum_type = bch2_data_checksum_type(c, opts.data_checksum);
-       io->write.op.compression_type =
-               bch2_compression_opt_to_type(opts.compression);
-       io->write.op.devs       = devs;
-       io->write.op.write_point = wp;
-
-       if (move_device < 0 &&
-           ((nr_good = bch2_extent_nr_good_ptrs(c, e)) <
-            c->opts.data_replicas)) {
-               io->write.op.nr_replicas = c->opts.data_replicas - nr_good;
-
-               ret = bch2_disk_reservation_get(c, &io->write.op.res,
-                                               e.k->size,
-                                               io->write.op.nr_replicas, 0);
-               if (ret)
-                       goto err_free_pages;
-       }
+       ret = bch2_migrate_write_init(c, &io->write, devs, wp,
+                                     io_opts, data_cmd, data_opts, e.s_c);
+       if (ret)
+               goto err_free_pages;
 
        atomic64_inc(&ctxt->stats->keys_moved);
        atomic64_add(e.k->size, &ctxt->stats->sectors_moved);
@@ -369,8 +432,6 @@ int bch2_move_data(struct bch_fs *c,
                   unsigned sectors_in_flight,
                   struct bch_devs_mask *devs,
                   struct write_point_specifier wp,
-                  int btree_insert_flags,
-                  int move_device,
                   struct bpos start,
                   struct bpos end,
                   move_pred_fn pred, void *arg,
@@ -378,12 +439,14 @@ int bch2_move_data(struct bch_fs *c,
 {
        bool kthread = (current->flags & PF_KTHREAD) != 0;
        struct moving_context ctxt = { .stats = stats };
-       struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
+       struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        BKEY_PADDED(k) tmp;
        struct bkey_s_c k;
        struct bkey_s_c_extent e;
+       struct data_opts data_opts;
+       enum data_cmd data_cmd;
        u64 cur_inum = U64_MAX;
-       int ret = 0;
+       int ret = 0, ret2;
 
        closure_init_stack(&ctxt.cl);
        INIT_LIST_HEAD(&ctxt.reads);
@@ -430,28 +493,44 @@ peek:
                        /* don't hold btree locks while looking up inode: */
                        bch2_btree_iter_unlock(&stats->iter);
 
-                       opts = bch2_opts_to_inode_opts(c->opts);
+                       io_opts = bch2_opts_to_inode_opts(c->opts);
                        if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode))
-                               bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode));
+                               bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode));
                        cur_inum = k.k->p.inode;
                        goto peek;
                }
 
-               if (!pred(arg, e))
+               switch ((data_cmd = pred(c, arg, BKEY_TYPE_EXTENTS, e,
+                                        &io_opts, &data_opts))) {
+               case DATA_SKIP:
                        goto next;
+               case DATA_SCRUB:
+                       BUG();
+               case DATA_ADD_REPLICAS:
+               case DATA_REWRITE:
+               case DATA_PROMOTE:
+                       break;
+               default:
+                       BUG();
+               }
 
                /* unlock before doing IO: */
                bkey_reassemble(&tmp.k, k);
                k = bkey_i_to_s_c(&tmp.k);
                bch2_btree_iter_unlock(&stats->iter);
 
-               if (bch2_move_extent(c, &ctxt, devs, wp,
-                                    btree_insert_flags,
-                                    move_device, opts,
-                                    bkey_s_c_to_extent(k))) {
-                       /* memory allocation failure, wait for some IO to finish */
-                       bch2_move_ctxt_wait_for_io(&ctxt);
-                       continue;
+               ret2 = bch2_move_extent(c, &ctxt, devs, wp, io_opts,
+                                       bkey_s_c_to_extent(k),
+                                       data_cmd, data_opts);
+               if (ret2) {
+                       if (ret2 == -ENOMEM) {
+                               /* memory allocation failure, wait for some IO to finish */
+                               bch2_move_ctxt_wait_for_io(&ctxt);
+                               continue;
+                       }
+
+                       /* XXX signal failure */
+                       goto next;
                }
 
                if (rate)
@@ -486,11 +565,11 @@ static int bch2_gc_data_replicas(struct bch_fs *c)
        int ret;
 
        mutex_lock(&c->replicas_gc_lock);
-       bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
+       bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED));
 
        for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
                           BTREE_ITER_PREFETCH, k) {
-               ret = bch2_check_mark_super(c, BCH_DATA_USER, bch2_bkey_devs(k));
+               ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k);
                if (ret)
                        break;
        }
@@ -514,8 +593,8 @@ static int bch2_gc_btree_replicas(struct bch_fs *c)
 
        for (id = 0; id < BTREE_ID_NR; id++) {
                for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
-                       ret = bch2_check_mark_super(c, BCH_DATA_BTREE,
-                                       bch2_bkey_devs(bkey_i_to_s_c(&b->key)));
+                       ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
+                                                     bkey_i_to_s_c(&b->key));
 
                        bch2_btree_iter_cond_resched(&iter);
                }
@@ -534,18 +613,35 @@ static int bch2_move_btree(struct bch_fs *c,
                           void *arg,
                           struct bch_move_stats *stats)
 {
+       struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        struct btree *b;
        unsigned id;
+       struct data_opts data_opts;
+       enum data_cmd cmd;
        int ret = 0;
 
        stats->data_type = BCH_DATA_BTREE;
 
        for (id = 0; id < BTREE_ID_NR; id++) {
                for_each_btree_node(&stats->iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
-                       if (pred(arg, bkey_i_to_s_c_extent(&b->key)))
-                               ret = bch2_btree_node_rewrite(c, &stats->iter,
-                                               b->data->keys.seq, 0) ?: ret;
+                       switch ((cmd = pred(c, arg, BKEY_TYPE_BTREE,
+                                           bkey_i_to_s_c_extent(&b->key),
+                                           &io_opts,
+                                           &data_opts))) {
+                       case DATA_SKIP:
+                               goto next;
+                       case DATA_SCRUB:
+                               BUG();
+                       case DATA_ADD_REPLICAS:
+                       case DATA_REWRITE:
+                               break;
+                       default:
+                               BUG();
+                       }
 
+                       ret = bch2_btree_node_rewrite(c, &stats->iter,
+                                       b->data->keys.seq, 0) ?: ret;
+next:
                        bch2_btree_iter_cond_resched(&stats->iter);
                }
 
@@ -556,32 +652,48 @@ static int bch2_move_btree(struct bch_fs *c,
 }
 
 #if 0
-static bool scrub_data_pred(void *arg, struct bkey_s_c_extent e)
+static enum data_cmd scrub_pred(struct bch_fs *c, void *arg,
+                               enum bkey_type type,
+                               struct bkey_s_c_extent e,
+                               struct bch_io_opts *io_opts,
+                               struct data_opts *data_opts)
 {
+       return DATA_SCRUB;
 }
 #endif
 
-static bool rereplicate_metadata_pred(void *arg, struct bkey_s_c_extent e)
+static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
+                                     enum bkey_type type,
+                                     struct bkey_s_c_extent e,
+                                     struct bch_io_opts *io_opts,
+                                     struct data_opts *data_opts)
 {
-       struct bch_fs *c = arg;
        unsigned nr_good = bch2_extent_nr_good_ptrs(c, e);
+       unsigned replicas = type == BKEY_TYPE_BTREE
+               ? c->opts.metadata_replicas
+               : c->opts.data_replicas;
 
-       return nr_good && nr_good < c->opts.metadata_replicas;
-}
+       if (!nr_good || nr_good >= replicas)
+               return DATA_SKIP;
 
-static bool rereplicate_data_pred(void *arg, struct bkey_s_c_extent e)
-{
-       struct bch_fs *c = arg;
-       unsigned nr_good = bch2_extent_nr_good_ptrs(c, e);
-
-       return nr_good && nr_good < c->opts.data_replicas;
+       data_opts->btree_insert_flags = 0;
+       return DATA_ADD_REPLICAS;
 }
 
-static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
+static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
+                                 enum bkey_type type,
+                                 struct bkey_s_c_extent e,
+                                 struct bch_io_opts *io_opts,
+                                 struct data_opts *data_opts)
 {
        struct bch_ioctl_data *op = arg;
 
-       return bch2_extent_has_device(e, op->migrate.dev);
+       if (!bch2_extent_has_device(e, op->migrate.dev))
+               return DATA_SKIP;
+
+       data_opts->btree_insert_flags   = 0;
+       data_opts->rewrite_dev          = op->migrate.dev;
+       return DATA_REWRITE;
 }
 
 int bch2_data_job(struct bch_fs *c,
@@ -595,16 +707,15 @@ int bch2_data_job(struct bch_fs *c,
                stats->data_type = BCH_DATA_JOURNAL;
                ret = bch2_journal_flush_device(&c->journal, -1);
 
-               ret = bch2_move_btree(c, rereplicate_metadata_pred, c, stats) ?: ret;
+               ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
                ret = bch2_gc_btree_replicas(c) ?: ret;
 
                ret = bch2_move_data(c, NULL, SECTORS_IN_FLIGHT_PER_DEVICE,
                                     NULL,
                                     writepoint_hashed((unsigned long) current),
-                                    0, -1,
                                     op.start,
                                     op.end,
-                                    rereplicate_data_pred, c, stats) ?: ret;
+                                    rereplicate_pred, c, stats) ?: ret;
                ret = bch2_gc_data_replicas(c) ?: ret;
                break;
        case BCH_DATA_OP_MIGRATE:
@@ -620,7 +731,6 @@ int bch2_data_job(struct bch_fs *c,
                ret = bch2_move_data(c, NULL, SECTORS_IN_FLIGHT_PER_DEVICE,
                                     NULL,
                                     writepoint_hashed((unsigned long) current),
-                                    0, -1,
                                     op.start,
                                     op.end,
                                     migrate_pred, &op, stats) ?: ret;
index 07aa5669524c8da0a4786f193524c84f02d48a0b..819e5d9f0a24234b800c144fa43445898be3501b 100644 (file)
@@ -8,23 +8,47 @@
 struct bch_read_bio;
 struct moving_context;
 
+enum data_cmd {
+       DATA_SKIP,
+       DATA_SCRUB,
+       DATA_ADD_REPLICAS,
+       DATA_REWRITE,
+       DATA_PROMOTE,
+};
+
+struct data_opts {
+       unsigned        rewrite_dev;
+       int             btree_insert_flags;
+};
+
 struct migrate_write {
+       enum data_cmd           data_cmd;
+       struct data_opts        data_opts;
+
+       unsigned                nr_ptrs_reserved;
+
        struct moving_context   *ctxt;
 
        /* what we read: */
        struct bch_extent_ptr   ptr;
        u64                     offset;
 
-       int                     move_dev;
-       int                     btree_insert_flags;
        struct bch_write_op     op;
 };
 
-void bch2_migrate_write_init(struct migrate_write *, struct bch_read_bio *);
+void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *);
+int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
+                           struct bch_devs_mask *,
+                           struct write_point_specifier,
+                           struct bch_io_opts,
+                           enum data_cmd, struct data_opts,
+                           struct bkey_s_c);
 
 #define SECTORS_IN_FLIGHT_PER_DEVICE   2048
 
-typedef bool (*move_pred_fn)(void *, struct bkey_s_c_extent);
+typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
+                               enum bkey_type, struct bkey_s_c_extent,
+                               struct bch_io_opts *, struct data_opts *);
 
 struct bch_move_stats {
        enum bch_data_type      data_type;
@@ -39,7 +63,7 @@ struct bch_move_stats {
 int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
                   unsigned, struct bch_devs_mask *,
                   struct write_point_specifier,
-                  int, int, struct bpos, struct bpos,
+                  struct bpos, struct bpos,
                   move_pred_fn, void *,
                   struct bch_move_stats *);
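
Predicates now return an enum data_cmd and fill in *data_opts rather than a plain bool, which is how copygc, tiering and migrate below all collapse onto one callback type. A sketch of a hypothetical predicate that re-replicates a single inode's extents:

static enum data_cmd one_inode_pred(struct bch_fs *c, void *arg,
				    enum bkey_type type,
				    struct bkey_s_c_extent e,
				    struct bch_io_opts *io_opts,
				    struct data_opts *data_opts)
{
	u64 *inum = arg;

	if (e.k->p.inode != *inum)
		return DATA_SKIP;

	data_opts->btree_insert_flags = 0;
	return DATA_ADD_REPLICAS;
}
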
 
index 515d5001aec890f7f23354c1645472036b919b9c..c306a89f8401f160dd3204c75cdc57fb17a730da 100644 (file)
@@ -61,9 +61,9 @@ static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
        return (l->offset > r->offset) - (l->offset < r->offset);
 }
 
-static bool copygc_pred(void *arg, struct bkey_s_c_extent e)
+static bool __copygc_pred(struct bch_dev *ca,
+                         struct bkey_s_c_extent e)
 {
-       struct bch_dev *ca = arg;
        copygc_heap *h = &ca->copygc_heap;
        const struct bch_extent_ptr *ptr =
                bch2_extent_has_device(e, ca->dev_idx);
@@ -83,6 +83,22 @@ static bool copygc_pred(void *arg, struct bkey_s_c_extent e)
        return false;
 }
 
+static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
+                                enum bkey_type type,
+                                struct bkey_s_c_extent e,
+                                struct bch_io_opts *io_opts,
+                                struct data_opts *data_opts)
+{
+       struct bch_dev *ca = arg;
+
+       if (!__copygc_pred(ca, e))
+               return DATA_SKIP;
+
+       data_opts->btree_insert_flags   = BTREE_INSERT_USE_RESERVE;
+       data_opts->rewrite_dev          = ca->dev_idx;
+       return DATA_REWRITE;
+}
+
 static bool have_copygc_reserve(struct bch_dev *ca)
 {
        bool ret;
@@ -165,8 +181,6 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
                             SECTORS_IN_FLIGHT_PER_DEVICE,
                             &ca->self,
                             writepoint_ptr(&ca->copygc_write_point),
-                            BTREE_INSERT_USE_RESERVE,
-                            ca->dev_idx,
                             POS_MIN, POS_MAX,
                             copygc_pred, ca,
                             &move_stats);
index eae63cf82c4042df063f28bce4c4e92c9b6b5e83..ec50345fda622dabdafe6bf7f550baeff995ab52 100644 (file)
@@ -22,6 +22,7 @@ const char * const bch2_compression_types[] = {
        "none",
        "lz4",
        "gzip",
+       "zstd",
        NULL
 };
 
index 5d42dd5f570d3aca965b25766f04f47e46e0b802..8a3ac66b948ce0f60b2a90f3a04ea1ffaa029075 100644 (file)
@@ -73,10 +73,10 @@ enum opt_type {
        BCH_OPT(errors,                 u8,     OPT_RUNTIME,            \
                OPT_STR(bch2_error_actions),                            \
                BCH_SB_ERROR_ACTION,            BCH_ON_ERROR_RO)        \
-       BCH_OPT(metadata_replicas,      u8,     OPT_MOUNT,              \
+       BCH_OPT(metadata_replicas,      u8,     OPT_RUNTIME,            \
                OPT_UINT(1, BCH_REPLICAS_MAX),                          \
                BCH_SB_META_REPLICAS_WANT,      1)                      \
-       BCH_OPT(data_replicas,          u8,     OPT_MOUNT,              \
+       BCH_OPT(data_replicas,          u8,     OPT_RUNTIME,            \
                OPT_UINT(1, BCH_REPLICAS_MAX),                          \
                BCH_SB_DATA_REPLICAS_WANT,      1)                      \
        BCH_OPT(metadata_replicas_required, u8, OPT_MOUNT,              \
@@ -127,6 +127,9 @@ enum opt_type {
        BCH_OPT(verbose_recovery,       u8,     OPT_MOUNT,              \
                OPT_BOOL(),                                             \
                NO_SB_OPT,                      false)                  \
+       BCH_OPT(verbose_init,           u8,     OPT_MOUNT,              \
+               OPT_BOOL(),                                             \
+               NO_SB_OPT,                      false)                  \
        BCH_OPT(journal_flush_disabled, u8,     OPT_RUNTIME,            \
                OPT_BOOL(),                                             \
                NO_SB_OPT,                      false)                  \
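
metadata_replicas and data_replicas becoming OPT_RUNTIME means they can change after mount, which is why rereplicate_pred in move.c reads them per-extent rather than caching them. verbose_init itself is mount-time only; a sketch of setting it from code, assuming the usual bch2_opts_empty() helper:

	struct bch_opts opts = bch2_opts_empty();

	opt_set(opts, verbose_init, true);
	c = bch2_fs_open(devices, nr_devices, opts);
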
index 6ab2c866a168dccef2bd3112f2e055d5080c7ad4..d28f1333e69365b60fd028a95d1b0157f70f97dc 100644 (file)
@@ -74,13 +74,6 @@ static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
              _i < QTYP_NR);                                            \
             _i++)
 
-static inline unsigned enabled_qtypes(struct bch_fs *c)
-{
-       return ((c->opts.usrquota << QTYP_USR)|
-               (c->opts.grpquota << QTYP_GRP)|
-               (c->opts.prjquota << QTYP_PRJ));
-}
-
 static bool ignore_hardlimit(struct bch_memquota_type *q)
 {
        if (capable(CAP_SYS_RESOURCE))
@@ -478,7 +471,7 @@ static int bch2_quota_enable(struct super_block     *sb, unsigned uflags)
        if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota)
                return -EINVAL;
 
-       if (uflags & FS_QUOTA_PDQ_ENFD)
+       if ((uflags & FS_QUOTA_PDQ_ENFD) && !c->opts.prjquota)
                return -EINVAL;
 
        mutex_lock(&c->sb_lock);
@@ -487,10 +480,9 @@ static int bch2_quota_enable(struct super_block    *sb, unsigned uflags)
 
        if (uflags & FS_QUOTA_GDQ_ENFD)
                SET_BCH_SB_GRPQUOTA(c->disk_sb, true);
-#if 0
+
        if (uflags & FS_QUOTA_PDQ_ENFD)
                SET_BCH_SB_PRJQUOTA(c->disk_sb, true);
-#endif
 
        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
index b5536be94ed407620c61c4911c23b79c34a963cb..509b7f0e069d0f33423f72f70fa5c7b05603b0cf 100644 (file)
@@ -20,6 +20,13 @@ static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u)
        };
 }
 
+static inline unsigned enabled_qtypes(struct bch_fs *c)
+{
+       return ((c->opts.usrquota << QTYP_USR)|
+               (c->opts.grpquota << QTYP_GRP)|
+               (c->opts.prjquota << QTYP_PRJ));
+}
+
 #ifdef CONFIG_BCACHEFS_QUOTA
 
 int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters,
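
Moving enabled_qtypes() into the header lets super.c ask "is any quota type enabled?" instead of hand-checking usrquota/grpquota (which previously missed prjquota). The result is a plain bitmask:

	/* e.g. usrquota + prjquota enabled, grpquota not: */
	unsigned qtypes = enabled_qtypes(c);	/* (1 << QTYP_USR)|(1 << QTYP_PRJ) */

	if (qtypes)
		ret = bch2_fs_quota_read(c);
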
index d689a7b0789d76300799da8d435f51a43d1c7007..3a6c9c8217f03719561f236ad3fe3ec3e66095f5 100644 (file)
@@ -43,7 +43,6 @@
  * https://131002.net/siphash/
  */
 
-#include <linux/compiler.h>
 #include <asm/byteorder.h>
 #include <asm/unaligned.h>
 #include <linux/bitops.h>
index f333b8fad58adb907398e84e67c1464be31148dd..c747391707b3f5a1e210fb836ed11ce71f643e13 100644 (file)
@@ -546,6 +546,8 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
        __le64 *i;
        int ret;
 
+       pr_verbose_init(*opts, "");
+
        memset(sb, 0, sizeof(*sb));
        sb->mode = FMODE_READ;
 
@@ -566,8 +568,10 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
                        opt_set(*opts, nochanges, true);
        }
 
-       if (IS_ERR(sb->bdev))
-               return PTR_ERR(sb->bdev);
+       if (IS_ERR(sb->bdev)) {
+               ret = PTR_ERR(sb->bdev);
+               goto out;
+       }
 
        err = "cannot allocate memory";
        ret = __bch2_super_realloc(sb, 0);
@@ -638,12 +642,14 @@ got_super:
        if (sb->mode & FMODE_WRITE)
                bdev_get_queue(sb->bdev)->backing_dev_info->capabilities
                        |= BDI_CAP_STABLE_WRITES;
-
-       return 0;
+       ret = 0;
+out:
+       pr_verbose_init(*opts, "ret %i", ret);
+       return ret;
 err:
        bch2_free_super(sb);
        pr_err("error reading superblock: %s", err);
-       return ret;
+       goto out;
 }
 
 /* write superblock: */
@@ -744,17 +750,15 @@ void bch2_write_super(struct bch_fs *c)
        nr_wrote = dev_mask_nr(&sb_written);
 
        can_mount_with_written =
-               bch2_have_enough_devs(c,
-                       __bch2_replicas_status(c, sb_written),
-                       BCH_FORCE_IF_DEGRADED);
+               bch2_have_enough_devs(__bch2_replicas_status(c, sb_written),
+                                     BCH_FORCE_IF_DEGRADED);
 
        for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
                sb_written.d[i] = ~sb_written.d[i];
 
        can_mount_without_written =
-               bch2_have_enough_devs(c,
-                       __bch2_replicas_status(c, sb_written),
-                       BCH_FORCE_IF_DEGRADED);
+               bch2_have_enough_devs(__bch2_replicas_status(c, sb_written),
+                                     BCH_FORCE_IF_DEGRADED);
 
        /*
         * If we would be able to mount _without_ the devices we successfully
@@ -1052,7 +1056,7 @@ static bool replicas_has_entry(struct bch_replicas_cpu *r,
 }
 
 noinline
-static int bch2_check_mark_super_slowpath(struct bch_fs *c,
+static int bch2_mark_replicas_slowpath(struct bch_fs *c,
                                struct bch_replicas_cpu_entry new_entry,
                                unsigned max_dev)
 {
@@ -1109,9 +1113,9 @@ err:
        return ret;
 }
 
-int bch2_check_mark_super(struct bch_fs *c,
-                         enum bch_data_type data_type,
-                         struct bch_devs_list devs)
+int bch2_mark_replicas(struct bch_fs *c,
+                      enum bch_data_type data_type,
+                      struct bch_devs_list devs)
 {
        struct bch_replicas_cpu_entry search;
        struct bch_replicas_cpu *r, *gc_r;
@@ -1121,6 +1125,8 @@ int bch2_check_mark_super(struct bch_fs *c,
        if (!devs.nr)
                return 0;
 
+       BUG_ON(devs.nr >= BCH_REPLICAS_MAX);
+
        devlist_to_replicas(devs, data_type, &search, &max_dev);
 
        rcu_read_lock();
@@ -1131,7 +1137,23 @@ int bch2_check_mark_super(struct bch_fs *c,
        rcu_read_unlock();
 
        return likely(marked) ? 0
-               : bch2_check_mark_super_slowpath(c, search, max_dev);
+               : bch2_mark_replicas_slowpath(c, search, max_dev);
+}
+
+int bch2_mark_bkey_replicas(struct bch_fs *c,
+                           enum bch_data_type data_type,
+                           struct bkey_s_c k)
+{
+       struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+       unsigned i;
+       int ret;
+
+       for (i = 0; i < cached.nr; i++)
+               if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
+                                             bch2_dev_list_single(cached.devs[i]))))
+                       return ret;
+
+       return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k));
 }
 
 int bch2_replicas_gc_end(struct bch_fs *c, int err)
@@ -1417,7 +1439,7 @@ int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t
 
 /* Query replicas: */
 
-bool bch2_sb_has_replicas(struct bch_fs *c,
+bool bch2_replicas_marked(struct bch_fs *c,
                          enum bch_data_type data_type,
                          struct bch_devs_list devs)
 {
@@ -1438,6 +1460,21 @@ bool bch2_sb_has_replicas(struct bch_fs *c,
        return ret;
 }
 
+bool bch2_bkey_replicas_marked(struct bch_fs *c,
+                              enum bch_data_type data_type,
+                              struct bkey_s_c k)
+{
+       struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+       unsigned i;
+
+       for (i = 0; i < cached.nr; i++)
+               if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
+                                         bch2_dev_list_single(cached.devs[i])))
+                       return false;
+
+       return bch2_replicas_marked(c, data_type, bch2_bkey_dirty_devs(k));
+}
+
 struct replicas_status __bch2_replicas_status(struct bch_fs *c,
                                              struct bch_devs_mask online_devs)
 {
@@ -1495,29 +1532,26 @@ struct replicas_status bch2_replicas_status(struct bch_fs *c)
        return __bch2_replicas_status(c, bch2_online_devs(c));
 }
 
-bool bch2_have_enough_devs(struct bch_fs *c,
-                          struct replicas_status s,
-                          unsigned flags)
+static bool have_enough_devs(struct replicas_status s,
+                            enum bch_data_type type,
+                            bool force_if_degraded,
+                            bool force_if_lost)
 {
-       if ((s.replicas[BCH_DATA_JOURNAL].nr_offline ||
-            s.replicas[BCH_DATA_BTREE].nr_offline) &&
-           !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
-               return false;
-
-       if ((!s.replicas[BCH_DATA_JOURNAL].nr_online ||
-            !s.replicas[BCH_DATA_BTREE].nr_online) &&
-           !(flags & BCH_FORCE_IF_METADATA_LOST))
-               return false;
-
-       if (s.replicas[BCH_DATA_USER].nr_offline &&
-           !(flags & BCH_FORCE_IF_DATA_DEGRADED))
-               return false;
-
-       if (!s.replicas[BCH_DATA_USER].nr_online &&
-           !(flags & BCH_FORCE_IF_DATA_LOST))
-               return false;
+       return (!s.replicas[type].nr_offline || force_if_degraded) &&
+               (s.replicas[type].nr_online || force_if_lost);
+}
 
-       return true;
+bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
+{
+       return (have_enough_devs(s, BCH_DATA_JOURNAL,
+                                flags & BCH_FORCE_IF_METADATA_DEGRADED,
+                                flags & BCH_FORCE_IF_METADATA_LOST) &&
+               have_enough_devs(s, BCH_DATA_BTREE,
+                                flags & BCH_FORCE_IF_METADATA_DEGRADED,
+                                flags & BCH_FORCE_IF_METADATA_LOST) &&
+               have_enough_devs(s, BCH_DATA_USER,
+                                flags & BCH_FORCE_IF_DATA_DEGRADED,
+                                flags & BCH_FORCE_IF_DATA_LOST));
 }
 
 unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
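
The renamed helpers keep the old contract — a key's replica set must be marked in the superblock before the key goes into the btree — but bch2_mark_bkey_replicas() now also marks each cached pointer as its own single-device BCH_DATA_CACHED entry. The caller pattern, as in bch2_dev_usrdata_drop() above:

	/* mark first, so the superblock can't lag behind the index: */
	ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER,
				      bkey_i_to_s_c(&tmp.key));
	if (ret)
		break;

	/* ...only then do the btree update itself */
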
index eb85410c5f16d90a19efc3b7bcca61fdb5053d9c..d7fecf02f81cb019e9fa548be43337368fecfed2 100644 (file)
@@ -139,10 +139,14 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
 
 /* BCH_SB_FIELD_replicas: */
 
-bool bch2_sb_has_replicas(struct bch_fs *, enum bch_data_type,
-                         struct bch_devs_list);
-int bch2_check_mark_super(struct bch_fs *, enum bch_data_type,
+bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type,
                          struct bch_devs_list);
+bool bch2_bkey_replicas_marked(struct bch_fs *, enum bch_data_type,
+                              struct bkey_s_c);
+int bch2_mark_replicas(struct bch_fs *, enum bch_data_type,
+                      struct bch_devs_list);
+int bch2_mark_bkey_replicas(struct bch_fs *, enum bch_data_type,
+                           struct bkey_s_c);
 
 int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *, char *, size_t);
 int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *, char *, size_t);
@@ -157,7 +161,7 @@ struct replicas_status {
 struct replicas_status __bch2_replicas_status(struct bch_fs *,
                                              struct bch_devs_mask);
 struct replicas_status bch2_replicas_status(struct bch_fs *);
-bool bch2_have_enough_devs(struct bch_fs *, struct replicas_status, unsigned);
+bool bch2_have_enough_devs(struct replicas_status, unsigned);
 
 unsigned bch2_replicas_online(struct bch_fs *, bool);
 unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
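
Dropping the struct bch_fs argument makes bch2_have_enough_devs() a pure predicate over a replicas_status snapshot, so callers reduce to:

	struct replicas_status s = bch2_replicas_status(c);

	if (!bch2_have_enough_devs(s, flags))
		return false;	/* not enough devices to run degraded */
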
index f836c199e06b38292ecd0dd044782cde23d8f684..58bcd7d1ee062c3390ad9874672c575c4910ffd3 100644 (file)
@@ -507,9 +507,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        struct bch_fs *c;
        unsigned i, iter_size;
 
+       pr_verbose_init(opts, "");
+
        c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
        if (!c)
-               return NULL;
+               goto out;
 
        __module_get(THIS_MODULE);
 
@@ -539,7 +541,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        mutex_init(&c->btree_interior_update_lock);
 
        mutex_init(&c->bio_bounce_pages_lock);
-       mutex_init(&c->zlib_workspace_lock);
 
        bio_list_init(&c->btree_write_error_list);
        spin_lock_init(&c->btree_write_error_lock);
@@ -646,10 +647,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        kobject_init(&c->internal, &bch2_fs_internal_ktype);
        kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
        kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
+out:
+       pr_verbose_init(opts, "ret %i", c ? 0 : -ENOMEM);
        return c;
 err:
        bch2_fs_free(c);
-       return NULL;
+       c = NULL;
+       goto out;
 }
 
 static const char *__bch2_fs_online(struct bch_fs *c)
@@ -809,7 +813,7 @@ static const char *__bch2_fs_start(struct bch_fs *c)
                        goto err;
                bch_verbose(c, "fsck done");
 
-               if (c->opts.usrquota || c->opts.grpquota) {
+               if (enabled_qtypes(c)) {
                        bch_verbose(c, "reading quotas:");
                        ret = bch2_fs_quota_read(c);
                        if (ret)
@@ -864,7 +868,7 @@ static const char *__bch2_fs_start(struct bch_fs *c)
                                     NULL, NULL, NULL, 0))
                        goto err;
 
-               if (c->opts.usrquota || c->opts.grpquota) {
+               if (enabled_qtypes(c)) {
                        ret = bch2_fs_quota_read(c);
                        if (ret)
                                goto err;
@@ -1084,14 +1088,17 @@ static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
 static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
 {
        struct bch_member *member;
-       struct bch_dev *ca;
+       struct bch_dev *ca = NULL;
+       int ret = 0;
+
+       pr_verbose_init(c->opts, "");
 
        if (bch2_fs_init_fault("dev_alloc"))
-               return -ENOMEM;
+               goto err;
 
        ca = kzalloc(sizeof(*ca), GFP_KERNEL);
        if (!ca)
-               return -ENOMEM;
+               goto err;
 
        kobject_init(&ca->kobj, &bch2_dev_ktype);
        init_completion(&ca->ref_completion);
@@ -1133,11 +1140,14 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
 
        if (bch2_dev_sysfs_online(c, ca))
                pr_warn("error creating sysfs objects");
-
-       return 0;
+out:
+       pr_verbose_init(c->opts, "ret %i", ret);
+       return ret;
 err:
-       bch2_dev_free(ca);
-       return -ENOMEM;
+       if (ca)
+               bch2_dev_free(ca);
+       ret = -ENOMEM;
+       goto out;
 }
 
 static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
@@ -1240,7 +1250,8 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
 
                /* do we have enough devices to write to?  */
                for_each_member_device(ca2, c, i)
-                       nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW;
+                       if (ca2 != ca)
+                               nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW;
 
                required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
                               ? c->opts.metadata_replicas
@@ -1249,7 +1260,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
                               ? c->opts.data_replicas
                               : c->opts.data_replicas_required);
 
-               return nr_rw - 1 <= required;
+               return nr_rw >= required;
        case BCH_MEMBER_STATE_FAILED:
        case BCH_MEMBER_STATE_SPARE:
                if (ca->mi.state != BCH_MEMBER_STATE_RW &&
@@ -1262,7 +1273,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
 
                s = __bch2_replicas_status(c, new_online_devs);
 
-               return bch2_have_enough_devs(c, s, flags);
+               return bch2_have_enough_devs(s, flags);
        default:
                BUG();
        }
@@ -1299,7 +1310,7 @@ static bool bch2_fs_may_start(struct bch_fs *c)
 
        s = bch2_replicas_status(c);
 
-       return bch2_have_enough_devs(c, s, flags);
+       return bch2_have_enough_devs(s, flags);
 }
 
 static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
@@ -1346,12 +1357,8 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
        if (!bch2_dev_state_allowed(c, ca, new_state, flags))
                return -EINVAL;
 
-       if (new_state == BCH_MEMBER_STATE_RW) {
-               if (__bch2_dev_read_write(c, ca))
-                       return -ENOMEM;
-       } else {
+       if (new_state != BCH_MEMBER_STATE_RW)
                __bch2_dev_read_only(c, ca);
-       }
 
        bch_notice(ca, "%s", bch2_dev_state[new_state]);
 
@@ -1361,6 +1368,9 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
 
+       if (new_state == BCH_MEMBER_STATE_RW)
+               return __bch2_dev_read_write(c, ca) ? -ENOMEM : 0;
+
        return 0;
 }
 
@@ -1701,11 +1711,17 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
        const char *err;
        int ret = -ENOMEM;
 
-       if (!nr_devices)
-               return ERR_PTR(-EINVAL);
+       pr_verbose_init(opts, "");
 
-       if (!try_module_get(THIS_MODULE))
-               return ERR_PTR(-ENODEV);
+       if (!nr_devices) {
+               c = ERR_PTR(-EINVAL);
+               goto out2;
+       }
+
+       if (!try_module_get(THIS_MODULE)) {
+               c = ERR_PTR(-ENODEV);
+               goto out2;
+       }
 
        sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
        if (!sb)
@@ -1760,8 +1776,11 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
        if (err)
                goto err_print;
 
+out:
        kfree(sb);
        module_put(THIS_MODULE);
+out2:
+       pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c));
        return c;
 err_print:
        pr_err("bch_fs_open err opening %s: %s",
@@ -1770,12 +1789,10 @@ err_print:
 err:
        if (c)
                bch2_fs_stop(c);
-
        for (i = 0; i < nr_devices; i++)
                bch2_free_super(&sb[i]);
-       kfree(sb);
-       module_put(THIS_MODULE);
-       return ERR_PTR(ret);
+       c = ERR_PTR(ret);
+       goto out;
 }
 
 static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb,
index d0a38cf6750882e38dfcece4123e202c71f705e3..1718f5c103034acbe9f48222298d62ab2280c513 100644 (file)
@@ -67,6 +67,11 @@ static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
        devs->devs[devs->nr++] = dev;
 }
 
+static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
+{
+       return (struct bch_devs_list) { .nr = 1, .devs[0] = dev };
+}
+
 static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
                                              const struct bch_devs_mask *mask)
 {
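
bch2_dev_list_single() exists so each cached pointer can be marked as a one-device replicas entry; super-io.c above loops over the cached devices with it:

	for (i = 0; i < cached.nr; i++) {
		ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
					 bch2_dev_list_single(cached.devs[i]));
		if (ret)
			return ret;
	}
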
index 966da4afbeda99fb7bc4341798ff5b5e1acfeeac..d76d917cb03986f2c892cf38d8efd6153c0dca76 100644 (file)
@@ -15,7 +15,7 @@ struct bch_devs_mask {
 
 struct bch_devs_list {
        u8                      nr;
-       u8                      devs[BCH_REPLICAS_MAX];
+       u8                      devs[BCH_REPLICAS_MAX + 1];
 };
 
 struct bch_member_cpu {
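
The extra slot is presumably headroom for a cached pointer on top of BCH_REPLICAS_MAX dirty pointers, matching the new BUG_ON() in bch2_mark_replicas(). A compile-time check of that sizing assumption might look like:

	BUILD_BUG_ON(sizeof(((struct bch_devs_list *) 0)->devs) <
		     BCH_REPLICAS_MAX + 1);
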
index 597e1f02bd6a9d497133a7eea6122f1c37353540..2e958a8ef3817075eda22b8be1cdaccc8b9f005d 100644 (file)
@@ -164,6 +164,8 @@ read_attribute(extent_migrate_raced);
 rw_attribute(journal_write_delay_ms);
 rw_attribute(journal_reclaim_delay_ms);
 
+rw_attribute(writeback_pages_max);
+
 rw_attribute(discard);
 rw_attribute(cache_replacement_policy);
 
@@ -310,6 +312,8 @@ SHOW(bch2_fs)
        sysfs_print(journal_write_delay_ms,     c->journal.write_delay_ms);
        sysfs_print(journal_reclaim_delay_ms,   c->journal.reclaim_delay_ms);
 
+       sysfs_print(writeback_pages_max,        c->writeback_pages_max);
+
        sysfs_print(block_size,                 block_bytes(c));
        sysfs_print(btree_node_size,            btree_bytes(c));
        sysfs_hprint(btree_cache_size,          bch2_btree_cache_size(c));
@@ -370,6 +374,9 @@ STORE(__bch2_fs)
        sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms);
        sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
 
+       if (attr == &sysfs_writeback_pages_max)
+               c->writeback_pages_max = strtoul_restrict_or_return(buf, 1, UINT_MAX);
+
        if (attr == &sysfs_btree_gc_periodic) {
                ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic)
                        ?: (ssize_t) size;
@@ -459,6 +466,8 @@ struct attribute *bch2_fs_files[] = {
        &sysfs_journal_write_delay_ms,
        &sysfs_journal_reclaim_delay_ms,
 
+       &sysfs_writeback_pages_max,
+
        &sysfs_tiering_percent,
 
        &sysfs_compression_stats,
index c4625c80bbf80db449bb66ea611b1bc327ef1dd9..775c2e2be686d4d97c04f7a3a2f721106549a1a1 100644 (file)
 #include <linux/kthread.h>
 #include <trace/events/bcachefs.h>
 
-static bool tiering_pred(void *arg, struct bkey_s_c_extent e)
+static bool __tiering_pred(struct bch_fs *c, struct bch_tier *tier,
+                          struct bkey_s_c_extent e)
 {
-       struct bch_tier *tier = arg;
-       struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]);
        const struct bch_extent_ptr *ptr;
        unsigned replicas = 0;
 
@@ -33,6 +32,21 @@ static bool tiering_pred(void *arg, struct bkey_s_c_extent e)
        return replicas < c->opts.data_replicas;
 }
 
+static enum data_cmd tiering_pred(struct bch_fs *c, void *arg,
+                                 enum bkey_type type,
+                                 struct bkey_s_c_extent e,
+                                 struct bch_io_opts *io_opts,
+                                 struct data_opts *data_opts)
+{
+       struct bch_tier *tier = arg;
+
+       if (!__tiering_pred(c, tier, e))
+               return DATA_SKIP;
+
+       data_opts->btree_insert_flags = 0;
+       return DATA_ADD_REPLICAS;
+}
+
 static int bch2_tiering_thread(void *arg)
 {
        struct bch_tier *tier = arg;
@@ -90,8 +104,6 @@ static int bch2_tiering_thread(void *arg)
                               SECTORS_IN_FLIGHT_PER_DEVICE * nr_devices,
                               &tier->devs,
                               writepoint_ptr(&tier->wp),
-                              0,
-                              -1,
                               POS_MIN, POS_MAX,
                               tiering_pred, tier,
                               &move_stats);
index 6e97e83184e17d0a4e4a8c617888152c20395686..d475f986ad30b4584a9f117d5a05e0b4592b0d82 100644 (file)
@@ -817,4 +817,19 @@ do {                                                                       \
 #define array_remove_item(_array, _nr, _pos)                           \
        array_remove_items(_array, _nr, _pos, 1)
 
+#define bubble_sort(_base, _nr, _cmp)                                  \
+do {                                                                   \
+       ssize_t _i, _end;                                               \
+       bool _swapped = true;                                           \
+                                                                       \
+       for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\
+               _swapped = false;                                       \
+               for (_i = 0; _i < _end; _i++)                           \
+                       if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) {   \
+                               swap((_base)[_i], (_base)[_i + 1]);     \
+                               _swapped = true;                        \
+                       }                                               \
+       }                                                               \
+} while (0)
+
 #endif /* _BCACHEFS_UTIL_H */
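
bubble_sort() compares and swaps the elements themselves (unlike sort(), whose comparator takes pointers), so it suits small fixed arrays. A sketch with a hypothetical cmp helper, using the same (l > r) - (l < r) idiom as bucket_offset_cmp() above:

static int u64_cmp(u64 l, u64 r)
{
	return (l > r) - (l < r);
}

static void sort_seqs(u64 *seqs, size_t nr)
{
	/* stable, in place, O(n^2) - fine for a handful of elements */
	bubble_sort(seqs, nr, u64_cmp);
}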