-d5e561b3cc023dd247d2b3d08b680709ec21b477
+e99d29e40210f6d9b7ec9e5b7aee1e48ae7655c5
-D_GNU_SOURCE \
-D_LGPL_SOURCE \
-DRCU_MEMBARRIER \
+ -DZSTD_STATIC_LINKING_ONLY \
-DNO_BCACHEFS_CHARDEV \
-DNO_BCACHEFS_FS \
-DNO_BCACHEFS_SYSFS \
endif
PKGCONFIG_LIBS="blkid uuid liburcu libsodium zlib"
+PKGCONFIG_LIBS_STATIC="libzstd"
+
CFLAGS+=`pkg-config --cflags ${PKGCONFIG_LIBS}`
-LDLIBS+=`pkg-config --libs ${PKGCONFIG_LIBS}` \
- -lm -lpthread -lrt -lscrypt -lkeyutils -laio
+LDLIBS+=`pkg-config --libs ${PKGCONFIG_LIBS}`
+
+CFLAGS+=`pkg-config --static --cflags ${PKGCONFIG_LIBS_STATIC}`
+LDLIBS+=`pkg-config --static --libs ${PKGCONFIG_LIBS_STATIC}`
+
+LDLIBS+=-lm -lpthread -lrt -lscrypt -lkeyutils -laio
ifeq ($(PREFIX),/usr)
ROOT_SBINDIR=/sbin
die("error reserving space in new filesystem: %s",
strerror(-ret));
- bch2_check_mark_super(c, BCH_DATA_USER,
- bch2_bkey_devs(extent_i_to_s_c(e).s_c));
+ bch2_mark_bkey_replicas(c, BCH_DATA_USER,
+ extent_i_to_s_c(e).s_c);
ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &e->k_i,
&res, NULL, NULL, 0);
Standards-Version: 3.9.5
Build-Depends: debhelper (>= 9), pkg-config, libblkid-dev, uuid-dev,
libscrypt-dev, libsodium-dev, libkeyutils-dev, liburcu-dev, zlib1g-dev,
- libattr1-dev, libaio-dev
+ libattr1-dev, libaio-dev, libzstd-dev
Homepage: http://bcache.evilpiepirate.org/
Package: bcachefs-tools
return ob - c->open_buckets;
}
-struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
- struct write_point *wp,
- struct bch_devs_mask *devs)
+static int __dev_alloc_cmp(struct bch_fs *c,
+ struct write_point *wp,
+ unsigned l, unsigned r)
{
- struct dev_alloc_list ret = { .nr = 0 };
- struct bch_dev *ca, *ca2;
- unsigned i, j;
+ struct bch_dev *ca_l = rcu_dereference(c->devs[l]);
+ struct bch_dev *ca_r = rcu_dereference(c->devs[r]);
- for_each_member_device_rcu(ca, c, i, devs) {
- for (j = 0; j < ret.nr; j++) {
- unsigned idx = ret.devs[j];
+ if (ca_l && ca_r && ca_l->mi.tier != ca_r->mi.tier)
+ return ((ca_l->mi.tier > ca_r->mi.tier) -
+ (ca_l->mi.tier < ca_r->mi.tier));
- ca2 = rcu_dereference(c->devs[idx]);
- if (!ca2)
- break;
+ return ((wp->next_alloc[l] > wp->next_alloc[r]) -
+ (wp->next_alloc[l] < wp->next_alloc[r]));
+}
- if (ca->mi.tier < ca2->mi.tier)
- break;
+#define dev_alloc_cmp(l, r) __dev_alloc_cmp(c, wp, l, r)
- if (ca->mi.tier == ca2->mi.tier &&
- wp->next_alloc[i] < wp->next_alloc[idx])
- break;
- }
+struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
+ struct write_point *wp,
+ struct bch_devs_mask *devs)
+{
+ struct dev_alloc_list ret = { .nr = 0 };
+ struct bch_dev *ca;
+ unsigned i;
- array_insert_item(ret.devs, ret.nr, j, i);
- }
+ for_each_member_device_rcu(ca, c, i, devs)
+ ret.devs[ret.nr++] = i;
+ bubble_sort(ret.devs, ret.nr, dev_alloc_cmp);
return ret;
}
void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
struct write_point *wp)
{
- unsigned i;
+ u64 *v = wp->next_alloc + ca->dev_idx;
+ u64 free_space = dev_buckets_free(c, ca);
+ u64 free_space_inv = free_space
+ ? div64_u64(1ULL << 48, free_space)
+ : 1ULL << 48;
+ u64 scale = *v / 4;
+
+ if (*v + free_space_inv >= *v)
+ *v += free_space_inv;
+ else
+ *v = U64_MAX;
- for (i = 0; i < ARRAY_SIZE(wp->next_alloc); i++)
- wp->next_alloc[i] >>= 1;
+ for (v = wp->next_alloc;
+ v < wp->next_alloc + ARRAY_SIZE(wp->next_alloc); v++)
+ *v = *v < scale ? 0 : *v - scale;
}
static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
{
enum bucket_alloc_ret ret = NO_DEVICES;
struct dev_alloc_list devs_sorted;
- u64 buckets_free;
unsigned i;
BUG_ON(nr_replicas > ARRAY_SIZE(wp->ptrs));
BUG_ON(wp->nr_ptrs >= ARRAY_SIZE(wp->ptrs));
wp->ptrs[wp->nr_ptrs++] = c->open_buckets + ob;
- buckets_free = U64_MAX, dev_buckets_free(c, ca);
- if (buckets_free)
- wp->next_alloc[ca->dev_idx] +=
- div64_u64(U64_MAX, buckets_free *
- ca->mi.bucket_size);
- else
- wp->next_alloc[ca->dev_idx] = U64_MAX;
bch2_wp_rescale(c, ca, wp);
__clear_bit(ca->dev_idx, devs->d);
#include <linux/shrinker.h>
#include <linux/types.h>
#include <linux/workqueue.h>
+#include <linux/zstd.h>
#include "bcachefs_format.h"
#include "bset.h"
bch_info(c, fmt, ##__VA_ARGS__); \
} while (0)
+#define pr_verbose_init(opts, fmt, ...) \
+do { \
+ if (opt_get(opts, verbose_init)) \
+ pr_info(fmt, ##__VA_ARGS__); \
+} while (0)
+
/* Parameters that are useful for debugging, but should always be compiled in: */
#define BCH_DEBUG_PARAMS_ALWAYS() \
BCH_DEBUG_PARAM(key_merging_disabled, \
struct mutex bio_bounce_pages_lock;
mempool_t bio_bounce_pages;
- mempool_t lz4_workspace_pool;
- void *zlib_workspace;
- struct mutex zlib_workspace_lock;
mempool_t compression_bounce[2];
+ mempool_t compress_workspace[BCH_COMPRESSION_NR];
+ mempool_t decompress_workspace;
+ ZSTD_parameters zstd_params;
struct crypto_shash *sha256;
struct crypto_skcipher *chacha20;
*/
#include <asm/types.h>
-#include <linux/compiler.h>
#include <asm/byteorder.h>
#include <linux/uuid.h>
BCH_COMPRESSION_LZ4_OLD = 1,
BCH_COMPRESSION_GZIP = 2,
BCH_COMPRESSION_LZ4 = 3,
- BCH_COMPRESSION_NR = 4,
+ BCH_COMPRESSION_ZSTD = 4,
+ BCH_COMPRESSION_NR = 5,
};
enum bch_extent_entry_type {
enum bch_sb_features {
BCH_FEATURE_LZ4 = 0,
BCH_FEATURE_GZIP = 1,
+ BCH_FEATURE_ZSTD = 2,
};
/* options: */
BCH_STR_HASH_NR = 3,
};
+#define BCH_COMPRESSION_TYPES() \
+ x(NONE) \
+ x(LZ4) \
+ x(GZIP) \
+ x(ZSTD)
+
enum bch_compression_opts {
- BCH_COMPRESSION_OPT_NONE = 0,
- BCH_COMPRESSION_OPT_LZ4 = 1,
- BCH_COMPRESSION_OPT_GZIP = 2,
- BCH_COMPRESSION_OPT_NR = 3,
+#define x(t) BCH_COMPRESSION_OPT_##t,
+ BCH_COMPRESSION_TYPES()
+#undef x
+ BCH_COMPRESSION_OPT_NR
};
/*
};
} __attribute__((packed, aligned(8)));
-LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4);
-LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8);
+LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4);
+LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8);
+/* 8-32 unused */
+LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64);
struct btree_node_entry {
struct bch_csum csum;
{
struct btree_cache *bc = &c->btree_cache;
unsigned i;
- int ret;
+ int ret = 0;
+
+ pr_verbose_init(c->opts, "");
ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
if (ret)
- return ret;
+ goto out;
bc->table_init_done = true;
bch2_recalc_btree_reserve(c);
for (i = 0; i < bc->reserve; i++)
- if (!btree_node_mem_alloc(c, GFP_KERNEL))
- return -ENOMEM;
+ if (!btree_node_mem_alloc(c, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto out;
+ }
list_splice_init(&bc->live, &bc->freeable);
mutex_init(&c->verify_lock);
c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
- if (!c->verify_ondisk)
- return -ENOMEM;
+ if (!c->verify_ondisk) {
+ ret = -ENOMEM;
+ goto out;
+ }
c->verify_data = btree_node_mem_alloc(c, GFP_KERNEL);
- if (!c->verify_data)
- return -ENOMEM;
+ if (!c->verify_data) {
+ ret = -ENOMEM;
+ goto out;
+ }
list_del_init(&c->verify_data->list);
#endif
bc->shrink.seeks = 4;
bc->shrink.batch = btree_pages(c) * 2;
register_shrinker(&bc->shrink);
-
- return 0;
+out:
+ pr_verbose_init(c->opts, "ret %i", ret);
+ return ret;
}
void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
{
enum bch_data_type data_type = type == BKEY_TYPE_BTREE
? BCH_DATA_BTREE : BCH_DATA_USER;
- struct bch_devs_list devs = bch2_bkey_devs(k);
int ret = 0;
if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
- fsck_err_on(!bch2_sb_has_replicas(c, data_type, devs), c,
+ fsck_err_on(!bch2_bkey_replicas_marked(c, data_type, k), c,
"superblock not marked as containing replicas (type %u)",
data_type)) {
- ret = bch2_check_mark_super(c, data_type, devs);
+ ret = bch2_mark_bkey_replicas(c, data_type, k);
if (ret)
return ret;
}
unsigned sectors, whiteout_u64s = 0;
struct nonce nonce;
struct bch_csum csum;
+ bool first = !b->written;
if (!b->written) {
i = &b->data->keys;
}
if (ret) {
- btree_err_on(!b->written,
+ btree_err_on(first,
BTREE_ERR_FIXABLE, c, b, i,
"first btree node bset has blacklisted journal seq");
- if (b->written)
+ if (!first)
continue;
}
n->data->min_key = b->data->min_key;
n->data->max_key = b->data->max_key;
n->data->format = format;
+ SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
btree_node_set_format(n, format);
goto err_free;
}
- ret = bch2_check_mark_super(c, BCH_DATA_BTREE,
- bch2_bkey_devs(bkey_i_to_s_c(&b->key)));
+ ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
+ bkey_i_to_s_c(&b->key));
if (ret)
goto err_free;
n2->data->max_key = n1->data->max_key;
n2->data->format = n1->format;
+ SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
n2->key.k.p = n1->key.k.p;
btree_node_set_format(n2, n2->data->format);
goto err;
}
- ret = bch2_check_mark_super(c, BCH_DATA_BTREE,
- bch2_extent_devs(extent_i_to_s_c(new_key)));
+ ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
+ extent_i_to_s_c(new_key).s_c);
if (ret)
goto err_free_update;
bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter);
}
-static inline void btree_trans_sort(struct btree_insert *trans)
+static inline int btree_trans_cmp(struct btree_insert_entry l,
+ struct btree_insert_entry r)
{
- int i, end = trans->nr;
-
- while (--end > 0)
- for (i = 0; i < end; i++)
- if (btree_iter_cmp(trans->entries[i].iter,
- trans->entries[i + 1].iter) > 0)
- swap(trans->entries[i], trans->entries[i + 1]);
+ return btree_iter_cmp(l.iter, r.iter);
}
/* Normal update interface: */
bkey_i_to_s_c(i->k)));
}
- btree_trans_sort(trans);
+ bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
if (unlikely(!percpu_ref_tryget(&c->writes)))
return -EROFS;
crypto_alloc_skcipher("chacha20", 0, 0);
int ret;
- if (!chacha20)
+ if (!chacha20) {
+ pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20));
return PTR_ERR(chacha20);
+ }
ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key));
- if (ret)
+ if (ret) {
+ pr_err("crypto_skcipher_setkey() error: %i", ret);
goto err;
+ }
do_encrypt(chacha20, nonce, buf, len);
err:
ret = bch2_request_key(c->disk_sb, &user_key);
if (ret) {
- bch_err(c, "error requesting encryption key");
+ bch_err(c, "error requesting encryption key: %i", ret);
goto err;
}
{
if (!c->chacha20)
c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0);
- if (IS_ERR(c->chacha20))
+ if (IS_ERR(c->chacha20)) {
+ bch_err(c, "error requesting chacha20 module: %li",
+ PTR_ERR(c->chacha20));
return PTR_ERR(c->chacha20);
+ }
if (!c->poly1305)
c->poly1305 = crypto_alloc_shash("poly1305", 0, 0);
- if (IS_ERR(c->poly1305))
+ if (IS_ERR(c->poly1305)) {
+ bch_err(c, "error requesting poly1305 module: %li",
+ PTR_ERR(c->poly1305));
return PTR_ERR(c->poly1305);
+ }
return 0;
}
if (keyed) {
ret = bch2_request_key(c->disk_sb, &user_key);
if (ret) {
- bch_err(c, "error requesting encryption key");
+ bch_err(c, "error requesting encryption key: %i", ret);
goto err;
}
{
struct bch_sb_field_crypt *crypt;
struct bch_key key;
- int ret;
+ int ret = 0;
+
+ pr_verbose_init(c->opts, "");
c->sha256 = crypto_alloc_shash("sha256", 0, 0);
- if (IS_ERR(c->sha256))
- return PTR_ERR(c->sha256);
+ if (IS_ERR(c->sha256)) {
+ bch_err(c, "error requesting sha256 module");
+ ret = PTR_ERR(c->sha256);
+ goto out;
+ }
crypt = bch2_sb_get_crypt(c->disk_sb);
if (!crypt)
- return 0;
+ goto out;
ret = bch2_alloc_ciphers(c);
if (ret)
- return ret;
+ goto out;
ret = bch2_decrypt_sb_key(c, crypt, &key);
if (ret)
- goto err;
+ goto out;
ret = crypto_skcipher_setkey(c->chacha20,
(void *) &key.key, sizeof(key.key));
-err:
+ if (ret)
+ goto out;
+out:
memzero_explicit(&key, sizeof(key));
+ pr_verbose_init(c->opts, "ret %i", ret);
return ret;
}
return bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
}
-static inline enum bch_compression_type
-bch2_compression_opt_to_type(enum bch_compression_opts type)
-{
- switch (type) {
- case BCH_COMPRESSION_OPT_NONE:
- return BCH_COMPRESSION_NONE;
- case BCH_COMPRESSION_OPT_LZ4:
- return BCH_COMPRESSION_LZ4;
- case BCH_COMPRESSION_OPT_GZIP:
- return BCH_COMPRESSION_GZIP;
- default:
- BUG();
- }
-}
+static const unsigned bch2_compression_opt_to_type[] = {
+#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_##t,
+ BCH_COMPRESSION_TYPES()
+#undef x
+};
static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
unsigned type)
#include "lz4.h"
#include <linux/lz4.h>
#include <linux/zlib.h>
+#include <linux/zstd.h>
/* Bounce buffer: */
struct bbuf {
struct bbuf src_data = { NULL };
size_t src_len = src->bi_iter.bi_size;
size_t dst_len = crc.uncompressed_size << 9;
+ void *workspace;
int ret;
src_data = bio_map_or_bounce(c, src, READ);
case BCH_COMPRESSION_LZ4_OLD:
ret = bch2_lz4_decompress(src_data.b, &src_len,
dst_data, dst_len);
- if (ret) {
- ret = -EIO;
+ if (ret)
goto err;
- }
break;
case BCH_COMPRESSION_LZ4:
ret = LZ4_decompress_safe_partial(src_data.b, dst_data,
src_len, dst_len, dst_len);
- if (ret != dst_len) {
- ret = -EIO;
+ if (ret != dst_len)
goto err;
- }
break;
case BCH_COMPRESSION_GZIP: {
- void *workspace;
- z_stream strm;
-
- workspace = kmalloc(zlib_inflate_workspacesize(),
- GFP_NOIO|__GFP_NOWARN);
- if (!workspace) {
- mutex_lock(&c->zlib_workspace_lock);
- workspace = c->zlib_workspace;
- }
+ z_stream strm = {
+ .next_in = src_data.b,
+ .avail_in = src_len,
+ .next_out = dst_data,
+ .avail_out = dst_len,
+ };
+
+ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
- strm.next_in = src_data.b;
- strm.avail_in = src_len;
- strm.next_out = dst_data;
- strm.avail_out = dst_len;
zlib_set_workspace(&strm, workspace);
zlib_inflateInit2(&strm, -MAX_WBITS);
-
ret = zlib_inflate(&strm, Z_FINISH);
- if (workspace == c->zlib_workspace)
- mutex_unlock(&c->zlib_workspace_lock);
- else
- kfree(workspace);
+ mempool_free(workspace, &c->decompress_workspace);
- if (ret != Z_STREAM_END) {
- ret = -EIO;
+ if (ret != Z_STREAM_END)
+ goto err;
+ break;
+ }
+ case BCH_COMPRESSION_ZSTD: {
+ ZSTD_DCtx *ctx;
+ size_t len;
+
+ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
+ ctx = ZSTD_initDCtx(workspace, ZSTD_DCtxWorkspaceBound());
+
+ src_len = le32_to_cpup(src_data.b);
+
+ len = ZSTD_decompressDCtx(ctx,
+ dst_data, dst_len,
+ src_data.b + 4, src_len);
+
+ mempool_free(workspace, &c->decompress_workspace);
+
+ if (len != dst_len)
goto err;
- }
break;
}
default:
BUG();
}
ret = 0;
-err:
+out:
bio_unmap_or_unbounce(c, src_data);
return ret;
+err:
+ ret = -EIO;
+ goto out;
}
int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
return ret;
}
+static int attempt_compress(struct bch_fs *c,
+ void *workspace,
+ void *dst, size_t dst_len,
+ void *src, size_t src_len,
+ unsigned compression_type)
+{
+ switch (compression_type) {
+ case BCH_COMPRESSION_LZ4: {
+ int len = src_len;
+ int ret = LZ4_compress_destSize(
+ src, dst,
+ &len, dst_len,
+ workspace);
+
+ if (len < src_len)
+ return -len;
+
+ return ret;
+ }
+ case BCH_COMPRESSION_GZIP: {
+ z_stream strm = {
+ .next_in = src,
+ .avail_in = src_len,
+ .next_out = dst,
+ .avail_out = dst_len,
+ };
+
+ zlib_set_workspace(&strm, workspace);
+ zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+ Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
+ Z_DEFAULT_STRATEGY);
+
+ if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END)
+ return 0;
+
+ if (zlib_deflateEnd(&strm) != Z_OK)
+ return 0;
+
+ return strm.total_out;
+ }
+ case BCH_COMPRESSION_ZSTD: {
+ ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace,
+ ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams));
+
+ size_t len = ZSTD_compressCCtx(ctx,
+ dst + 4, dst_len - 4,
+ src, src_len,
+ c->zstd_params);
+ if (ZSTD_isError(len))
+ return 0;
+
+ *((__le32 *) dst) = cpu_to_le32(len);
+ return len + 4;
+ }
+ default:
+ BUG();
+ }
+}
+
static unsigned __bio_compress(struct bch_fs *c,
struct bio *dst, size_t *dst_len,
struct bio *src, size_t *src_len,
unsigned compression_type)
{
struct bbuf src_data = { NULL }, dst_data = { NULL };
+ void *workspace;
unsigned pad;
int ret = 0;
/* If it's only one block, don't bother trying to compress: */
if (bio_sectors(src) <= c->opts.block_size)
- goto err;
+ return 0;
dst_data = bio_map_or_bounce(c, dst, WRITE);
src_data = bio_map_or_bounce(c, src, READ);
- switch (compression_type) {
- case BCH_COMPRESSION_LZ4_OLD:
- compression_type = BCH_COMPRESSION_LZ4;
+ workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOIO);
- case BCH_COMPRESSION_LZ4: {
- void *workspace;
- int len = src->bi_iter.bi_size;
-
- workspace = mempool_alloc(&c->lz4_workspace_pool, GFP_NOIO);
-
- while (1) {
- if (len <= block_bytes(c)) {
- ret = 0;
- break;
- }
-
- ret = LZ4_compress_destSize(
- src_data.b, dst_data.b,
- &len, dst->bi_iter.bi_size,
- workspace);
- if (ret >= len) {
- /* uncompressible: */
- ret = 0;
- break;
- }
-
- if (!(len & (block_bytes(c) - 1)))
- break;
- len = round_down(len, block_bytes(c));
- }
- mempool_free(workspace, &c->lz4_workspace_pool);
+ *src_len = src->bi_iter.bi_size;
+ *dst_len = dst->bi_iter.bi_size;
- if (!ret)
- goto err;
-
- *src_len = len;
- *dst_len = ret;
- ret = 0;
- break;
- }
- case BCH_COMPRESSION_GZIP: {
- void *workspace;
- z_stream strm;
-
- workspace = kmalloc(zlib_deflate_workspacesize(MAX_WBITS,
- DEF_MEM_LEVEL),
- GFP_NOIO|__GFP_NOWARN);
- if (!workspace) {
- mutex_lock(&c->zlib_workspace_lock);
- workspace = c->zlib_workspace;
+ /*
+ * XXX: this algorithm sucks when the compression code doesn't tell us
+ * how much would fit, like LZ4 does:
+ */
+ while (1) {
+ if (*src_len <= block_bytes(c)) {
+ ret = -1;
+ break;
}
- strm.next_in = src_data.b;
- strm.avail_in = min(src->bi_iter.bi_size,
- dst->bi_iter.bi_size);
- strm.next_out = dst_data.b;
- strm.avail_out = dst->bi_iter.bi_size;
- zlib_set_workspace(&strm, workspace);
- zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
- Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
- Z_DEFAULT_STRATEGY);
-
- ret = zlib_deflate(&strm, Z_FINISH);
- if (ret != Z_STREAM_END) {
- ret = -EIO;
- goto zlib_err;
+ ret = attempt_compress(c, workspace,
+ dst_data.b, *dst_len,
+ src_data.b, *src_len,
+ compression_type);
+ if (ret > 0) {
+ *dst_len = ret;
+ ret = 0;
+ break;
}
- ret = zlib_deflateEnd(&strm);
- if (ret != Z_OK) {
- ret = -EIO;
- goto zlib_err;
+ /* Didn't fit: should we retry with a smaller amount? */
+ if (*src_len <= *dst_len) {
+ ret = -1;
+ break;
}
- ret = 0;
-zlib_err:
- if (workspace == c->zlib_workspace)
- mutex_unlock(&c->zlib_workspace_lock);
+ /*
+ * If ret is negative, it's a hint as to how much data would fit
+ */
+ BUG_ON(-ret >= *src_len);
+
+ if (ret < 0)
+ *src_len = -ret;
else
- kfree(workspace);
+ *src_len -= (*src_len - *dst_len) / 2;
+ *src_len = round_down(*src_len, block_bytes(c));
+ }
- if (ret)
- goto err;
+ mempool_free(workspace, &c->compress_workspace[compression_type]);
- *dst_len = strm.total_out;
- *src_len = strm.total_in;
- break;
- }
- default:
- BUG();
- }
+ if (ret)
+ goto err;
/* Didn't get smaller: */
if (round_up(*dst_len, block_bytes(c)) >= *src_len)
/* Don't generate a bigger output than input: */
dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
+ if (compression_type == BCH_COMPRESSION_LZ4_OLD)
+ compression_type = BCH_COMPRESSION_LZ4;
+
compression_type =
__bio_compress(c, dst, dst_len, src, src_len, compression_type);
return compression_type;
}
+#define BCH_FEATURE_NONE 0
+
+static const unsigned bch2_compression_opt_to_feature[] = {
+#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t,
+ BCH_COMPRESSION_TYPES()
+#undef x
+};
+
+#undef BCH_FEATURE_NONE
+
/* doesn't write superblock: */
int bch2_check_set_has_compressed_data(struct bch_fs *c,
unsigned compression_type)
{
- switch (compression_type) {
- case BCH_COMPRESSION_OPT_NONE:
- return 0;
- case BCH_COMPRESSION_OPT_LZ4:
- if (bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4))
- return 0;
+ unsigned f;
+ int ret = 0;
- bch2_sb_set_feature(c->disk_sb, BCH_FEATURE_LZ4);
- break;
- case BCH_COMPRESSION_OPT_GZIP:
- if (bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
- return 0;
+ pr_verbose_init(c->opts, "");
- bch2_sb_set_feature(c->disk_sb, BCH_FEATURE_GZIP);
- break;
- default:
- BUG();
- }
+ BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature));
+
+ if (!compression_type)
+ goto out;
- return bch2_fs_compress_init(c);
+ f = bch2_compression_opt_to_feature[compression_type];
+ if (bch2_sb_test_feature(c->disk_sb, f))
+ goto out;
+
+ bch2_sb_set_feature(c->disk_sb, f);
+ ret = bch2_fs_compress_init(c);
+out:
+ pr_verbose_init(c->opts, "ret %i", ret);
+ return ret;
}
void bch2_fs_compress_exit(struct bch_fs *c)
{
- vfree(c->zlib_workspace);
- mempool_exit(&c->lz4_workspace_pool);
+ unsigned i;
+
+ mempool_exit(&c->decompress_workspace);
+ for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++)
+ mempool_exit(&c->compress_workspace[i]);
mempool_exit(&c->compression_bounce[WRITE]);
mempool_exit(&c->compression_bounce[READ]);
}
-#define COMPRESSION_WORKSPACE_SIZE \
- max_t(size_t, zlib_inflate_workspacesize(), \
- zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL))
+static void *mempool_kvpmalloc(gfp_t gfp_mask, void *pool_data)
+{
+ size_t size = (size_t)pool_data;
+ return kvpmalloc(size, gfp_mask);
+}
+
+static void mempool_kvpfree(void *element, void *pool_data)
+{
+ size_t size = (size_t)pool_data;
+ kvpfree(element, size);
+}
+
+static int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
+{
+ return !mempool_initialized(pool)
+ ? mempool_init(pool, min_nr, mempool_kvpmalloc,
+ mempool_kvpfree, (void *) size)
+ : 0;
+}
int bch2_fs_compress_init(struct bch_fs *c)
{
- unsigned order = get_order(c->sb.encoded_extent_max << 9);
- int ret;
+ size_t max_extent = c->sb.encoded_extent_max << 9;
+ size_t order = get_order(max_extent);
+ size_t decompress_workspace_size = 0;
+ bool decompress_workspace_needed;
+ ZSTD_parameters params = ZSTD_getParams(0, max_extent, 0);
+ struct {
+ unsigned feature;
+ unsigned type;
+ size_t compress_workspace;
+ size_t decompress_workspace;
+ } compression_types[] = {
+ { BCH_FEATURE_LZ4, BCH_COMPRESSION_LZ4, LZ4_MEM_COMPRESS, 0 },
+ { BCH_FEATURE_GZIP, BCH_COMPRESSION_GZIP,
+ zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
+ zlib_inflate_workspacesize(), },
+ { BCH_FEATURE_ZSTD, BCH_COMPRESSION_ZSTD,
+ ZSTD_CCtxWorkspaceBound(params.cParams),
+ ZSTD_DCtxWorkspaceBound() },
+ }, *i;
+ int ret = 0;
- if (!bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) &&
- !bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
- return 0;
+ pr_verbose_init(c->opts, "");
+
+ c->zstd_params = params;
+
+ for (i = compression_types;
+ i < compression_types + ARRAY_SIZE(compression_types);
+ i++)
+ if (bch2_sb_test_feature(c->disk_sb, i->feature))
+ goto have_compressed;
+
+ goto out;
+have_compressed:
if (!mempool_initialized(&c->compression_bounce[READ])) {
ret = mempool_init_page_pool(&c->compression_bounce[READ],
1, order);
if (ret)
- return ret;
+ goto out;
}
if (!mempool_initialized(&c->compression_bounce[WRITE])) {
ret = mempool_init_page_pool(&c->compression_bounce[WRITE],
1, order);
if (ret)
- return ret;
+ goto out;
}
- if (!mempool_initialized(&c->lz4_workspace_pool) &&
- bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4)) {
- ret = mempool_init_kmalloc_pool(&c->lz4_workspace_pool,
- 1, LZ4_MEM_COMPRESS);
- if (ret)
- return ret;
- }
+ for (i = compression_types;
+ i < compression_types + ARRAY_SIZE(compression_types);
+ i++) {
+ decompress_workspace_size =
+ max(decompress_workspace_size, i->decompress_workspace);
- if (!c->zlib_workspace &&
- bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) {
- c->zlib_workspace = vmalloc(COMPRESSION_WORKSPACE_SIZE);
- if (!c->zlib_workspace)
- return -ENOMEM;
+ if (!bch2_sb_test_feature(c->disk_sb, i->feature))
+ continue;
+
+ if (i->decompress_workspace)
+ decompress_workspace_needed = true;
+
+ ret = mempool_init_kvpmalloc_pool(
+ &c->compress_workspace[i->type],
+ 1, i->compress_workspace);
+ if (ret)
+ goto out;
}
- return 0;
+ ret = mempool_init_kmalloc_pool(
+ &c->decompress_workspace,
+ 1, decompress_workspace_size);
+ if (ret)
+ goto out;
+out:
+ pr_verbose_init(c->opts, "ret %i", ret);
+ return ret;
}
goto err;
}
- if (!bch2_sb_has_replicas(c, BCH_DATA_BTREE, bch2_extent_devs(e))) {
+ if (!bch2_bkey_replicas_marked(c, BCH_DATA_BTREE, e.s_c)) {
bch2_bkey_val_to_text(c, btree_node_type(b),
buf, sizeof(buf), k);
bch2_fs_bug(c,
}
if (!bkey_extent_is_cached(e.k) &&
- !bch2_sb_has_replicas(c, BCH_DATA_USER, bch2_extent_devs(e))) {
+ !bch2_bkey_replicas_marked(c, BCH_DATA_USER, e.s_c)) {
bch2_bkey_val_to_text(c, btree_node_type(b),
buf, sizeof(buf), e.s_c);
bch2_fs_bug(c,
}
void bch2_extent_mark_replicas_cached(struct bch_fs *c,
- struct bkey_s_extent e)
+ struct bkey_s_extent e,
+ unsigned nr_desired_replicas)
{
struct bch_extent_ptr *ptr;
unsigned tier = 0, nr_cached = 0;
unsigned nr_good = bch2_extent_nr_good_ptrs(c, e.c);
bool have_higher_tier;
- if (nr_good <= c->opts.data_replicas)
+ if (nr_good <= nr_desired_replicas)
return;
- nr_cached = nr_good - c->opts.data_replicas;
+ nr_cached = nr_good - nr_desired_replicas;
do {
have_higher_tier = false;
struct btree_insert_entry *);
bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
-void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent);
+void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent,
+ unsigned);
const struct bch_extent_ptr *
bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
return ret;
}
+static inline struct bch_devs_list bch2_extent_cached_devs(struct bkey_s_c_extent e)
+{
+ struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+ const struct bch_extent_ptr *ptr;
+
+ extent_for_each_ptr(e, ptr)
+ if (ptr->cached)
+ ret.devs[ret.nr++] = ptr->dev;
+
+ return ret;
+}
+
static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
{
switch (k.k->type) {
}
}
+static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ return bch2_extent_dirty_devs(bkey_s_c_to_extent(k));
+ default:
+ return (struct bch_devs_list) { .nr = 0 };
+ }
+}
+
+static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
+{
+ switch (k.k->type) {
+ case BCH_EXTENT:
+ case BCH_EXTENT_CACHED:
+ return bch2_extent_cached_devs(bkey_s_c_to_extent(k));
+ default:
+ return (struct bch_devs_list) { .nr = 0 };
+ }
+}
+
bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent,
struct bch_extent_crc_unpacked);
bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked);
ret = bch2_btree_insert_at(wop->c, &wop->res,
&hook.hook, op_journal_seq(wop),
- BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_USE_RESERVE,
BTREE_INSERT_ENTRY(&extent_iter, k),
BTREE_INSERT_ENTRY_EXTRA_RES(&inode_iter,
&hook.inode_p.inode.k_i, 2));
} else {
ret = bch2_btree_insert_at(wop->c, &wop->res,
&hook.hook, op_journal_seq(wop),
- BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC,
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_ATOMIC|
+ BTREE_INSERT_USE_RESERVE,
BTREE_INSERT_ENTRY(&extent_iter, k));
}
bch2_write_op_init(&op->op, c);
op->op.csum_type = bch2_data_checksum_type(c, opts.data_checksum);
- op->op.compression_type = bch2_compression_opt_to_type(opts.compression);
+ op->op.compression_type = bch2_compression_opt_to_type[opts.compression];
op->op.devs = c->fastest_devs;
op->op.index_update_fn = bchfs_write_index_update;
op_journal_seq_set(&op->op, &inode->ei_journal_seq);
int bch2_fs_fsio_init(struct bch_fs *c)
{
+ int ret = 0;
+
+ pr_verbose_init(c->opts, "");
+
if (bioset_init(&c->writepage_bioset,
4, offsetof(struct bch_writepage_io, op.op.wbio.bio),
BIOSET_NEED_BVECS) ||
bioset_init(&c->dio_write_bioset,
4, offsetof(struct dio_write, iop.op.wbio.bio),
BIOSET_NEED_BVECS))
- return -ENOMEM;
+ ret = -ENOMEM;
- return 0;
+ pr_verbose_init(c->opts, "ret %i", ret);
+ return ret;
}
#endif /* NO_BCACHEFS_FS */
closure_return(cl);
}
-static u64 keylist_sectors(struct keylist *keys)
-{
- struct bkey_i *k;
- u64 ret = 0;
-
- for_each_keylist_key(keys, k)
- ret += k->k.size;
-
- return ret;
-}
-
int bch2_write_index_default(struct bch_write_op *op)
{
struct keylist *keys = &op->insert_keys;
ret = bch2_btree_insert_list_at(&iter, keys, &op->res,
NULL, op_journal_seq(op),
- BTREE_INSERT_NOFAIL);
+ BTREE_INSERT_NOFAIL|
+ BTREE_INSERT_USE_RESERVE);
bch2_btree_iter_unlock(&iter);
return ret;
}
if (!(op->flags & BCH_WRITE_NOMARK_REPLICAS)) {
- ret = bch2_check_mark_super(c, BCH_DATA_USER,
- bch2_extent_devs(e.c));
+ ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, e.s_c);
if (ret)
goto err;
}
swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
rbio->promote = NULL;
- bch2_write_op_init(&op->write.op, c);
- op->write.op.csum_type = bch2_data_checksum_type(c, rbio->opts.data_checksum);
- op->write.op.compression_type =
- bch2_compression_opt_to_type(rbio->opts.compression);
-
- op->write.move_dev = -1;
- op->write.op.devs = c->fastest_devs;
- op->write.op.write_point = writepoint_hashed((unsigned long) current);
- op->write.op.flags |= BCH_WRITE_ALLOC_NOWAIT;
- op->write.op.flags |= BCH_WRITE_CACHED;
-
- bch2_migrate_write_init(&op->write, rbio);
+ bch2_migrate_read_done(&op->write, rbio);
closure_init(cl, NULL);
closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
* XXX: multiple promotes can race with each other, wastefully. Keep a list of
* outstanding promotes?
*/
-static struct promote_op *promote_alloc(struct bch_read_bio *rbio)
+static struct promote_op *promote_alloc(struct bch_read_bio *rbio,
+ struct bkey_s_c k)
{
+ struct bch_fs *c = rbio->c;
struct promote_op *op;
struct bio *bio;
/* data might have to be decompressed in the write path: */
unsigned pages = DIV_ROUND_UP(rbio->pick.crc.uncompressed_size,
PAGE_SECTORS);
+ int ret;
BUG_ON(!rbio->bounce);
BUG_ON(pages < rbio->bio.bi_vcnt);
memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
+ ret = bch2_migrate_write_init(c, &op->write, c->fastest_devs,
+ writepoint_hashed((unsigned long) current),
+ rbio->opts,
+ DATA_PROMOTE,
+ (struct data_opts) { 0 },
+ k);
+ BUG_ON(ret);
+
return op;
}
rbio->pick = *pick;
rbio->pos = pos;
rbio->version = e.k->version;
- rbio->promote = promote ? promote_alloc(rbio) : NULL;
+ rbio->promote = promote ? promote_alloc(rbio, e.s_c) : NULL;
INIT_WORK(&rbio->work, NULL);
bio_set_dev(&rbio->bio, pick->ca->disk_sb.bdev);
op->error = 0;
op->csum_type = bch2_data_checksum_type(c, c->opts.data_checksum);
op->compression_type =
- bch2_compression_opt_to_type(c->opts.compression);
+ bch2_compression_opt_to_type[c->opts.compression];
op->nr_replicas = 0;
op->nr_replicas_required = c->opts.data_replicas_required;
op->alloc_reserve = RESERVE_NONE;
if (!degraded &&
(test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
- fsck_err_on(!bch2_sb_has_replicas(c, BCH_DATA_JOURNAL,
+ fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL,
i->devs), c,
"superblock not marked as containing replicas (type %u)",
BCH_DATA_JOURNAL))) {
- ret = bch2_check_mark_super(c, BCH_DATA_JOURNAL,
- i->devs);
+ ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs);
if (ret)
return ret;
}
goto err;
}
- if (bch2_check_mark_super(c, BCH_DATA_JOURNAL, devs))
+ if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs))
goto err;
out:
__bch2_time_stats_update(j->write_time, j->write_start_time);
seq++;
spin_unlock(&j->lock);
- ret = bch2_check_mark_super(c, BCH_DATA_JOURNAL, devs);
+ ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
spin_lock(&j->lock);
}
spin_unlock(&j->lock);
int bch2_fs_journal_init(struct journal *j)
{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
static struct lock_class_key res_key;
+ int ret = 0;
+
+ pr_verbose_init(c->opts, "");
spin_lock_init(&j->lock);
spin_lock_init(&j->err_lock);
if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
!(j->buf[0].data = kvpmalloc(j->buf[0].size, GFP_KERNEL)) ||
- !(j->buf[1].data = kvpmalloc(j->buf[1].size, GFP_KERNEL)))
- return -ENOMEM;
+ !(j->buf[1].data = kvpmalloc(j->buf[1].size, GFP_KERNEL))) {
+ ret = -ENOMEM;
+ goto out;
+ }
j->pin.front = j->pin.back = 1;
-
- return 0;
+out:
+ pr_verbose_init(c->opts, "ret %i", ret);
+ return ret;
}
/* debug: */
#define keylist_single(k) \
((struct keylist) { .keys = k, .top = bkey_next(k) })
+static inline u64 keylist_sectors(struct keylist *keys)
+{
+ struct bkey_i *k;
+ u64 ret = 0;
+
+ for_each_keylist_key(keys, k)
+ ret += k->k.size;
+
+ return ret;
+}
+
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_verify_keylist_sorted(struct keylist *);
#else
#include "move.h"
#include "super-io.h"
-static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
-{
- struct bch_dev *ca = arg;
-
- return bch2_extent_has_device(e, ca->dev_idx);
-}
-
-#define MAX_DATA_OFF_ITER 10
-
-static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca,
- int flags)
-{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_move_stats stats;
- unsigned pass = 0;
- int ret = 0;
-
- if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_USER)))
- return 0;
-
- /*
- * XXX: we should be able to do this in one pass, but bch2_move_data()
- * can spuriously fail to move an extent due to racing with other move
- * operations
- */
- do {
- memset(&stats, 0, sizeof(stats));
-
- ret = bch2_move_data(c, NULL,
- SECTORS_IN_FLIGHT_PER_DEVICE,
- NULL,
- writepoint_hashed((unsigned long) current),
- 0,
- ca->dev_idx,
- POS_MIN, POS_MAX,
- migrate_pred, ca,
- &stats);
- if (ret) {
- bch_err(c, "error migrating data: %i", ret);
- return ret;
- }
- } while (atomic64_read(&stats.keys_moved) && pass++ < MAX_DATA_OFF_ITER);
-
- if (atomic64_read(&stats.keys_moved)) {
- bch_err(c, "unable to migrate all data in %d iterations",
- MAX_DATA_OFF_ITER);
- return -1;
- }
-
- mutex_lock(&c->replicas_gc_lock);
- bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
-
- for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH, k) {
- ret = bch2_check_mark_super(c, BCH_DATA_USER, bch2_bkey_devs(k));
- if (ret) {
- bch_err(c, "error migrating data %i from check_mark_super()", ret);
- break;
- }
- }
-
- bch2_replicas_gc_end(c, ret);
- mutex_unlock(&c->replicas_gc_lock);
- return ret;
-}
-
-static int bch2_dev_metadata_migrate(struct bch_fs *c, struct bch_dev *ca,
- int flags)
-{
- struct btree_iter iter;
- struct btree *b;
- int ret = 0;
- unsigned id;
-
- if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_BTREE)))
- return 0;
-
- mutex_lock(&c->replicas_gc_lock);
- bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
-
- for (id = 0; id < BTREE_ID_NR; id++) {
- for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
- struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
-
- if (!bch2_extent_has_device(e, ca->dev_idx))
- continue;
-
- ret = bch2_btree_node_rewrite(c, &iter, b->data->keys.seq, 0);
- if (ret) {
- bch2_btree_iter_unlock(&iter);
- goto err;
- }
- }
- ret = bch2_btree_iter_unlock(&iter);
- if (ret)
- goto err;
- }
-err:
- bch2_replicas_gc_end(c, ret);
- mutex_unlock(&c->replicas_gc_lock);
- return ret;
-}
-
-int bch2_dev_data_migrate(struct bch_fs *c, struct bch_dev *ca, int flags)
-{
- BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW &&
- bch2_dev_is_online(ca));
-
- return bch2_dev_usrdata_migrate(c, ca, flags) ?:
- bch2_dev_metadata_migrate(c, ca, flags);
-}
-
static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
unsigned dev_idx, int flags, bool metadata)
{
int ret = 0;
mutex_lock(&c->replicas_gc_lock);
- bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
+ bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED));
bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
POS_MIN, BTREE_ITER_PREFETCH);
!(ret = btree_iter_err(k))) {
if (!bkey_extent_is_data(k.k) ||
!bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) {
- ret = bch2_check_mark_super(c, BCH_DATA_USER,
- bch2_bkey_devs(k));
+ ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k);
if (ret)
break;
bch2_btree_iter_next(&iter);
*/
bch2_extent_normalize(c, e.s);
- ret = bch2_check_mark_super(c, BCH_DATA_USER,
- bch2_bkey_devs(bkey_i_to_s_c(&tmp.key)));
+ ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER,
+ bkey_i_to_s_c(&tmp.key));
if (ret)
break;
dev_idx)) {
bch2_btree_iter_set_locks_want(&iter, 0);
- ret = bch2_check_mark_super(c, BCH_DATA_BTREE,
- bch2_bkey_devs(bkey_i_to_s_c(&b->key)));
+ ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
+ bkey_i_to_s_c(&b->key));
if (ret)
goto err;
} else {
#ifndef _BCACHEFS_MIGRATE_H
#define _BCACHEFS_MIGRATE_H
-int bch2_dev_data_migrate(struct bch_fs *, struct bch_dev *, int);
int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
#endif /* _BCACHEFS_MIGRATE_H */
BKEY_PADDED(k) _new, _insert;
struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
+ unsigned nr_dirty;
bool did_work = false;
if (btree_iter_err(k)) {
m->ptr, m->offset))
goto nomatch;
+ if (m->data_cmd == DATA_REWRITE &&
+ !bch2_extent_has_device(bkey_s_c_to_extent(k),
+ m->data_opts.rewrite_dev))
+ goto nomatch;
+
bkey_reassemble(&_insert.k, k);
insert = bkey_i_to_extent(&_insert.k);
bch2_cut_back(new->k.p, &insert->k);
bch2_cut_back(insert->k.p, &new->k);
- if (m->move_dev >= 0 &&
- (ptr = (struct bch_extent_ptr *)
- bch2_extent_has_device(extent_i_to_s_c(insert),
- m->move_dev)))
+ if (m->data_cmd == DATA_REWRITE) {
+ ptr = (struct bch_extent_ptr *)
+ bch2_extent_has_device(extent_i_to_s_c(insert),
+ m->data_opts.rewrite_dev);
bch2_extent_drop_ptr(extent_i_to_s(insert), ptr);
+ }
extent_for_each_ptr_crc(extent_i_to_s(new), ptr, crc) {
if (bch2_extent_has_device(extent_i_to_s_c(insert), ptr->dev)) {
bch2_extent_narrow_crcs(insert,
(struct bch_extent_crc_unpacked) { 0 });
bch2_extent_normalize(c, extent_i_to_s(insert).s);
- bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert));
+ bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert),
+ c->opts.data_replicas);
+
+ /*
+ * It's possible we race, and for whatever reason the extent now
+ * has fewer replicas than when we last looked at it - meaning
+ * we need to get a disk reservation here:
+ */
+ nr_dirty = bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i));
+ if (m->nr_ptrs_reserved < nr_dirty) {
+ unsigned sectors = (nr_dirty - m->nr_ptrs_reserved) *
+ keylist_sectors(keys);
+
+ /*
+ * can't call bch2_disk_reservation_add() with btree
+ * locks held, at least not without a song and dance
+ */
+ bch2_btree_iter_unlock(&iter);
+
+ ret = bch2_disk_reservation_add(c, &op->res, sectors, 0);
+ if (ret)
+ goto out;
+
+ m->nr_ptrs_reserved = nr_dirty;
+ goto next;
+ }
- ret = bch2_check_mark_super(c, BCH_DATA_USER,
- bch2_extent_devs(extent_i_to_s_c(insert)));
+ ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER,
+ extent_i_to_s_c(insert).s_c);
if (ret)
break;
NULL, op_journal_seq(op),
BTREE_INSERT_ATOMIC|
BTREE_INSERT_NOFAIL|
- m->btree_insert_flags,
+ BTREE_INSERT_USE_RESERVE|
+ m->data_opts.btree_insert_flags,
BTREE_INSERT_ENTRY(&iter, &insert->k_i));
if (!ret)
atomic_long_inc(&c->extent_migrate_done);
return ret;
}
-void bch2_migrate_write_init(struct migrate_write *m,
- struct bch_read_bio *rbio)
+void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio)
{
/* write bio must own pages: */
BUG_ON(!m->op.wbio.bio.bi_vcnt);
m->op.pos = rbio->pos;
m->op.version = rbio->version;
m->op.crc = rbio->pick.crc;
+ m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) {
m->op.nonce = m->op.crc.nonce + m->op.crc.offset;
m->op.csum_type = m->op.crc.csum_type;
}
- if (m->move_dev >= 0)
- bch2_dev_list_drop_dev(&m->op.devs_have, m->move_dev);
+ if (m->data_cmd == DATA_REWRITE)
+ bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev);
+}
+
+int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
+ struct bch_devs_mask *devs,
+ struct write_point_specifier wp,
+ struct bch_io_opts io_opts,
+ enum data_cmd data_cmd,
+ struct data_opts data_opts,
+ struct bkey_s_c k)
+{
+ int ret;
- if (m->btree_insert_flags & BTREE_INSERT_USE_RESERVE)
+ m->data_cmd = data_cmd;
+ m->data_opts = data_opts;
+ m->nr_ptrs_reserved = bch2_extent_nr_dirty_ptrs(k);
+
+ bch2_write_op_init(&m->op, c);
+ m->op.csum_type = bch2_data_checksum_type(c, io_opts.data_checksum);
+ m->op.compression_type =
+ bch2_compression_opt_to_type[io_opts.compression];
+ m->op.devs = devs;
+ m->op.write_point = wp;
+
+ if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE)
m->op.alloc_reserve = RESERVE_MOVINGGC;
m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS|
BCH_WRITE_DATA_ENCODED|
BCH_WRITE_NOMARK_REPLICAS;
- m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
m->op.nr_replicas = 1;
m->op.nr_replicas_required = 1;
m->op.index_update_fn = bch2_migrate_index_update;
+
+ switch (data_cmd) {
+ case DATA_ADD_REPLICAS:
+ if (m->nr_ptrs_reserved < c->opts.data_replicas) {
+ m->op.nr_replicas = c->opts.data_replicas - m->nr_ptrs_reserved;
+
+ ret = bch2_disk_reservation_get(c, &m->op.res,
+ k.k->size,
+ m->op.nr_replicas, 0);
+ if (ret)
+ return ret;
+
+ m->nr_ptrs_reserved = c->opts.data_replicas;
+ }
+ break;
+ case DATA_REWRITE:
+ break;
+ case DATA_PROMOTE:
+ m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
+ m->op.flags |= BCH_WRITE_CACHED;
+ break;
+ default:
+ BUG();
+ }
+
+ return 0;
}
static void move_free(struct closure *cl)
struct moving_io *io = container_of(cl, struct moving_io, cl);
if (likely(!io->rbio.bio.bi_status)) {
- bch2_migrate_write_init(&io->write, &io->rbio);
+ bch2_migrate_read_done(&io->write, &io->rbio);
closure_call(&io->write.op.cl, bch2_write, NULL, cl);
}
}
static int bch2_move_extent(struct bch_fs *c,
- struct moving_context *ctxt,
- struct bch_devs_mask *devs,
- struct write_point_specifier wp,
- int btree_insert_flags,
- int move_device,
- struct bch_io_opts opts,
- struct bkey_s_c_extent e)
+ struct moving_context *ctxt,
+ struct bch_devs_mask *devs,
+ struct write_point_specifier wp,
+ struct bch_io_opts io_opts,
+ struct bkey_s_c_extent e,
+ enum data_cmd data_cmd,
+ struct data_opts data_opts)
{
struct extent_pick_ptr pick;
struct moving_io *io;
const struct bch_extent_ptr *ptr;
struct bch_extent_crc_unpacked crc;
- unsigned sectors = e.k->size, pages, nr_good;
+ unsigned sectors = e.k->size, pages;
int ret = -ENOMEM;
bch2_extent_pick_ptr(c, e.s_c, NULL, &pick);
if (bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL))
goto err_free;
- io->rbio.opts = opts;
+ io->rbio.opts = io_opts;
bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
io->rbio.bio.bi_iter.bi_size = sectors << 9;
io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(e.k);
io->rbio.bio.bi_end_io = move_read_endio;
- io->write.btree_insert_flags = btree_insert_flags;
- io->write.move_dev = move_device;
-
- bch2_write_op_init(&io->write.op, c);
- io->write.op.csum_type = bch2_data_checksum_type(c, opts.data_checksum);
- io->write.op.compression_type =
- bch2_compression_opt_to_type(opts.compression);
- io->write.op.devs = devs;
- io->write.op.write_point = wp;
-
- if (move_device < 0 &&
- ((nr_good = bch2_extent_nr_good_ptrs(c, e)) <
- c->opts.data_replicas)) {
- io->write.op.nr_replicas = c->opts.data_replicas - nr_good;
-
- ret = bch2_disk_reservation_get(c, &io->write.op.res,
- e.k->size,
- io->write.op.nr_replicas, 0);
- if (ret)
- goto err_free_pages;
- }
+ ret = bch2_migrate_write_init(c, &io->write, devs, wp,
+ io_opts, data_cmd, data_opts, e.s_c);
+ if (ret)
+ goto err_free_pages;
atomic64_inc(&ctxt->stats->keys_moved);
atomic64_add(e.k->size, &ctxt->stats->sectors_moved);
unsigned sectors_in_flight,
struct bch_devs_mask *devs,
struct write_point_specifier wp,
- int btree_insert_flags,
- int move_device,
struct bpos start,
struct bpos end,
move_pred_fn pred, void *arg,
{
bool kthread = (current->flags & PF_KTHREAD) != 0;
struct moving_context ctxt = { .stats = stats };
- struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
+ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
BKEY_PADDED(k) tmp;
struct bkey_s_c k;
struct bkey_s_c_extent e;
+ struct data_opts data_opts;
+ enum data_cmd data_cmd;
u64 cur_inum = U64_MAX;
- int ret = 0;
+ int ret = 0, ret2;
closure_init_stack(&ctxt.cl);
INIT_LIST_HEAD(&ctxt.reads);
/* don't hold btree locks while looking up inode: */
bch2_btree_iter_unlock(&stats->iter);
- opts = bch2_opts_to_inode_opts(c->opts);
+ io_opts = bch2_opts_to_inode_opts(c->opts);
if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode))
- bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode));
+ bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode));
cur_inum = k.k->p.inode;
goto peek;
}
- if (!pred(arg, e))
+ switch ((data_cmd = pred(c, arg, BKEY_TYPE_EXTENTS, e,
+ &io_opts, &data_opts))) {
+ case DATA_SKIP:
goto next;
+ case DATA_SCRUB:
+ BUG();
+ case DATA_ADD_REPLICAS:
+ case DATA_REWRITE:
+ case DATA_PROMOTE:
+ break;
+ default:
+ BUG();
+ }
/* unlock before doing IO: */
bkey_reassemble(&tmp.k, k);
k = bkey_i_to_s_c(&tmp.k);
bch2_btree_iter_unlock(&stats->iter);
- if (bch2_move_extent(c, &ctxt, devs, wp,
- btree_insert_flags,
- move_device, opts,
- bkey_s_c_to_extent(k))) {
- /* memory allocation failure, wait for some IO to finish */
- bch2_move_ctxt_wait_for_io(&ctxt);
- continue;
+ ret2 = bch2_move_extent(c, &ctxt, devs, wp, io_opts,
+ bkey_s_c_to_extent(k),
+ data_cmd, data_opts);
+ if (ret2) {
+ if (ret2 == -ENOMEM) {
+ /* memory allocation failure, wait for some IO to finish */
+ bch2_move_ctxt_wait_for_io(&ctxt);
+ continue;
+ }
+
+ /* XXX signal failure */
+ goto next;
}
if (rate)
int ret;
mutex_lock(&c->replicas_gc_lock);
- bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
+ bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED));
for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
BTREE_ITER_PREFETCH, k) {
- ret = bch2_check_mark_super(c, BCH_DATA_USER, bch2_bkey_devs(k));
+ ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k);
if (ret)
break;
}
for (id = 0; id < BTREE_ID_NR; id++) {
for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
- ret = bch2_check_mark_super(c, BCH_DATA_BTREE,
- bch2_bkey_devs(bkey_i_to_s_c(&b->key)));
+ ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
+ bkey_i_to_s_c(&b->key));
bch2_btree_iter_cond_resched(&iter);
}
void *arg,
struct bch_move_stats *stats)
{
+ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
struct btree *b;
unsigned id;
+ struct data_opts data_opts;
+ enum data_cmd cmd;
int ret = 0;
stats->data_type = BCH_DATA_BTREE;
for (id = 0; id < BTREE_ID_NR; id++) {
for_each_btree_node(&stats->iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
- if (pred(arg, bkey_i_to_s_c_extent(&b->key)))
- ret = bch2_btree_node_rewrite(c, &stats->iter,
- b->data->keys.seq, 0) ?: ret;
+ switch ((cmd = pred(c, arg, BKEY_TYPE_BTREE,
+ bkey_i_to_s_c_extent(&b->key),
+ &io_opts,
+ &data_opts))) {
+ case DATA_SKIP:
+ goto next;
+ case DATA_SCRUB:
+ BUG();
+ case DATA_ADD_REPLICAS:
+ case DATA_REWRITE:
+ break;
+ default:
+ BUG();
+ }
+ ret = bch2_btree_node_rewrite(c, &stats->iter,
+ b->data->keys.seq, 0) ?: ret;
+next:
bch2_btree_iter_cond_resched(&stats->iter);
}
}
#if 0
-static bool scrub_data_pred(void *arg, struct bkey_s_c_extent e)
+static enum data_cmd scrub_pred(struct bch_fs *c, void *arg,
+ enum bkey_type type,
+ struct bkey_s_c_extent e,
+ struct bch_io_opts *io_opts,
+ struct data_opts *data_opts)
{
+ return DATA_SCRUB;
}
#endif
-static bool rereplicate_metadata_pred(void *arg, struct bkey_s_c_extent e)
+static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
+ enum bkey_type type,
+ struct bkey_s_c_extent e,
+ struct bch_io_opts *io_opts,
+ struct data_opts *data_opts)
{
- struct bch_fs *c = arg;
unsigned nr_good = bch2_extent_nr_good_ptrs(c, e);
+ unsigned replicas = type == BKEY_TYPE_BTREE
+ ? c->opts.metadata_replicas
+ : c->opts.data_replicas;
- return nr_good && nr_good < c->opts.metadata_replicas;
-}
+ if (!nr_good || nr_good >= replicas)
+ return DATA_SKIP;
-static bool rereplicate_data_pred(void *arg, struct bkey_s_c_extent e)
-{
- struct bch_fs *c = arg;
- unsigned nr_good = bch2_extent_nr_good_ptrs(c, e);
-
- return nr_good && nr_good < c->opts.data_replicas;
+ data_opts->btree_insert_flags = 0;
+ return DATA_ADD_REPLICAS;
}
-static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
+static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
+ enum bkey_type type,
+ struct bkey_s_c_extent e,
+ struct bch_io_opts *io_opts,
+ struct data_opts *data_opts)
{
struct bch_ioctl_data *op = arg;
- return bch2_extent_has_device(e, op->migrate.dev);
+ if (!bch2_extent_has_device(e, op->migrate.dev))
+ return DATA_SKIP;
+
+ data_opts->btree_insert_flags = 0;
+ data_opts->rewrite_dev = op->migrate.dev;
+ return DATA_REWRITE;
}
int bch2_data_job(struct bch_fs *c,
stats->data_type = BCH_DATA_JOURNAL;
ret = bch2_journal_flush_device(&c->journal, -1);
- ret = bch2_move_btree(c, rereplicate_metadata_pred, c, stats) ?: ret;
+ ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
ret = bch2_gc_btree_replicas(c) ?: ret;
ret = bch2_move_data(c, NULL, SECTORS_IN_FLIGHT_PER_DEVICE,
NULL,
writepoint_hashed((unsigned long) current),
- 0, -1,
op.start,
op.end,
- rereplicate_data_pred, c, stats) ?: ret;
+ rereplicate_pred, c, stats) ?: ret;
ret = bch2_gc_data_replicas(c) ?: ret;
break;
case BCH_DATA_OP_MIGRATE:
ret = bch2_move_data(c, NULL, SECTORS_IN_FLIGHT_PER_DEVICE,
NULL,
writepoint_hashed((unsigned long) current),
- 0, -1,
op.start,
op.end,
migrate_pred, &op, stats) ?: ret;
struct bch_read_bio;
struct moving_context;
+enum data_cmd {
+ DATA_SKIP,
+ DATA_SCRUB,
+ DATA_ADD_REPLICAS,
+ DATA_REWRITE,
+ DATA_PROMOTE,
+};
+
+struct data_opts {
+ unsigned rewrite_dev;
+ int btree_insert_flags;
+};
+
struct migrate_write {
+ enum data_cmd data_cmd;
+ struct data_opts data_opts;
+
+ unsigned nr_ptrs_reserved;
+
struct moving_context *ctxt;
/* what we read: */
struct bch_extent_ptr ptr;
u64 offset;
- int move_dev;
- int btree_insert_flags;
struct bch_write_op op;
};
-void bch2_migrate_write_init(struct migrate_write *, struct bch_read_bio *);
+void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *);
+int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
+ struct bch_devs_mask *,
+ struct write_point_specifier,
+ struct bch_io_opts,
+ enum data_cmd, struct data_opts,
+ struct bkey_s_c);
#define SECTORS_IN_FLIGHT_PER_DEVICE 2048
-typedef bool (*move_pred_fn)(void *, struct bkey_s_c_extent);
+typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
+ enum bkey_type, struct bkey_s_c_extent,
+ struct bch_io_opts *, struct data_opts *);
struct bch_move_stats {
enum bch_data_type data_type;
int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
unsigned, struct bch_devs_mask *,
struct write_point_specifier,
- int, int, struct bpos, struct bpos,
+ struct bpos, struct bpos,
move_pred_fn, void *,
struct bch_move_stats *);
return (l->offset > r->offset) - (l->offset < r->offset);
}
-static bool copygc_pred(void *arg, struct bkey_s_c_extent e)
+static bool __copygc_pred(struct bch_dev *ca,
+ struct bkey_s_c_extent e)
{
- struct bch_dev *ca = arg;
copygc_heap *h = &ca->copygc_heap;
const struct bch_extent_ptr *ptr =
bch2_extent_has_device(e, ca->dev_idx);
return false;
}
+static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
+ enum bkey_type type,
+ struct bkey_s_c_extent e,
+ struct bch_io_opts *io_opts,
+ struct data_opts *data_opts)
+{
+ struct bch_dev *ca = arg;
+
+ if (!__copygc_pred(ca, e))
+ return DATA_SKIP;
+
+	data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE;
+ data_opts->rewrite_dev = ca->dev_idx;
+ return DATA_REWRITE;
+}
+
static bool have_copygc_reserve(struct bch_dev *ca)
{
bool ret;
SECTORS_IN_FLIGHT_PER_DEVICE,
&ca->self,
writepoint_ptr(&ca->copygc_write_point),
- BTREE_INSERT_USE_RESERVE,
- ca->dev_idx,
POS_MIN, POS_MAX,
copygc_pred, ca,
&move_stats);
"none",
"lz4",
"gzip",
+ "zstd",
NULL
};
BCH_OPT(errors, u8, OPT_RUNTIME, \
OPT_STR(bch2_error_actions), \
BCH_SB_ERROR_ACTION, BCH_ON_ERROR_RO) \
- BCH_OPT(metadata_replicas, u8, OPT_MOUNT, \
+ BCH_OPT(metadata_replicas, u8, OPT_RUNTIME, \
OPT_UINT(1, BCH_REPLICAS_MAX), \
BCH_SB_META_REPLICAS_WANT, 1) \
- BCH_OPT(data_replicas, u8, OPT_MOUNT, \
+ BCH_OPT(data_replicas, u8, OPT_RUNTIME, \
OPT_UINT(1, BCH_REPLICAS_MAX), \
BCH_SB_DATA_REPLICAS_WANT, 1) \
BCH_OPT(metadata_replicas_required, u8, OPT_MOUNT, \
BCH_OPT(verbose_recovery, u8, OPT_MOUNT, \
OPT_BOOL(), \
NO_SB_OPT, false) \
+ BCH_OPT(verbose_init, u8, OPT_MOUNT, \
+ OPT_BOOL(), \
+ NO_SB_OPT, false) \
BCH_OPT(journal_flush_disabled, u8, OPT_RUNTIME, \
OPT_BOOL(), \
NO_SB_OPT, false) \
_i < QTYP_NR); \
_i++)
-static inline unsigned enabled_qtypes(struct bch_fs *c)
-{
- return ((c->opts.usrquota << QTYP_USR)|
- (c->opts.grpquota << QTYP_GRP)|
- (c->opts.prjquota << QTYP_PRJ));
-}
-
static bool ignore_hardlimit(struct bch_memquota_type *q)
{
if (capable(CAP_SYS_RESOURCE))
if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota)
return -EINVAL;
- if (uflags & FS_QUOTA_PDQ_ENFD)
+ if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota)
return -EINVAL;
mutex_lock(&c->sb_lock);
if (uflags & FS_QUOTA_GDQ_ENFD)
SET_BCH_SB_GRPQUOTA(c->disk_sb, true);
-#if 0
+
if (uflags & FS_QUOTA_PDQ_ENFD)
SET_BCH_SB_PRJQUOTA(c->disk_sb, true);
-#endif
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
};
}
+static inline unsigned enabled_qtypes(struct bch_fs *c)
+{
+ return ((c->opts.usrquota << QTYP_USR)|
+ (c->opts.grpquota << QTYP_GRP)|
+ (c->opts.prjquota << QTYP_PRJ));
+}
+
#ifdef CONFIG_BCACHEFS_QUOTA
int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters,
* https://131002.net/siphash/
*/
-#include <linux/compiler.h>
#include <asm/byteorder.h>
#include <asm/unaligned.h>
#include <linux/bitops.h>
__le64 *i;
int ret;
+ pr_verbose_init(*opts, "");
+
memset(sb, 0, sizeof(*sb));
sb->mode = FMODE_READ;
opt_set(*opts, nochanges, true);
}
- if (IS_ERR(sb->bdev))
- return PTR_ERR(sb->bdev);
+ if (IS_ERR(sb->bdev)) {
+ ret = PTR_ERR(sb->bdev);
+ goto out;
+ }
err = "cannot allocate memory";
ret = __bch2_super_realloc(sb, 0);
if (sb->mode & FMODE_WRITE)
bdev_get_queue(sb->bdev)->backing_dev_info->capabilities
|= BDI_CAP_STABLE_WRITES;
-
- return 0;
+ ret = 0;
+out:
+ pr_verbose_init(*opts, "ret %i", ret);
+ return ret;
err:
bch2_free_super(sb);
pr_err("error reading superblock: %s", err);
- return ret;
+ goto out;
}
/* write superblock: */
nr_wrote = dev_mask_nr(&sb_written);
can_mount_with_written =
- bch2_have_enough_devs(c,
- __bch2_replicas_status(c, sb_written),
- BCH_FORCE_IF_DEGRADED);
+ bch2_have_enough_devs(__bch2_replicas_status(c, sb_written),
+ BCH_FORCE_IF_DEGRADED);
for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
sb_written.d[i] = ~sb_written.d[i];
can_mount_without_written =
- bch2_have_enough_devs(c,
- __bch2_replicas_status(c, sb_written),
- BCH_FORCE_IF_DEGRADED);
+ bch2_have_enough_devs(__bch2_replicas_status(c, sb_written),
+ BCH_FORCE_IF_DEGRADED);
/*
* If we would be able to mount _without_ the devices we successfully
}
noinline
-static int bch2_check_mark_super_slowpath(struct bch_fs *c,
+static int bch2_mark_replicas_slowpath(struct bch_fs *c,
struct bch_replicas_cpu_entry new_entry,
unsigned max_dev)
{
return ret;
}
-int bch2_check_mark_super(struct bch_fs *c,
- enum bch_data_type data_type,
- struct bch_devs_list devs)
+int bch2_mark_replicas(struct bch_fs *c,
+ enum bch_data_type data_type,
+ struct bch_devs_list devs)
{
struct bch_replicas_cpu_entry search;
struct bch_replicas_cpu *r, *gc_r;
if (!devs.nr)
return 0;
+ BUG_ON(devs.nr >= BCH_REPLICAS_MAX);
+
devlist_to_replicas(devs, data_type, &search, &max_dev);
rcu_read_lock();
rcu_read_unlock();
return likely(marked) ? 0
- : bch2_check_mark_super_slowpath(c, search, max_dev);
+ : bch2_mark_replicas_slowpath(c, search, max_dev);
+}
+
+int bch2_mark_bkey_replicas(struct bch_fs *c,
+ enum bch_data_type data_type,
+ struct bkey_s_c k)
+{
+ struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+ unsigned i;
+ int ret;
+
+ for (i = 0; i < cached.nr; i++)
+ if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
+ bch2_dev_list_single(cached.devs[i]))))
+ return ret;
+
+ return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k));
}
int bch2_replicas_gc_end(struct bch_fs *c, int err)
/* Query replicas: */
-bool bch2_sb_has_replicas(struct bch_fs *c,
+bool bch2_replicas_marked(struct bch_fs *c,
enum bch_data_type data_type,
struct bch_devs_list devs)
{
return ret;
}
+bool bch2_bkey_replicas_marked(struct bch_fs *c,
+ enum bch_data_type data_type,
+ struct bkey_s_c k)
+{
+ struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+ unsigned i;
+
+ for (i = 0; i < cached.nr; i++)
+ if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
+ bch2_dev_list_single(cached.devs[i])))
+ return false;
+
+ return bch2_replicas_marked(c, data_type, bch2_bkey_dirty_devs(k));
+}
+
struct replicas_status __bch2_replicas_status(struct bch_fs *c,
struct bch_devs_mask online_devs)
{
return __bch2_replicas_status(c, bch2_online_devs(c));
}
-bool bch2_have_enough_devs(struct bch_fs *c,
- struct replicas_status s,
- unsigned flags)
+static bool have_enough_devs(struct replicas_status s,
+ enum bch_data_type type,
+ bool force_if_degraded,
+ bool force_if_lost)
{
- if ((s.replicas[BCH_DATA_JOURNAL].nr_offline ||
- s.replicas[BCH_DATA_BTREE].nr_offline) &&
- !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
- return false;
-
- if ((!s.replicas[BCH_DATA_JOURNAL].nr_online ||
- !s.replicas[BCH_DATA_BTREE].nr_online) &&
- !(flags & BCH_FORCE_IF_METADATA_LOST))
- return false;
-
- if (s.replicas[BCH_DATA_USER].nr_offline &&
- !(flags & BCH_FORCE_IF_DATA_DEGRADED))
- return false;
-
- if (!s.replicas[BCH_DATA_USER].nr_online &&
- !(flags & BCH_FORCE_IF_DATA_LOST))
- return false;
+ return (!s.replicas[type].nr_offline || force_if_degraded) &&
+ (s.replicas[type].nr_online || force_if_lost);
+}
- return true;
+bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
+{
+ return (have_enough_devs(s, BCH_DATA_JOURNAL,
+ flags & BCH_FORCE_IF_METADATA_DEGRADED,
+ flags & BCH_FORCE_IF_METADATA_LOST) &&
+ have_enough_devs(s, BCH_DATA_BTREE,
+ flags & BCH_FORCE_IF_METADATA_DEGRADED,
+ flags & BCH_FORCE_IF_METADATA_LOST) &&
+ have_enough_devs(s, BCH_DATA_USER,
+ flags & BCH_FORCE_IF_DATA_DEGRADED,
+ flags & BCH_FORCE_IF_DATA_LOST));
}
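
With the struct bch_fs argument dropped, callers only need a replicas_status and the force flags. A sketch of a degraded-mount check under the new signature (not from the patch; the helper name is made up):

static bool can_mount_degraded(struct bch_fs *c)
{
	struct replicas_status s = bch2_replicas_status(c);

	/* tolerate offline replicas, but not data with no online copy */
	return bch2_have_enough_devs(s, BCH_FORCE_IF_DEGRADED);
}
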
unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
/* BCH_SB_FIELD_replicas: */
-bool bch2_sb_has_replicas(struct bch_fs *, enum bch_data_type,
- struct bch_devs_list);
-int bch2_check_mark_super(struct bch_fs *, enum bch_data_type,
+bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type,
struct bch_devs_list);
+bool bch2_bkey_replicas_marked(struct bch_fs *, enum bch_data_type,
+ struct bkey_s_c);
+int bch2_mark_replicas(struct bch_fs *, enum bch_data_type,
+ struct bch_devs_list);
+int bch2_mark_bkey_replicas(struct bch_fs *, enum bch_data_type,
+ struct bkey_s_c);
int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *, char *, size_t);
int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *, char *, size_t);
struct replicas_status __bch2_replicas_status(struct bch_fs *,
struct bch_devs_mask);
struct replicas_status bch2_replicas_status(struct bch_fs *);
-bool bch2_have_enough_devs(struct bch_fs *, struct replicas_status, unsigned);
+bool bch2_have_enough_devs(struct replicas_status, unsigned);
unsigned bch2_replicas_online(struct bch_fs *, bool);
unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
struct bch_fs *c;
unsigned i, iter_size;
+ pr_verbose_init(opts, "");
+
c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
if (!c)
- return NULL;
+ goto out;
__module_get(THIS_MODULE);
mutex_init(&c->btree_interior_update_lock);
mutex_init(&c->bio_bounce_pages_lock);
- mutex_init(&c->zlib_workspace_lock);
bio_list_init(&c->btree_write_error_list);
spin_lock_init(&c->btree_write_error_lock);
kobject_init(&c->internal, &bch2_fs_internal_ktype);
kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
+out:
+ pr_verbose_init(opts, "ret %i", c ? 0 : -ENOMEM);
return c;
err:
bch2_fs_free(c);
- return NULL;
+ c = NULL;
+ goto out;
}
static const char *__bch2_fs_online(struct bch_fs *c)
goto err;
bch_verbose(c, "fsck done");
- if (c->opts.usrquota || c->opts.grpquota) {
+ if (enabled_qtypes(c)) {
bch_verbose(c, "reading quotas:");
ret = bch2_fs_quota_read(c);
if (ret)
NULL, NULL, NULL, 0))
goto err;
- if (c->opts.usrquota || c->opts.grpquota) {
+ if (enabled_qtypes(c)) {
ret = bch2_fs_quota_read(c);
if (ret)
goto err;
static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
{
struct bch_member *member;
- struct bch_dev *ca;
+ struct bch_dev *ca = NULL;
+ int ret = 0;
+
+ pr_verbose_init(c->opts, "");
if (bch2_fs_init_fault("dev_alloc"))
- return -ENOMEM;
+ goto err;
ca = kzalloc(sizeof(*ca), GFP_KERNEL);
if (!ca)
- return -ENOMEM;
+ goto err;
kobject_init(&ca->kobj, &bch2_dev_ktype);
init_completion(&ca->ref_completion);
if (bch2_dev_sysfs_online(c, ca))
pr_warn("error creating sysfs objects");
-
- return 0;
+out:
+ pr_verbose_init(c->opts, "ret %i", ret);
+ return ret;
err:
- bch2_dev_free(ca);
- return -ENOMEM;
+ if (ca)
+ bch2_dev_free(ca);
+ ret = -ENOMEM;
+ goto out;
}
static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
/* do we have enough devices to write to? */
for_each_member_device(ca2, c, i)
- nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW;
+ if (ca2 != ca)
+ nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW;
required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
? c->opts.metadata_replicas
? c->opts.data_replicas
: c->opts.data_replicas_required);
- return nr_rw - 1 <= required;
+ return nr_rw >= required;
case BCH_MEMBER_STATE_FAILED:
case BCH_MEMBER_STATE_SPARE:
if (ca->mi.state != BCH_MEMBER_STATE_RW &&
s = __bch2_replicas_status(c, new_online_devs);
- return bch2_have_enough_devs(c, s, flags);
+ return bch2_have_enough_devs(s, flags);
default:
BUG();
}
s = bch2_replicas_status(c);
- return bch2_have_enough_devs(c, s, flags);
+ return bch2_have_enough_devs(s, flags);
}
static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
if (!bch2_dev_state_allowed(c, ca, new_state, flags))
return -EINVAL;
- if (new_state == BCH_MEMBER_STATE_RW) {
- if (__bch2_dev_read_write(c, ca))
- return -ENOMEM;
- } else {
+ if (new_state != BCH_MEMBER_STATE_RW)
__bch2_dev_read_only(c, ca);
- }
bch_notice(ca, "%s", bch2_dev_state[new_state]);
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
+ if (new_state == BCH_MEMBER_STATE_RW)
+ return __bch2_dev_read_write(c, ca) ? -ENOMEM : 0;
+
return 0;
}
const char *err;
int ret = -ENOMEM;
- if (!nr_devices)
- return ERR_PTR(-EINVAL);
+ pr_verbose_init(opts, "");
- if (!try_module_get(THIS_MODULE))
- return ERR_PTR(-ENODEV);
+ if (!nr_devices) {
+ c = ERR_PTR(-EINVAL);
+ goto out2;
+ }
+
+ if (!try_module_get(THIS_MODULE)) {
+ c = ERR_PTR(-ENODEV);
+ goto out2;
+ }
sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
if (!sb)
if (err)
goto err_print;
+out:
kfree(sb);
module_put(THIS_MODULE);
+out2:
+ pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c));
return c;
err_print:
pr_err("bch_fs_open err opening %s: %s",
err:
if (c)
bch2_fs_stop(c);
-
for (i = 0; i < nr_devices; i++)
bch2_free_super(&sb[i]);
- kfree(sb);
- module_put(THIS_MODULE);
- return ERR_PTR(ret);
+ c = ERR_PTR(ret);
+ goto out;
}
static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb,
devs->devs[devs->nr++] = dev;
}
+static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
+{
+ return (struct bch_devs_list) { .nr = 1, .devs[0] = dev };
+}
+
static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
const struct bch_devs_mask *mask)
{
struct bch_devs_list {
u8 nr;
- u8 devs[BCH_REPLICAS_MAX];
+ u8 devs[BCH_REPLICAS_MAX + 1];
};
struct bch_member_cpu {
rw_attribute(journal_write_delay_ms);
rw_attribute(journal_reclaim_delay_ms);
+rw_attribute(writeback_pages_max);
+
rw_attribute(discard);
rw_attribute(cache_replacement_policy);
sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms);
sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
+ sysfs_print(writeback_pages_max, c->writeback_pages_max);
+
sysfs_print(block_size, block_bytes(c));
sysfs_print(btree_node_size, btree_bytes(c));
sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c));
sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms);
sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
+ if (attr == &sysfs_writeback_pages_max)
+ c->writeback_pages_max = strtoul_restrict_or_return(buf, 1, UINT_MAX);
+
if (attr == &sysfs_btree_gc_periodic) {
ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic)
?: (ssize_t) size;
&sysfs_journal_write_delay_ms,
&sysfs_journal_reclaim_delay_ms,
+ &sysfs_writeback_pages_max,
+
&sysfs_tiering_percent,
&sysfs_compression_stats,
#include <linux/kthread.h>
#include <trace/events/bcachefs.h>
-static bool tiering_pred(void *arg, struct bkey_s_c_extent e)
+static bool __tiering_pred(struct bch_fs *c, struct bch_tier *tier,
+ struct bkey_s_c_extent e)
{
- struct bch_tier *tier = arg;
- struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]);
const struct bch_extent_ptr *ptr;
unsigned replicas = 0;
return replicas < c->opts.data_replicas;
}
+static enum data_cmd tiering_pred(struct bch_fs *c, void *arg,
+ enum bkey_type type,
+ struct bkey_s_c_extent e,
+ struct bch_io_opts *io_opts,
+ struct data_opts *data_opts)
+{
+ struct bch_tier *tier = arg;
+
+ if (!__tiering_pred(c, tier, e))
+ return DATA_SKIP;
+
+ data_opts->btree_insert_flags = 0;
+ return DATA_ADD_REPLICAS;
+}
+
static int bch2_tiering_thread(void *arg)
{
struct bch_tier *tier = arg;
SECTORS_IN_FLIGHT_PER_DEVICE * nr_devices,
&tier->devs,
writepoint_ptr(&tier->wp),
- 0,
- -1,
POS_MIN, POS_MAX,
tiering_pred, tier,
&move_stats);
#define array_remove_item(_array, _nr, _pos) \
array_remove_items(_array, _nr, _pos, 1)
+#define bubble_sort(_base, _nr, _cmp) \
+do { \
+ ssize_t _i, _end; \
+ bool _swapped = true; \
+ \
+ for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\
+ _swapped = false; \
+ for (_i = 0; _i < _end; _i++) \
+ if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \
+ swap((_base)[_i], (_base)[_i + 1]); \
+ _swapped = true; \
+ } \
+ } \
+} while (0)
+
#endif /* _BCACHEFS_UTIL_H */
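
The new bubble_sort() macro takes a comparison expression (which may itself be a macro capturing local context) rather than a function pointer, and relies on the kernel-style swap() macro being available. A self-contained usage sketch, not from the patch (the comparator and function names are made up):

#define u8_cmp(_l, _r)	(((_l) > (_r)) - ((_l) < (_r)))

static void sort_u8s(u8 *v, size_t nr)
{
	/* sorts v[0..nr) ascending; a positive cmp result triggers a swap */
	bubble_sort(v, nr, u8_cmp);
}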