From: Kent Overstreet
Date: Fri, 16 Feb 2018 20:36:33 +0000 (-0500)
Subject: Update bcachefs sources to e99d29e402 bcachefs: zstd support, compression refactoring
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=807abf36c1e119825d42cda6f6b249649ca44eb5;p=bcachefs-tools-debian

Update bcachefs sources to e99d29e402 bcachefs: zstd support, compression refactoring
---

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 274236e..76acdf9 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-d5e561b3cc023dd247d2b3d08b680709ec21b477
+e99d29e40210f6d9b7ec9e5b7aee1e48ae7655c5
diff --git a/Makefile b/Makefile
index ef1eacf..af7a206 100644
--- a/Makefile
+++ b/Makefile
@@ -9,6 +9,7 @@ CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall \
 	-D_GNU_SOURCE \
 	-D_LGPL_SOURCE \
 	-DRCU_MEMBARRIER \
+	-DZSTD_STATIC_LINKING_ONLY \
 	-DNO_BCACHEFS_CHARDEV \
 	-DNO_BCACHEFS_FS \
 	-DNO_BCACHEFS_SYSFS \
@@ -31,9 +32,15 @@ ifdef D
 endif
 
 PKGCONFIG_LIBS="blkid uuid liburcu libsodium zlib"
+PKGCONFIG_LIBS_STATIC="libzstd"
+
 CFLAGS+=`pkg-config --cflags ${PKGCONFIG_LIBS}`
-LDLIBS+=`pkg-config --libs ${PKGCONFIG_LIBS}` \
-	-lm -lpthread -lrt -lscrypt -lkeyutils -laio
+LDLIBS+=`pkg-config --libs ${PKGCONFIG_LIBS}`
+
+CFLAGS+=`pkg-config --static --cflags ${PKGCONFIG_LIBS_STATIC}`
+LDLIBS+=`pkg-config --static --libs ${PKGCONFIG_LIBS_STATIC}`
+
+LDLIBS+=-lm -lpthread -lrt -lscrypt -lkeyutils -laio
 
 ifeq ($(PREFIX),/usr)
 ROOT_SBINDIR=/sbin
diff --git a/cmd_migrate.c b/cmd_migrate.c
index d676bb5..4ba3538 100644
--- a/cmd_migrate.c
+++ b/cmd_migrate.c
@@ -344,8 +344,8 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
 		die("error reserving space in new filesystem: %s",
 		    strerror(-ret));
 
-	bch2_check_mark_super(c, BCH_DATA_USER,
-			      bch2_bkey_devs(extent_i_to_s_c(e).s_c));
+	bch2_mark_bkey_replicas(c, BCH_DATA_USER,
+				extent_i_to_s_c(e).s_c);
 
 	ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &e->k_i,
 				&res, NULL, NULL, 0);
diff --git a/debian/control b/debian/control
index 07f2f2f..08673f4 100644
--- a/debian/control
+++ b/debian/control
@@ -5,7 +5,7 @@ Priority: optional
 Standards-Version: 3.9.5
 Build-Depends: debhelper (>= 9), pkg-config, libblkid-dev, uuid-dev,
  libscrypt-dev, libsodium-dev, libkeyutils-dev, liburcu-dev, zlib1g-dev,
- libattr1-dev, libaio-dev
+ libattr1-dev, libaio-dev, libzstd-dev
 Homepage: http://bcache.evilpiepirate.org/
 
 Package: bcachefs-tools
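Note on the alloc.c change that follows: the open-coded insertion loop in bch2_wp_alloc_list() is replaced by a comparator (__dev_alloc_cmp) plus bubble_sort(), ordering devices first by tier and then by their next_alloc counter. A minimal standalone sketch of the same pattern, with illustrative names rather than the bcachefs types: the comparator uses the branchless (a > b) - (a < b) idiom to return negative/zero/positive, and a tiny bubble sort is perfectly adequate because the array holds at most a handful of device indices.

    #include <stdbool.h>
    #include <stddef.h>

    /* Three-way compare without branches: returns -1, 0 or 1. */
    static int cmp_u64(unsigned long long l, unsigned long long r)
    {
    	return (l > r) - (l < r);
    }

    /* O(n^2), but fine for an array of a handful of device indices. */
    static void bubble_sort_u64(unsigned long long *v, size_t nr)
    {
    	bool swapped;
    	size_t i;

    	do {
    		swapped = false;
    		for (i = 0; i + 1 < nr; i++)
    			if (cmp_u64(v[i], v[i + 1]) > 0) {
    				unsigned long long t = v[i];

    				v[i] = v[i + 1];
    				v[i + 1] = t;
    				swapped = true;
    			}
    	} while (swapped);
    }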
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index c195ffb..339ffd0 100644
--- a/libbcachefs/alloc.c
+++ b/libbcachefs/alloc.c
@@ -1201,43 +1201,56 @@ out:
 	return ob - c->open_buckets;
 }
 
-struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
-					 struct write_point *wp,
-					 struct bch_devs_mask *devs)
+static int __dev_alloc_cmp(struct bch_fs *c,
+			   struct write_point *wp,
+			   unsigned l, unsigned r)
 {
-	struct dev_alloc_list ret = { .nr = 0 };
-	struct bch_dev *ca, *ca2;
-	unsigned i, j;
+	struct bch_dev *ca_l = rcu_dereference(c->devs[l]);
+	struct bch_dev *ca_r = rcu_dereference(c->devs[r]);
 
-	for_each_member_device_rcu(ca, c, i, devs) {
-		for (j = 0; j < ret.nr; j++) {
-			unsigned idx = ret.devs[j];
+	if (ca_l && ca_r && ca_l->mi.tier != ca_r->mi.tier)
+		return ((ca_l->mi.tier > ca_r->mi.tier) -
+			(ca_l->mi.tier < ca_r->mi.tier));
 
-			ca2 = rcu_dereference(c->devs[idx]);
-			if (!ca2)
-				break;
+	return ((wp->next_alloc[l] > wp->next_alloc[r]) -
+		(wp->next_alloc[l] < wp->next_alloc[r]));
+}
 
-			if (ca->mi.tier < ca2->mi.tier)
-				break;
+#define dev_alloc_cmp(l, r) __dev_alloc_cmp(c, wp, l, r)
 
-			if (ca->mi.tier == ca2->mi.tier &&
-			    wp->next_alloc[i] < wp->next_alloc[idx])
-				break;
-		}
+struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
+					 struct write_point *wp,
+					 struct bch_devs_mask *devs)
+{
+	struct dev_alloc_list ret = { .nr = 0 };
+	struct bch_dev *ca;
+	unsigned i;
 
-		array_insert_item(ret.devs, ret.nr, j, i);
-	}
+	for_each_member_device_rcu(ca, c, i, devs)
+		ret.devs[ret.nr++] = i;
 
+	bubble_sort(ret.devs, ret.nr, dev_alloc_cmp);
 	return ret;
 }
 
 void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
 		     struct write_point *wp)
 {
-	unsigned i;
+	u64 *v = wp->next_alloc + ca->dev_idx;
+	u64 free_space = dev_buckets_free(c, ca);
+	u64 free_space_inv = free_space
+		? div64_u64(1ULL << 48, free_space)
+		: 1ULL << 48;
+	u64 scale = *v / 4;
+
+	if (*v + free_space_inv >= *v)
+		*v += free_space_inv;
+	else
+		*v = U64_MAX;
 
-	for (i = 0; i < ARRAY_SIZE(wp->next_alloc); i++)
-		wp->next_alloc[i] >>= 1;
+	for (v = wp->next_alloc;
+	     v < wp->next_alloc + ARRAY_SIZE(wp->next_alloc); v++)
+		*v = *v < scale ? 0 : *v - scale;
 }
 
 static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
@@ -1249,7 +1262,6 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
 {
 	enum bucket_alloc_ret ret = NO_DEVICES;
 	struct dev_alloc_list devs_sorted;
-	u64 buckets_free;
 	unsigned i;
 
 	BUG_ON(nr_replicas > ARRAY_SIZE(wp->ptrs));
@@ -1281,13 +1293,6 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c,
 		BUG_ON(wp->nr_ptrs >= ARRAY_SIZE(wp->ptrs));
 		wp->ptrs[wp->nr_ptrs++] = c->open_buckets + ob;
 
-		buckets_free = U64_MAX, dev_buckets_free(c, ca);
-		if (buckets_free)
-			wp->next_alloc[ca->dev_idx] +=
-				div64_u64(U64_MAX, buckets_free *
-					  ca->mi.bucket_size);
-		else
-			wp->next_alloc[ca->dev_idx] = U64_MAX;
 		bch2_wp_rescale(c, ca, wp);
 
 		__clear_bit(ca->dev_idx, devs->d);
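The new bch2_wp_rescale() above is a weighted round-robin: each allocation charges the chosen device a cost of 2^48 / free_space (48-bit fixed point), so emptier devices accumulate cost more slowly and keep winning the sort, while a periodic subtraction keeps the counters bounded. A sketch of the same arithmetic under simplified standalone types (names here are illustrative, not the bcachefs API):

    #include <stdint.h>

    /*
     * Each allocation from a device adds 2^48 / free_space to that
     * device's counter; subtracting a quarter of the charged device's
     * counter from everyone keeps values bounded without reordering.
     */
    static void charge_alloc(uint64_t *counters, unsigned nr,
    			 unsigned dev, uint64_t free_space)
    {
    	uint64_t inv = free_space ? (1ULL << 48) / free_space : 1ULL << 48;
    	uint64_t scale = counters[dev] / 4;
    	unsigned i;

    	/* saturating add, mirroring the U64_MAX clamp above */
    	counters[dev] = counters[dev] + inv >= counters[dev]
    		? counters[dev] + inv : UINT64_MAX;

    	for (i = 0; i < nr; i++)
    		counters[i] = counters[i] < scale ? 0 : counters[i] - scale;
    }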
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index cb9906c..5a3e99b 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -194,6 +194,7 @@
 #include 
 #include 
 #include 
+#include <linux/zstd.h>
 
 #include "bcachefs_format.h"
 #include "bset.h"
@@ -231,6 +232,12 @@ do {						\
 	bch_info(c, fmt, ##__VA_ARGS__);	\
 } while (0)
 
+#define pr_verbose_init(opts, fmt, ...)		\
+do {						\
+	if (opt_get(opts, verbose_init))	\
+		pr_info(fmt, ##__VA_ARGS__);	\
+} while (0)
+
 /* Parameters that are useful for debugging, but should always be compiled in: */
 
 #define BCH_DEBUG_PARAMS_ALWAYS()		\
	BCH_DEBUG_PARAM(key_merging_disabled,	\
@@ -646,10 +653,10 @@ struct bch_fs {
 	struct mutex		bio_bounce_pages_lock;
 	mempool_t		bio_bounce_pages;
 
-	mempool_t		lz4_workspace_pool;
-	void			*zlib_workspace;
-	struct mutex		zlib_workspace_lock;
 	mempool_t		compression_bounce[2];
+	mempool_t		compress_workspace[BCH_COMPRESSION_NR];
+	mempool_t		decompress_workspace;
+	ZSTD_parameters		zstd_params;
 
 	struct crypto_shash	*sha256;
 	struct crypto_skcipher	*chacha20;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 854e1c3..5e40627 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -6,7 +6,6 @@
  */
 
 #include 
-#include 
 #include 
 #include 
 
@@ -370,7 +369,8 @@ enum bch_compression_type {
 	BCH_COMPRESSION_LZ4_OLD		= 1,
 	BCH_COMPRESSION_GZIP		= 2,
 	BCH_COMPRESSION_LZ4		= 3,
-	BCH_COMPRESSION_NR		= 4,
+	BCH_COMPRESSION_ZSTD		= 4,
+	BCH_COMPRESSION_NR		= 5,
 };
 
 enum bch_extent_entry_type {
@@ -1082,6 +1082,7 @@ LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ,	struct bch_sb, flags[1], 24, 28);
 enum bch_sb_features {
 	BCH_FEATURE_LZ4			= 0,
 	BCH_FEATURE_GZIP		= 1,
+	BCH_FEATURE_ZSTD		= 2,
 };
 
 /* options: */
@@ -1109,11 +1110,17 @@ enum bch_str_hash_opts {
 	BCH_STR_HASH_NR			= 3,
 };
 
+#define BCH_COMPRESSION_TYPES()		\
+	x(NONE)				\
+	x(LZ4)				\
+	x(GZIP)				\
+	x(ZSTD)
+
 enum bch_compression_opts {
-	BCH_COMPRESSION_OPT_NONE	= 0,
-	BCH_COMPRESSION_OPT_LZ4		= 1,
-	BCH_COMPRESSION_OPT_GZIP	= 2,
-	BCH_COMPRESSION_OPT_NR		= 3,
+#define x(t) BCH_COMPRESSION_OPT_##t,
+	BCH_COMPRESSION_TYPES()
+#undef x
+	BCH_COMPRESSION_OPT_NR
 };
 
 /*
@@ -1322,8 +1329,10 @@ struct btree_node {
 	};
 } __attribute__((packed, aligned(8)));
 
-LE64_BITMASK(BTREE_NODE_ID,	struct btree_node, flags, 0, 4);
-LE64_BITMASK(BTREE_NODE_LEVEL,	struct btree_node, flags, 4, 8);
+LE64_BITMASK(BTREE_NODE_ID,	struct btree_node, flags,  0,  4);
+LE64_BITMASK(BTREE_NODE_LEVEL,	struct btree_node, flags,  4,  8);
+/* 8-32 unused */
+LE64_BITMASK(BTREE_NODE_SEQ,	struct btree_node, flags, 32, 64);
 
 struct btree_node_entry {
 	struct bch_csum		csum;
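BCH_COMPRESSION_TYPES() above is an x-macro: the list of compression options is written once and expanded under different definitions of x() to generate the enum here and the matching lookup tables in checksum.h, compress.c and opts.c, so the lists can never drift apart. A self-contained sketch of the technique (COLORS is a made-up example, not a bcachefs name):

    #include <stdio.h>

    /* The list is defined once... */
    #define COLORS() x(RED) x(GREEN) x(BLUE)

    /* ...then expanded into an enum... */
    enum color {
    #define x(t) COLOR_##t,
    	COLORS()
    #undef x
    	COLOR_NR
    };

    /* ...and again into a parallel name table that stays in sync. */
    static const char * const color_names[] = {
    #define x(t) [COLOR_##t] = #t,
    	COLORS()
    #undef x
    };

    int main(void)
    {
    	printf("%u colors, second is %s\n", COLOR_NR,
    	       color_names[COLOR_GREEN]);
    	return 0;
    }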
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 0bde449..7eae4d2 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -373,19 +373,23 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
 {
 	struct btree_cache *bc = &c->btree_cache;
 	unsigned i;
-	int ret;
+	int ret = 0;
+
+	pr_verbose_init(c->opts, "");
 
 	ret = rhashtable_init(&bc->table, &bch_btree_cache_params);
 	if (ret)
-		return ret;
+		goto out;
 
 	bc->table_init_done = true;
 
 	bch2_recalc_btree_reserve(c);
 
 	for (i = 0; i < bc->reserve; i++)
-		if (!btree_node_mem_alloc(c, GFP_KERNEL))
-			return -ENOMEM;
+		if (!btree_node_mem_alloc(c, GFP_KERNEL)) {
+			ret = -ENOMEM;
+			goto out;
+		}
 
 	list_splice_init(&bc->live, &bc->freeable);
 
@@ -393,12 +397,16 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
 	mutex_init(&c->verify_lock);
 
 	c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
-	if (!c->verify_ondisk)
-		return -ENOMEM;
+	if (!c->verify_ondisk) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	c->verify_data = btree_node_mem_alloc(c, GFP_KERNEL);
-	if (!c->verify_data)
-		return -ENOMEM;
+	if (!c->verify_data) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	list_del_init(&c->verify_data->list);
 #endif
@@ -408,8 +416,9 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
 	bc->shrink.seeks	= 4;
 	bc->shrink.batch	= btree_pages(c) * 2;
 	register_shrinker(&bc->shrink);
-
-	return 0;
+out:
+	pr_verbose_init(c->opts, "ret %i", ret);
+	return ret;
 }
 
 void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 6350866..f2e9c10 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -148,14 +148,13 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
 {
 	enum bch_data_type data_type = type == BKEY_TYPE_BTREE
 		? BCH_DATA_BTREE : BCH_DATA_USER;
-	struct bch_devs_list devs = bch2_bkey_devs(k);
 	int ret = 0;
 
 	if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
-	    fsck_err_on(!bch2_sb_has_replicas(c, data_type, devs), c,
+	    fsck_err_on(!bch2_bkey_replicas_marked(c, data_type, k), c,
 			"superblock not marked as containing replicas (type %u)",
 			data_type)) {
-		ret = bch2_check_mark_super(c, data_type, devs);
+		ret = bch2_mark_bkey_replicas(c, data_type, k);
 		if (ret)
 			return ret;
 	}
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 9b4eff1..d805fb4 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1135,6 +1135,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
 		unsigned sectors, whiteout_u64s = 0;
 		struct nonce nonce;
 		struct bch_csum csum;
+		bool first = !b->written;
 
 		if (!b->written) {
 			i = &b->data->keys;
@@ -1194,10 +1195,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
 		}
 
 		if (ret) {
-			btree_err_on(!b->written,
+			btree_err_on(first,
 				     BTREE_ERR_FIXABLE, c, b, i,
 				     "first btree node bset has blacklisted journal seq");
-			if (b->written)
+			if (!first)
 				continue;
 		}
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index c45527a..0e0156d 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -430,6 +430,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
 	n->data->min_key	= b->data->min_key;
 	n->data->max_key	= b->data->max_key;
 	n->data->format		= format;
+	SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);
 
 	btree_node_set_format(n, format);
 
@@ -559,8 +560,8 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
 			goto err_free;
 		}
 
-		ret = bch2_check_mark_super(c, BCH_DATA_BTREE,
-				bch2_bkey_devs(bkey_i_to_s_c(&b->key)));
+		ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
+					      bkey_i_to_s_c(&b->key));
 		if (ret)
 			goto err_free;
 
@@ -1225,6 +1226,7 @@ static struct btree *__btree_split_node(struct btree_update *as,
 
 	n2->data->max_key	= n1->data->max_key;
 	n2->data->format	= n1->format;
+	SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
 	n2->key.k.p = n1->key.k.p;
 
 	btree_node_set_format(n2, n2->data->format);
@@ -2019,8 +2021,8 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
 			goto err;
 		}
 
-	ret = bch2_check_mark_super(c, BCH_DATA_BTREE,
-				    bch2_extent_devs(extent_i_to_s_c(new_key)));
+	ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
+				      extent_i_to_s_c(new_key).s_c);
 	if (ret)
 		goto err_free_update;
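BTREE_NODE_SEQ, introduced above, carves a sequence number out of bits 32-63 of the on-disk node's flags word and bumps it whenever a node is rewritten, so a stale copy of a node can be distinguished from its replacement. A sketch of the pack/unpack arithmetic behind a macro like LE64_BITMASK (plain shifts only; the real macro additionally does little-endian conversion):

    #include <stdint.h>

    #define SEQ_OFFSET	32
    #define SEQ_BITS	32
    #define SEQ_MASK	(~0ULL >> (64 - SEQ_BITS))

    /* Extract bits [32,64) of the flags word. */
    static uint64_t btree_node_seq(uint64_t flags)
    {
    	return (flags >> SEQ_OFFSET) & SEQ_MASK;
    }

    /* Clear the field, then store the new value into it. */
    static uint64_t set_btree_node_seq(uint64_t flags, uint64_t seq)
    {
    	flags &= ~(SEQ_MASK << SEQ_OFFSET);
    	flags |= (seq & SEQ_MASK) << SEQ_OFFSET;
    	return flags;
    }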
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 4b252b6..007aa5e 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -272,15 +272,10 @@ static void multi_unlock_write(struct btree_insert *trans)
 		bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter);
 }
 
-static inline void btree_trans_sort(struct btree_insert *trans)
+static inline int btree_trans_cmp(struct btree_insert_entry l,
+				  struct btree_insert_entry r)
 {
-	int i, end = trans->nr;
-
-	while (--end > 0)
-		for (i = 0; i < end; i++)
-			if (btree_iter_cmp(trans->entries[i].iter,
-					   trans->entries[i + 1].iter) > 0)
-				swap(trans->entries[i], trans->entries[i + 1]);
+	return btree_iter_cmp(l.iter, r.iter);
 }
 
 /* Normal update interface: */
@@ -313,7 +308,7 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
 			  bkey_i_to_s_c(i->k)));
 	}
 
-	btree_trans_sort(trans);
+	bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
 
 	if (unlikely(!percpu_ref_tryget(&c->writes)))
 		return -EROFS;
diff --git a/libbcachefs/checksum.c b/libbcachefs/checksum.c
index 0875585..56bd99f 100644
--- a/libbcachefs/checksum.c
+++ b/libbcachefs/checksum.c
@@ -219,12 +219,16 @@ int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce,
 		crypto_alloc_skcipher("chacha20", 0, 0);
 	int ret;
 
-	if (!chacha20)
+	if (!chacha20) {
+		pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20));
 		return PTR_ERR(chacha20);
+	}
 
 	ret = crypto_skcipher_setkey(chacha20, (void *) key, sizeof(*key));
-	if (ret)
+	if (ret) {
+		pr_err("crypto_skcipher_setkey() error: %i", ret);
 		goto err;
+	}
 
 	do_encrypt(chacha20, nonce, buf, len);
 err:
@@ -567,7 +571,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c,
 
 	ret = bch2_request_key(c->disk_sb, &user_key);
 	if (ret) {
-		bch_err(c, "error requesting encryption key");
+		bch_err(c, "error requesting encryption key: %i", ret);
 		goto err;
 	}
 
@@ -594,13 +598,19 @@ static int bch2_alloc_ciphers(struct bch_fs *c)
 {
 	if (!c->chacha20)
 		c->chacha20 = crypto_alloc_skcipher("chacha20", 0, 0);
-	if (IS_ERR(c->chacha20))
+	if (IS_ERR(c->chacha20)) {
+		bch_err(c, "error requesting chacha20 module: %li",
+			PTR_ERR(c->chacha20));
 		return PTR_ERR(c->chacha20);
+	}
 
 	if (!c->poly1305)
 		c->poly1305 = crypto_alloc_shash("poly1305", 0, 0);
-	if (IS_ERR(c->poly1305))
+	if (IS_ERR(c->poly1305)) {
+		bch_err(c, "error requesting poly1305 module: %li",
+			PTR_ERR(c->poly1305));
 		return PTR_ERR(c->poly1305);
+	}
 
 	return 0;
 }
@@ -660,7 +670,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed)
 	if (keyed) {
 		ret = bch2_request_key(c->disk_sb, &user_key);
 		if (ret) {
-			bch_err(c, "error requesting encryption key");
+			bch_err(c, "error requesting encryption key: %i", ret);
 			goto err;
 		}
 
@@ -707,27 +717,35 @@ int bch2_fs_encryption_init(struct bch_fs *c)
 {
 	struct bch_sb_field_crypt *crypt;
 	struct bch_key key;
-	int ret;
+	int ret = 0;
+
+	pr_verbose_init(c->opts, "");
 
 	c->sha256 = crypto_alloc_shash("sha256", 0, 0);
-	if (IS_ERR(c->sha256))
-		return PTR_ERR(c->sha256);
+	if (IS_ERR(c->sha256)) {
+		bch_err(c, "error requesting sha256 module");
+		ret = PTR_ERR(c->sha256);
+		goto out;
+	}
 
 	crypt = bch2_sb_get_crypt(c->disk_sb);
 	if (!crypt)
-		return 0;
+		goto out;
 
 	ret = bch2_alloc_ciphers(c);
 	if (ret)
-		return ret;
+		goto out;
 
 	ret = bch2_decrypt_sb_key(c, crypt, &key);
 	if (ret)
-		goto err;
+		goto out;
 
 	ret = crypto_skcipher_setkey(c->chacha20,
 			(void *) &key.key, sizeof(key.key));
-err:
+	if (ret)
+		goto out;
+out:
 	memzero_explicit(&key, sizeof(key));
+	pr_verbose_init(c->opts, "ret %i", ret);
 	return ret;
 }
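The checksum.h change that follows replaces a switch statement with a const array indexed by the option enum, generated from the same BCH_COMPRESSION_TYPES() x-macro. The key C feature is designated initializers: entries are keyed by enum value, can appear in any order, and anything unlisted defaults to zero. A tiny illustration with made-up enum values (not the bcachefs ones):

    #include <assert.h>

    enum opt  { OPT_NONE, OPT_LZ4, OPT_GZIP, OPT_ZSTD, OPT_NR };
    enum type { TYPE_NONE, TYPE_LZ4_OLD, TYPE_GZIP, TYPE_LZ4, TYPE_ZSTD };

    /* Indexed by enum value; a missing entry would default to 0. */
    static const enum type opt_to_type[] = {
    	[OPT_NONE]	= TYPE_NONE,
    	[OPT_LZ4]	= TYPE_LZ4,
    	[OPT_GZIP]	= TYPE_GZIP,
    	[OPT_ZSTD]	= TYPE_ZSTD,
    };

    int main(void)
    {
    	assert(opt_to_type[OPT_ZSTD] == TYPE_ZSTD);
    	return 0;
    }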
diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h
index b0c8a50..7862294 100644
--- a/libbcachefs/checksum.h
+++ b/libbcachefs/checksum.h
@@ -91,20 +91,11 @@ static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c)
 	return bch2_csum_opt_to_type(c->opts.metadata_checksum, false);
 }
 
-static inline enum bch_compression_type
-bch2_compression_opt_to_type(enum bch_compression_opts type)
-{
-	switch (type) {
-	case BCH_COMPRESSION_OPT_NONE:
-		return BCH_COMPRESSION_NONE;
-	case BCH_COMPRESSION_OPT_LZ4:
-		return BCH_COMPRESSION_LZ4;
-	case BCH_COMPRESSION_OPT_GZIP:
-		return BCH_COMPRESSION_GZIP;
-	default:
-		BUG();
-	}
-}
+static const unsigned bch2_compression_opt_to_type[] = {
+#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_##t,
+	BCH_COMPRESSION_TYPES()
+#undef x
+};
 
 static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
 					   unsigned type)
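The gzip paths in compress.c below drive zlib in raw-deflate mode (negative windowBits, so no zlib/gzip header or checksum) with a preallocated workspace from a mempool. A rough userspace equivalent with stock zlib, which manages its own memory and so has no workspace parameter, showing the same init/finish sequence:

    #include <zlib.h>
    #include <string.h>

    /* Raw deflate of src into dst; returns compressed size, 0 on failure. */
    static size_t raw_deflate(void *dst, size_t dst_len,
    			  const void *src, size_t src_len)
    {
    	z_stream strm;

    	memset(&strm, 0, sizeof(strm));
    	/* negative windowBits selects raw deflate */
    	if (deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
    			 -MAX_WBITS, 8, Z_DEFAULT_STRATEGY) != Z_OK)
    		return 0;

    	strm.next_in	= (Bytef *) src;
    	strm.avail_in	= src_len;
    	strm.next_out	= dst;
    	strm.avail_out	= dst_len;

    	if (deflate(&strm, Z_FINISH) != Z_STREAM_END) {
    		deflateEnd(&strm);
    		return 0;	/* output didn't fit in dst */
    	}

    	deflateEnd(&strm);
    	return strm.total_out;
    }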
diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c
index 6407998..7726cfd 100644
--- a/libbcachefs/compress.c
+++ b/libbcachefs/compress.c
@@ -8,6 +8,7 @@
 #include "lz4.h"
 #include 
 #include 
+#include <linux/zstd.h>
 
 /* Bounce buffer: */
 struct bbuf {
@@ -151,6 +152,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
 	struct bbuf src_data = { NULL };
 	size_t src_len = src->bi_iter.bi_size;
 	size_t dst_len = crc.uncompressed_size << 9;
+	void *workspace;
 	int ret;
 
 	src_data = bio_map_or_bounce(c, src, READ);
@@ -159,57 +161,64 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src,
 	case BCH_COMPRESSION_LZ4_OLD:
 		ret = bch2_lz4_decompress(src_data.b, &src_len,
 					  dst_data, dst_len);
-		if (ret) {
-			ret = -EIO;
+		if (ret)
 			goto err;
-		}
 		break;
 	case BCH_COMPRESSION_LZ4:
 		ret = LZ4_decompress_safe_partial(src_data.b, dst_data,
 						  src_len, dst_len,
 						  dst_len);
-		if (ret != dst_len) {
-			ret = -EIO;
+		if (ret != dst_len)
 			goto err;
-		}
 		break;
 	case BCH_COMPRESSION_GZIP: {
-		void *workspace;
-		z_stream strm;
-
-		workspace = kmalloc(zlib_inflate_workspacesize(),
-				    GFP_NOIO|__GFP_NOWARN);
-		if (!workspace) {
-			mutex_lock(&c->zlib_workspace_lock);
-			workspace = c->zlib_workspace;
-		}
+		z_stream strm = {
+			.next_in	= src_data.b,
+			.avail_in	= src_len,
+			.next_out	= dst_data,
+			.avail_out	= dst_len,
+		};
+
+		workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
 
-		strm.next_in	= src_data.b;
-		strm.avail_in	= src_len;
-		strm.next_out	= dst_data;
-		strm.avail_out	= dst_len;
 		zlib_set_workspace(&strm, workspace);
 		zlib_inflateInit2(&strm, -MAX_WBITS);
 
 		ret = zlib_inflate(&strm, Z_FINISH);
 
-		if (workspace == c->zlib_workspace)
-			mutex_unlock(&c->zlib_workspace_lock);
-		else
-			kfree(workspace);
+		mempool_free(workspace, &c->decompress_workspace);
 
-		if (ret != Z_STREAM_END) {
-			ret = -EIO;
+		if (ret != Z_STREAM_END)
+			goto err;
+		break;
+	}
+	case BCH_COMPRESSION_ZSTD: {
+		ZSTD_DCtx *ctx;
+		size_t len;
+
+		workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO);
+		ctx = ZSTD_initDCtx(workspace, ZSTD_DCtxWorkspaceBound());
+
+		src_len = le32_to_cpup(src_data.b);
+
+		len = ZSTD_decompressDCtx(ctx,
+					  dst_data, dst_len,
+					  src_data.b + 4, src_len);
+
+		mempool_free(workspace, &c->decompress_workspace);
+
+		if (len != dst_len)
 			goto err;
-		}
 		break;
 	}
 	default:
 		BUG();
 	}
 	ret = 0;
-err:
+out:
 	bio_unmap_or_unbounce(c, src_data);
 	return ret;
+err:
+	ret = -EIO;
+	goto out;
 }
 
 int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio,
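The rewritten compression path below leans on LZ4_compress_destSize(), which fills the destination buffer as full as it can and reports back how much input it actually consumed; gzip and zstd have no such interface, which is why the retry loop in __bio_compress() has to shrink src_len by guesswork. A userspace sketch of the LZ4 call (the kernel variant used below additionally takes a workspace argument):

    #include <lz4.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
    	char src[4096], dst[1024];
    	int src_len = sizeof(src);

    	memset(src, 'x', sizeof(src));	/* trivially compressible */

    	/*
    	 * Compress as much of src as fits into dst; on return, src_len
    	 * holds the number of input bytes actually consumed.
    	 */
    	int dst_len = LZ4_compress_destSize(src, dst, &src_len,
    					    sizeof(dst));

    	printf("consumed %d input bytes -> %d output bytes\n",
    	       src_len, dst_len);
    	return 0;
    }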
@@ -282,113 +291,129 @@ err:
 	return ret;
 }
 
+static int attempt_compress(struct bch_fs *c,
+			    void *workspace,
+			    void *dst, size_t dst_len,
+			    void *src, size_t src_len,
+			    unsigned compression_type)
+{
+	switch (compression_type) {
+	case BCH_COMPRESSION_LZ4: {
+		int len = src_len;
+		int ret = LZ4_compress_destSize(
+				src,		dst,
+				&len,		dst_len,
+				workspace);
+
+		if (len < src_len)
+			return -len;
+
+		return ret;
+	}
+	case BCH_COMPRESSION_GZIP: {
+		z_stream strm = {
+			.next_in	= src,
+			.avail_in	= src_len,
+			.next_out	= dst,
+			.avail_out	= dst_len,
+		};
+
+		zlib_set_workspace(&strm, workspace);
+		zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
+				  Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
+				  Z_DEFAULT_STRATEGY);
+
+		if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END)
+			return 0;
+
+		if (zlib_deflateEnd(&strm) != Z_OK)
+			return 0;
+
+		return strm.total_out;
+	}
+	case BCH_COMPRESSION_ZSTD: {
+		ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace,
+			ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams));
+
+		size_t len = ZSTD_compressCCtx(ctx,
+				dst + 4,	dst_len - 4,
+				src,		src_len,
+				c->zstd_params);
+		if (ZSTD_isError(len))
+			return 0;
+
+		*((__le32 *) dst) = cpu_to_le32(len);
+		return len + 4;
+	}
+	default:
+		BUG();
+	}
+}
+
 static unsigned __bio_compress(struct bch_fs *c,
 			       struct bio *dst, size_t *dst_len,
 			       struct bio *src, size_t *src_len,
 			       unsigned compression_type)
 {
 	struct bbuf src_data = { NULL }, dst_data = { NULL };
+	void *workspace;
 	unsigned pad;
 	int ret = 0;
 
 	/* If it's only one block, don't bother trying to compress: */
 	if (bio_sectors(src) <= c->opts.block_size)
-		goto err;
+		return 0;
 
 	dst_data = bio_map_or_bounce(c, dst, WRITE);
 	src_data = bio_map_or_bounce(c, src, READ);
 
-	switch (compression_type) {
-	case BCH_COMPRESSION_LZ4_OLD:
-		compression_type = BCH_COMPRESSION_LZ4;
+	workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOIO);
 
-	case BCH_COMPRESSION_LZ4: {
-		void *workspace;
-		int len = src->bi_iter.bi_size;
-
-		workspace = mempool_alloc(&c->lz4_workspace_pool, GFP_NOIO);
-
-		while (1) {
-			if (len <= block_bytes(c)) {
-				ret = 0;
-				break;
-			}
-
-			ret = LZ4_compress_destSize(
-				src_data.b,	dst_data.b,
-				&len,		dst->bi_iter.bi_size,
-				workspace);
-			if (ret >= len) {
-				/* uncompressible: */
-				ret = 0;
-				break;
-			}
-
-			if (!(len & (block_bytes(c) - 1)))
-				break;
-			len = round_down(len, block_bytes(c));
-		}
-		mempool_free(workspace, &c->lz4_workspace_pool);
+	*src_len = src->bi_iter.bi_size;
+	*dst_len = dst->bi_iter.bi_size;
 
-		if (!ret)
-			goto err;
-
-		*src_len = len;
-		*dst_len = ret;
-		ret = 0;
-		break;
-	}
-	case BCH_COMPRESSION_GZIP: {
-		void *workspace;
-		z_stream strm;
-
-		workspace = kmalloc(zlib_deflate_workspacesize(MAX_WBITS,
-							       DEF_MEM_LEVEL),
-				    GFP_NOIO|__GFP_NOWARN);
-		if (!workspace) {
-			mutex_lock(&c->zlib_workspace_lock);
-			workspace = c->zlib_workspace;
+	/*
+	 * XXX: this algorithm sucks when the compression code doesn't tell us
+	 * how much would fit, like LZ4 does:
	 */
+	while (1) {
+		if (*src_len <= block_bytes(c)) {
+			ret = -1;
+			break;
 		}
 
-		strm.next_in	= src_data.b;
-		strm.avail_in	= min(src->bi_iter.bi_size,
-				      dst->bi_iter.bi_size);
-		strm.next_out	= dst_data.b;
-		strm.avail_out	= dst->bi_iter.bi_size;
-		zlib_set_workspace(&strm, workspace);
-		zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
-				  Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL,
-				  Z_DEFAULT_STRATEGY);
-
-		ret = zlib_deflate(&strm, Z_FINISH);
-		if (ret != Z_STREAM_END) {
-			ret = -EIO;
-			goto zlib_err;
+		ret = attempt_compress(c, workspace,
+				       dst_data.b,	*dst_len,
+				       src_data.b,	*src_len,
+				       compression_type);
+		if (ret > 0) {
+			*dst_len = ret;
+			ret = 0;
+			break;
 		}
 
-		ret = zlib_deflateEnd(&strm);
-		if (ret != Z_OK) {
-			ret = -EIO;
-			goto zlib_err;
+		/* Didn't fit: should we retry with a smaller amount? */
+		if (*src_len <= *dst_len) {
+			ret = -1;
+			break;
 		}
 
-		ret = 0;
-zlib_err:
-		if (workspace == c->zlib_workspace)
-			mutex_unlock(&c->zlib_workspace_lock);
+		/*
+		 * If ret is negative, it's a hint as to how much data would fit
+		 */
+		BUG_ON(-ret >= *src_len);
+
+		if (ret < 0)
+			*src_len = -ret;
 		else
-			kfree(workspace);
+			*src_len -= (*src_len - *dst_len) / 2;
+		*src_len = round_down(*src_len, block_bytes(c));
+	}
 
-		if (ret)
-			goto err;
+	mempool_free(workspace, &c->compress_workspace[compression_type]);
 
-		*dst_len = strm.total_out;
-		*src_len = strm.total_in;
-		break;
-	}
-	default:
-		BUG();
-	}
+	if (ret)
+		goto err;
 
 	/* Didn't get smaller: */
 	if (round_up(*dst_len, block_bytes(c)) >= *src_len)
@@ -429,6 +454,9 @@ unsigned bch2_bio_compress(struct bch_fs *c,
 	/* Don't generate a bigger output than input: */
 	dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
 
+	if (compression_type == BCH_COMPRESSION_LZ4_OLD)
+		compression_type = BCH_COMPRESSION_LZ4;
+
 	compression_type =
 		__bio_compress(c, dst, dst_len, src, src_len, compression_type);
 
@@ -437,81 +465,147 @@ unsigned bch2_bio_compress(struct bch_fs *c,
 	return compression_type;
 }
 
+#define BCH_FEATURE_NONE	0
+
+static const unsigned bch2_compression_opt_to_feature[] = {
+#define x(t) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t,
+	BCH_COMPRESSION_TYPES()
+#undef x
+};
+
+#undef BCH_FEATURE_NONE
+
 /* doesn't write superblock: */
 int bch2_check_set_has_compressed_data(struct bch_fs *c,
 				       unsigned compression_type)
 {
-	switch (compression_type) {
-	case BCH_COMPRESSION_OPT_NONE:
-		return 0;
-	case BCH_COMPRESSION_OPT_LZ4:
-		if (bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4))
-			return 0;
+	unsigned f;
+	int ret = 0;
 
-		bch2_sb_set_feature(c->disk_sb, BCH_FEATURE_LZ4);
-		break;
-	case BCH_COMPRESSION_OPT_GZIP:
-		if (bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
-			return 0;
+	pr_verbose_init(c->opts, "");
 
-		bch2_sb_set_feature(c->disk_sb, BCH_FEATURE_GZIP);
-		break;
-	default:
-		BUG();
-	}
+	BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature));
+
+	if (!compression_type)
+		goto out;
 
-	return bch2_fs_compress_init(c);
+	f = bch2_compression_opt_to_feature[compression_type];
+	if (bch2_sb_test_feature(c->disk_sb, f))
+		goto out;
+
+	bch2_sb_set_feature(c->disk_sb, f);
+	ret = bch2_fs_compress_init(c);
+out:
+	pr_verbose_init(c->opts, "ret %i", ret);
+	return ret;
 }
 
 void bch2_fs_compress_exit(struct bch_fs *c)
 {
-	vfree(c->zlib_workspace);
-	mempool_exit(&c->lz4_workspace_pool);
+	unsigned i;
+
+	mempool_exit(&c->decompress_workspace);
+	for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++)
+		mempool_exit(&c->compress_workspace[i]);
 	mempool_exit(&c->compression_bounce[WRITE]);
 	mempool_exit(&c->compression_bounce[READ]);
 }
 
-#define COMPRESSION_WORKSPACE_SIZE			\
-	max_t(size_t, zlib_inflate_workspacesize(),	\
-	      zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL))
+static void *mempool_kvpmalloc(gfp_t gfp_mask, void *pool_data)
+{
+	size_t size = (size_t) pool_data;
+	return kvpmalloc(size, gfp_mask);
+}
+
+void mempool_kvpfree(void *element, void *pool_data)
+{
+	size_t size = (size_t) pool_data;
+	kvpfree(element, size);
+}
+
+static int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
+{
+	return !mempool_initialized(pool)
+		? mempool_init(pool, min_nr, mempool_kvpmalloc,
+			       mempool_kvpfree, (void *) size)
+		: 0;
+}
 
 int bch2_fs_compress_init(struct bch_fs *c)
 {
-	unsigned order = get_order(c->sb.encoded_extent_max << 9);
-	int ret;
+	size_t max_extent = c->sb.encoded_extent_max << 9;
+	size_t order = get_order(max_extent);
+	size_t decompress_workspace_size = 0;
+	bool decompress_workspace_needed;
+	ZSTD_parameters params = ZSTD_getParams(0, max_extent, 0);
+	struct {
+		unsigned	feature;
+		unsigned	type;
+		size_t		compress_workspace;
+		size_t		decompress_workspace;
+	} compression_types[] = {
+		{ BCH_FEATURE_LZ4, BCH_COMPRESSION_LZ4, LZ4_MEM_COMPRESS, 0 },
+		{ BCH_FEATURE_GZIP, BCH_COMPRESSION_GZIP,
+			zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL),
+			zlib_inflate_workspacesize(), },
+		{ BCH_FEATURE_ZSTD, BCH_COMPRESSION_ZSTD,
+			ZSTD_CCtxWorkspaceBound(params.cParams),
+			ZSTD_DCtxWorkspaceBound() },
+	}, *i;
+	int ret = 0;
 
-	if (!bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4) &&
-	    !bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP))
-		return 0;
+	pr_verbose_init(c->opts, "");
+
+	c->zstd_params = params;
+
+	for (i = compression_types;
+	     i < compression_types + ARRAY_SIZE(compression_types);
+	     i++)
+		if (bch2_sb_test_feature(c->disk_sb, i->feature))
+			goto have_compressed;
 
+	goto out;
+have_compressed:
 	if (!mempool_initialized(&c->compression_bounce[READ])) {
 		ret = mempool_init_page_pool(&c->compression_bounce[READ],
 					     1, order);
 		if (ret)
-			return ret;
+			goto out;
 	}
 
 	if (!mempool_initialized(&c->compression_bounce[WRITE])) {
 		ret = mempool_init_page_pool(&c->compression_bounce[WRITE],
 					     1, order);
 		if (ret)
-			return ret;
+			goto out;
 	}
 
-	if (!mempool_initialized(&c->lz4_workspace_pool) &&
-	    bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_LZ4)) {
-		ret = mempool_init_kmalloc_pool(&c->lz4_workspace_pool,
-						1, LZ4_MEM_COMPRESS);
-		if (ret)
-			return ret;
-	}
+	for (i = compression_types;
+	     i < compression_types + ARRAY_SIZE(compression_types);
+	     i++) {
+		decompress_workspace_size =
+			max(decompress_workspace_size, i->decompress_workspace);
 
-	if (!c->zlib_workspace &&
-	    bch2_sb_test_feature(c->disk_sb, BCH_FEATURE_GZIP)) {
-		c->zlib_workspace = vmalloc(COMPRESSION_WORKSPACE_SIZE);
-		if (!c->zlib_workspace)
-			return -ENOMEM;
+		if (!bch2_sb_test_feature(c->disk_sb, i->feature))
+			continue;
+
+		if (i->decompress_workspace)
+			decompress_workspace_needed = true;
+
+		ret = mempool_init_kvpmalloc_pool(
+				&c->compress_workspace[i->type],
+				1, i->compress_workspace);
+		if (ret)
+			goto out;
 	}
 
-	return 0;
+	ret = mempool_init_kmalloc_pool(
+			&c->decompress_workspace,
+			1, decompress_workspace_size);
+	if (ret)
+		goto out;
+out:
	pr_verbose_init(c->opts, "ret %i", ret);
+	return ret;
 }
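The zstd paths above prefix each compressed frame with its exact byte length as a 4-byte little-endian header, because ZSTD_decompressDCtx() must be handed the precise compressed size while the extent only records sizes in 512-byte sectors. A userspace sketch of the same framing with stock libzstd (the kernel's workspace-based API used above differs; the memcpy of a host-order integer is only little-endian on LE hosts, where the real code uses cpu_to_le32/le32_to_cpup):

    #include <zstd.h>
    #include <stdint.h>
    #include <string.h>

    /* Compress src, prefixing the frame with its exact byte length. */
    static size_t frame_compress(void *dst, size_t dst_len,
    			     const void *src, size_t src_len)
    {
    	size_t len = ZSTD_compress((char *) dst + 4, dst_len - 4,
    				   src, src_len, 1);
    	uint32_t hdr;

    	if (ZSTD_isError(len))
    		return 0;

    	hdr = (uint32_t) len;
    	memcpy(dst, &hdr, 4);
    	return len + 4;
    }

    static size_t frame_decompress(void *dst, size_t dst_len, const void *src)
    {
    	uint32_t len;

    	memcpy(&len, src, 4);	/* exact compressed size */
    	return ZSTD_decompress(dst, dst_len, (const char *) src + 4, len);
    }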
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index f5dccfa..ce1f8ba 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -694,7 +694,7 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
 		goto err;
 	}
 
-	if (!bch2_sb_has_replicas(c, BCH_DATA_BTREE, bch2_extent_devs(e))) {
+	if (!bch2_bkey_replicas_marked(c, BCH_DATA_BTREE, e.s_c)) {
 		bch2_bkey_val_to_text(c, btree_node_type(b),
 				      buf, sizeof(buf), k);
 		bch2_fs_bug(c,
@@ -1834,7 +1834,7 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
 		}
 
 		if (!bkey_extent_is_cached(e.k) &&
-		    !bch2_sb_has_replicas(c, BCH_DATA_USER, bch2_extent_devs(e))) {
+		    !bch2_bkey_replicas_marked(c, BCH_DATA_USER, e.s_c)) {
 			bch2_bkey_val_to_text(c, btree_node_type(b),
 					      buf, sizeof(buf), e.s_c);
 			bch2_fs_bug(c,
@@ -2013,17 +2013,18 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
 }
 
 void bch2_extent_mark_replicas_cached(struct bch_fs *c,
-				      struct bkey_s_extent e)
+				      struct bkey_s_extent e,
+				      unsigned nr_desired_replicas)
 {
 	struct bch_extent_ptr *ptr;
 	unsigned tier = 0, nr_cached = 0;
 	unsigned nr_good = bch2_extent_nr_good_ptrs(c, e.c);
 	bool have_higher_tier;
 
-	if (nr_good <= c->opts.data_replicas)
+	if (nr_good <= nr_desired_replicas)
 		return;
 
-	nr_cached = nr_good - c->opts.data_replicas;
+	nr_cached = nr_good - nr_desired_replicas;
 
 	do {
 		have_higher_tier = false;
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index e8f54f2..7557927 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -38,7 +38,8 @@ bch2_insert_fixup_extent(struct btree_insert *,
 			 struct btree_insert_entry *);
 
 bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
-void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent);
+void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent,
+				      unsigned);
 
 const struct bch_extent_ptr *
 bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
@@ -430,6 +431,18 @@ static inline struct bch_devs_list bch2_extent_dirty_devs(struct bkey_s_c_extent
 	return ret;
 }
 
+static inline struct bch_devs_list bch2_extent_cached_devs(struct bkey_s_c_extent e)
+{
+	struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+	const struct bch_extent_ptr *ptr;
+
+	extent_for_each_ptr(e, ptr)
+		if (ptr->cached)
+			ret.devs[ret.nr++] = ptr->dev;
+
+	return ret;
+}
+
 static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
 {
 	switch (k.k->type) {
@@ -441,6 +454,28 @@ static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
 	}
 }
 
+static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
+{
+	switch (k.k->type) {
+	case BCH_EXTENT:
+	case BCH_EXTENT_CACHED:
+		return bch2_extent_dirty_devs(bkey_s_c_to_extent(k));
+	default:
+		return (struct bch_devs_list) { .nr = 0 };
+	}
+}
+
+static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
+{
+	switch (k.k->type) {
+	case BCH_EXTENT:
+	case BCH_EXTENT_CACHED:
+		return bch2_extent_cached_devs(bkey_s_c_to_extent(k));
+	default:
+		return (struct bch_devs_list) { .nr = 0 };
+	}
+}
+
 bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent,
 				 struct bch_extent_crc_unpacked);
 bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked);
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 1bffddf..00475b9 100644
--- a/libbcachefs/fs-io.c
+++ b/libbcachefs/fs-io.c
@@ -452,14 +452,18 @@ static int bchfs_write_index_update(struct bch_write_op *wop)
 			ret = bch2_btree_insert_at(wop->c, &wop->res,
 					&hook.hook, op_journal_seq(wop),
-					BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC,
+					BTREE_INSERT_NOFAIL|
+					BTREE_INSERT_ATOMIC|
+					BTREE_INSERT_USE_RESERVE,
 					BTREE_INSERT_ENTRY(&extent_iter, k),
 					BTREE_INSERT_ENTRY_EXTRA_RES(&inode_iter,
 							&hook.inode_p.inode.k_i, 2));
 		} else {
 			ret = bch2_btree_insert_at(wop->c, &wop->res,
 					&hook.hook, op_journal_seq(wop),
-					BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC,
+					BTREE_INSERT_NOFAIL|
+					BTREE_INSERT_ATOMIC|
+					BTREE_INSERT_USE_RESERVE,
 					BTREE_INSERT_ENTRY(&extent_iter, k));
 		}
 
@@ -502,7 +506,7 @@ static inline void bch2_fswrite_op_init(struct bchfs_write_op *op,
 
 	bch2_write_op_init(&op->op, c);
 	op->op.csum_type = bch2_data_checksum_type(c, opts.data_checksum);
-	op->op.compression_type = bch2_compression_opt_to_type(opts.compression);
+	op->op.compression_type = bch2_compression_opt_to_type[opts.compression];
 	op->op.devs = c->fastest_devs;
 	op->op.index_update_fn = bchfs_write_index_update;
 	op_journal_seq_set(&op->op, &inode->ei_journal_seq);
@@ -2692,6 +2696,10 @@ void bch2_fs_fsio_exit(struct bch_fs *c)
 
 int bch2_fs_fsio_init(struct bch_fs *c)
 {
+	int ret = 0;
+
+	pr_verbose_init(c->opts, "");
+
 	if (bioset_init(&c->writepage_bioset, 4,
 			offsetof(struct bch_writepage_io, op.op.wbio.bio),
 			BIOSET_NEED_BVECS) ||
@@ -2701,9 +2709,10 @@ int bch2_fs_fsio_init(struct bch_fs *c)
 	    bioset_init(&c->dio_write_bioset, 4,
 			offsetof(struct dio_write, iop.op.wbio.bio),
 			BIOSET_NEED_BVECS))
-		return -ENOMEM;
+		ret = -ENOMEM;
 
-	return 0;
+	pr_verbose_init(c->opts, "ret %i", ret);
+	return ret;
 }
 
 #endif /* NO_BCACHEFS_FS */
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 7cddbcc..13495d4 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -209,17 +209,6 @@ static void bch2_write_done(struct closure *cl)
 	closure_return(cl);
 }
 
-static u64 keylist_sectors(struct keylist *keys)
-{
-	struct bkey_i *k;
-	u64 ret = 0;
-
-	for_each_keylist_key(keys, k)
-		ret += k->k.size;
-
-	return ret;
-}
-
 int bch2_write_index_default(struct bch_write_op *op)
 {
 	struct keylist *keys = &op->insert_keys;
@@ -232,7 +221,8 @@ int bch2_write_index_default(struct bch_write_op *op)
 	ret = bch2_btree_insert_list_at(&iter, keys, &op->res,
 					NULL, op_journal_seq(op),
-					BTREE_INSERT_NOFAIL);
+					BTREE_INSERT_NOFAIL|
+					BTREE_INSERT_USE_RESERVE);
 	bch2_btree_iter_unlock(&iter);
 
 	return ret;
@@ -268,8 +258,7 @@ static void bch2_write_index(struct closure *cl)
 		}
 
 		if (!(op->flags & BCH_WRITE_NOMARK_REPLICAS)) {
-			ret = bch2_check_mark_super(c, BCH_DATA_USER,
-						    bch2_extent_devs(e.c));
+			ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, e.s_c);
 			if (ret)
 				goto err;
 		}
@@ -910,18 +899,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
 	swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
 	rbio->promote = NULL;
 
-	bch2_write_op_init(&op->write.op, c);
-	op->write.op.csum_type = bch2_data_checksum_type(c, rbio->opts.data_checksum);
-	op->write.op.compression_type =
-		bch2_compression_opt_to_type(rbio->opts.compression);
-
-	op->write.move_dev	= -1;
-	op->write.op.devs	= c->fastest_devs;
-	op->write.op.write_point = writepoint_hashed((unsigned long) current);
-	op->write.op.flags	|= BCH_WRITE_ALLOC_NOWAIT;
-	op->write.op.flags	|= BCH_WRITE_CACHED;
-
-	bch2_migrate_write_init(&op->write, rbio);
+	bch2_migrate_read_done(&op->write, rbio);
 
 	closure_init(cl, NULL);
 	closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
@@ -932,13 +910,16 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
 * XXX: multiple promotes can race with each other, wastefully. Keep a list of
 * outstanding promotes?
 */
-static struct promote_op *promote_alloc(struct bch_read_bio *rbio)
+static struct promote_op *promote_alloc(struct bch_read_bio *rbio,
+					struct bkey_s_c k)
 {
+	struct bch_fs *c = rbio->c;
 	struct promote_op *op;
 	struct bio *bio;
 	/* data might have to be decompressed in the write path: */
 	unsigned pages = DIV_ROUND_UP(rbio->pick.crc.uncompressed_size,
 				      PAGE_SECTORS);
+	int ret;
 
 	BUG_ON(!rbio->bounce);
 	BUG_ON(pages < rbio->bio.bi_vcnt);
@@ -954,6 +935,14 @@ static struct promote_op *promote_alloc(struct bch_read_bio *rbio)
 	memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
 	       sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
 
+	ret = bch2_migrate_write_init(c, &op->write, c->fastest_devs,
+				      writepoint_hashed((unsigned long) current),
+				      rbio->opts,
+				      DATA_PROMOTE,
+				      (struct data_opts) { 0 },
+				      k);
+	BUG_ON(ret);
+
 	return op;
 }
 
@@ -1407,7 +1396,7 @@ noclone:
 	rbio->pick		= *pick;
 	rbio->pos		= pos;
 	rbio->version		= e.k->version;
-	rbio->promote		= promote ? promote_alloc(rbio) : NULL;
+	rbio->promote		= promote ? promote_alloc(rbio, e.s_c) : NULL;
 	INIT_WORK(&rbio->work, NULL);
 
 	bio_set_dev(&rbio->bio, pick->ca->disk_sb.bdev);
diff --git a/libbcachefs/io.h b/libbcachefs/io.h
index 71eee4f..4208fd4 100644
--- a/libbcachefs/io.h
+++ b/libbcachefs/io.h
@@ -70,7 +70,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c)
 	op->error		= 0;
 	op->csum_type		= bch2_data_checksum_type(c, c->opts.data_checksum);
 	op->compression_type	=
-		bch2_compression_opt_to_type(c->opts.compression);
+		bch2_compression_opt_to_type[c->opts.compression];
 	op->nr_replicas		= 0;
 	op->nr_replicas_required = c->opts.data_replicas_required;
 	op->alloc_reserve	= RESERVE_NONE;
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index a1e4562..8ce1745 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -1046,12 +1046,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 
 		if (!degraded &&
 		    (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
-		     fsck_err_on(!bch2_sb_has_replicas(c, BCH_DATA_JOURNAL,
+		     fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL,
 						       i->devs), c,
 				 "superblock not marked as containing replicas (type %u)",
 				 BCH_DATA_JOURNAL))) {
-			ret = bch2_check_mark_super(c, BCH_DATA_JOURNAL,
-						    i->devs);
+			ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs);
 			if (ret)
 				return ret;
 		}
@@ -2232,7 +2231,7 @@ static void journal_write_done(struct closure *cl)
 		goto err;
 	}
 
-	if (bch2_check_mark_super(c, BCH_DATA_JOURNAL, devs))
+	if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs))
 		goto err;
 out:
 	__bch2_time_stats_update(j->write_time, j->write_start_time);
@@ -2851,7 +2850,7 @@ int bch2_journal_flush_device(struct journal *j, int dev_idx)
 		seq++;
 
 		spin_unlock(&j->lock);
-		ret = bch2_check_mark_super(c, BCH_DATA_JOURNAL, devs);
+		ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
 		spin_lock(&j->lock);
 	}
 	spin_unlock(&j->lock);
@@ -2946,7 +2945,11 @@ void bch2_fs_journal_exit(struct journal *j)
 
 int bch2_fs_journal_init(struct journal *j)
 {
+	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	static struct lock_class_key res_key;
+	int ret = 0;
+
+	pr_verbose_init(c->opts, "");
 
 	spin_lock_init(&j->lock);
 	spin_lock_init(&j->err_lock);
@@ -2972,12 +2975,15 @@ int bch2_fs_journal_init(struct journal *j)
 
 	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
 	    !(j->buf[0].data = kvpmalloc(j->buf[0].size, GFP_KERNEL)) ||
-	    !(j->buf[1].data = kvpmalloc(j->buf[1].size, GFP_KERNEL)))
-		return -ENOMEM;
+	    !(j->buf[1].data = kvpmalloc(j->buf[1].size, GFP_KERNEL))) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	j->pin.front = j->pin.back = 1;
-
-	return 0;
+out:
+	pr_verbose_init(c->opts, "ret %i", ret);
+	return ret;
 }
 
 /* debug: */
diff --git a/libbcachefs/keylist.h b/libbcachefs/keylist.h
index b7c8a86..a8c8883 100644
--- a/libbcachefs/keylist.h
+++ b/libbcachefs/keylist.h
@@ -58,6 +58,17 @@ static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
 #define keylist_single(k)					\
 	((struct keylist) { .keys = k, .top = bkey_next(k) })
 
+static inline u64 keylist_sectors(struct keylist *keys)
+{
+	struct bkey_i *k;
+	u64 ret = 0;
+
+	for_each_keylist_key(keys, k)
+		ret += k->k.size;
+
+	return ret;
+}
+
 #ifdef CONFIG_BCACHEFS_DEBUG
 void bch2_verify_keylist_sorted(struct keylist *);
 #else
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index 9c2920c..9200ed9 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -13,118 +13,6 @@
 #include "move.h"
 #include "super-io.h"
 
-static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
-{
-	struct bch_dev *ca = arg;
-
-	return bch2_extent_has_device(e, ca->dev_idx);
-}
-
-#define MAX_DATA_OFF_ITER	10
-
-static int bch2_dev_usrdata_migrate(struct bch_fs *c, struct bch_dev *ca,
-				    int flags)
-{
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	struct bch_move_stats stats;
-	unsigned pass = 0;
-	int ret = 0;
-
-	if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_USER)))
-		return 0;
-
-	/*
-	 * XXX: we should be able to do this in one pass, but bch2_move_data()
-	 * can spuriously fail to move an extent due to racing with other move
-	 * operations
-	 */
-	do {
-		memset(&stats, 0, sizeof(stats));
-
-		ret = bch2_move_data(c, NULL,
-				     SECTORS_IN_FLIGHT_PER_DEVICE,
-				     NULL,
-				     writepoint_hashed((unsigned long) current),
-				     0,
-				     ca->dev_idx,
-				     POS_MIN, POS_MAX,
-				     migrate_pred, ca,
-				     &stats);
-		if (ret) {
-			bch_err(c, "error migrating data: %i", ret);
-			return ret;
-		}
-	} while (atomic64_read(&stats.keys_moved) && pass++ < MAX_DATA_OFF_ITER);
-
-	if (atomic64_read(&stats.keys_moved)) {
-		bch_err(c, "unable to migrate all data in %d iterations",
-			MAX_DATA_OFF_ITER);
-		return -1;
-	}
-
-	mutex_lock(&c->replicas_gc_lock);
-	bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
-
-	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH, k) {
-		ret = bch2_check_mark_super(c, BCH_DATA_USER, bch2_bkey_devs(k));
-		if (ret) {
-			bch_err(c, "error migrating data %i from check_mark_super()", ret);
-			break;
-		}
-	}
-
-	bch2_replicas_gc_end(c, ret);
-	mutex_unlock(&c->replicas_gc_lock);
-	return ret;
-}
-
-static int bch2_dev_metadata_migrate(struct bch_fs *c, struct bch_dev *ca,
-				     int flags)
-{
-	struct btree_iter iter;
-	struct btree *b;
-	int ret = 0;
-	unsigned id;
-
-	if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_BTREE)))
-		return 0;
-
-	mutex_lock(&c->replicas_gc_lock);
-	bch2_replicas_gc_start(c, 1 << BCH_DATA_BTREE);
-
-	for (id = 0; id < BTREE_ID_NR; id++) {
-		for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
-			struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
-
-			if (!bch2_extent_has_device(e, ca->dev_idx))
-				continue;
-
-			ret = bch2_btree_node_rewrite(c, &iter, b->data->keys.seq, 0);
-			if (ret) {
-				bch2_btree_iter_unlock(&iter);
-				goto err;
-			}
-		}
-		ret = bch2_btree_iter_unlock(&iter);
-		if (ret)
-			goto err;
-	}
-err:
-	bch2_replicas_gc_end(c, ret);
-	mutex_unlock(&c->replicas_gc_lock);
-	return ret;
-}
-
-int bch2_dev_data_migrate(struct bch_fs *c, struct bch_dev *ca, int flags)
-{
-	BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW &&
-	       bch2_dev_is_online(ca));
-
-	return bch2_dev_usrdata_migrate(c, ca, flags) ?:
-		bch2_dev_metadata_migrate(c, ca, flags);
-}
-
 static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s_extent e,
 			 unsigned dev_idx, int flags, bool metadata)
 {
@@ -152,7 +40,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 	int ret = 0;
 
 	mutex_lock(&c->replicas_gc_lock);
-	bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
+	bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED));
 
 	bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
 			     BTREE_ITER_PREFETCH);
@@ -161,8 +49,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 	       !(ret = btree_iter_err(k))) {
 		if (!bkey_extent_is_data(k.k) ||
 		    !bch2_extent_has_device(bkey_s_c_to_extent(k), dev_idx)) {
-			ret = bch2_check_mark_super(c, BCH_DATA_USER,
-						    bch2_bkey_devs(k));
+			ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k);
 			if (ret)
 				break;
 			bch2_btree_iter_next(&iter);
@@ -183,8 +70,8 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 		 */
 		bch2_extent_normalize(c, e.s);
 
-		ret = bch2_check_mark_super(c, BCH_DATA_USER,
-				bch2_bkey_devs(bkey_i_to_s_c(&tmp.key)));
+		ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER,
+					      bkey_i_to_s_c(&tmp.key));
 		if (ret)
 			break;
 
@@ -240,8 +127,8 @@ retry:
 					dev_idx)) {
 			bch2_btree_iter_set_locks_want(&iter, 0);
 
-			ret = bch2_check_mark_super(c, BCH_DATA_BTREE,
-					bch2_bkey_devs(bkey_i_to_s_c(&b->key)));
+			ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
+						      bkey_i_to_s_c(&b->key));
 			if (ret)
 				goto err;
 		} else {
diff --git a/libbcachefs/migrate.h b/libbcachefs/migrate.h
index 6db7b91..de2faab 100644
--- a/libbcachefs/migrate.h
+++ b/libbcachefs/migrate.h
@@ -1,7 +1,6 @@
 #ifndef _BCACHEFS_MIGRATE_H
 #define _BCACHEFS_MIGRATE_H
 
-int bch2_dev_data_migrate(struct bch_fs *, struct bch_dev *, int);
 int bch2_dev_data_drop(struct bch_fs *, unsigned, int);
 
 #endif /* _BCACHEFS_MIGRATE_H */
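The move.c rework below replaces the boolean move predicates with a single move_pred_fn callback that both classifies each extent (DATA_SKIP, DATA_ADD_REPLICAS, DATA_REWRITE, DATA_PROMOTE) and fills in per-move options, so one data-move engine serves rereplication, device evacuation, copygc and promotion. A compact sketch of that callback protocol with simplified stand-in types (the real signatures are in move.h):

    enum data_cmd { DATA_SKIP, DATA_ADD_REPLICAS, DATA_REWRITE, DATA_PROMOTE };

    struct data_opts {
    	unsigned	rewrite_dev;
    	int		btree_insert_flags;
    };

    struct extent;	/* stand-in for struct bkey_s_c_extent */

    /* One callback both decides the action and configures it. */
    typedef enum data_cmd (*move_pred_fn)(void *arg, const struct extent *e,
    				      struct data_opts *opts);

    /* Example policy: rewrite everything that touches a given device. */
    static enum data_cmd evacuate_pred(void *arg, const struct extent *e,
    				   struct data_opts *opts)
    {
    	unsigned *dev = arg;

    	(void) e;	/* a real policy would inspect the extent's pointers */
    	opts->btree_insert_flags = 0;
    	opts->rewrite_dev = *dev;
    	return DATA_REWRITE;
    }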
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index e5a46ba..a176484 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -58,6 +58,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
 		BKEY_PADDED(k) _new, _insert;
 		struct bch_extent_ptr *ptr;
 		struct bch_extent_crc_unpacked crc;
+		unsigned nr_dirty;
 		bool did_work = false;
 
 		if (btree_iter_err(k)) {
@@ -71,6 +72,11 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
 					m->ptr, m->offset))
 			goto nomatch;
 
+		if (m->data_cmd == DATA_REWRITE &&
+		    !bch2_extent_has_device(bkey_s_c_to_extent(k),
+					    m->data_opts.rewrite_dev))
+			goto nomatch;
+
 		bkey_reassemble(&_insert.k, k);
 		insert = bkey_i_to_extent(&_insert.k);
 
@@ -81,11 +87,12 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
 		bch2_cut_back(new->k.p, &insert->k);
 		bch2_cut_back(insert->k.p, &new->k);
 
-		if (m->move_dev >= 0 &&
-		    (ptr = (struct bch_extent_ptr *)
-		     bch2_extent_has_device(extent_i_to_s_c(insert),
-					    m->move_dev)))
+		if (m->data_cmd == DATA_REWRITE) {
+			ptr = (struct bch_extent_ptr *)
+				bch2_extent_has_device(extent_i_to_s_c(insert),
+						       m->data_opts.rewrite_dev);
 			bch2_extent_drop_ptr(extent_i_to_s(insert), ptr);
+		}
 
 		extent_for_each_ptr_crc(extent_i_to_s(new), ptr, crc) {
 			if (bch2_extent_has_device(extent_i_to_s_c(insert), ptr->dev)) {
@@ -108,10 +115,35 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
 		bch2_extent_narrow_crcs(insert,
 				(struct bch_extent_crc_unpacked) { 0 });
 		bch2_extent_normalize(c, extent_i_to_s(insert).s);
-		bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert));
+		bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert),
+						 c->opts.data_replicas);
+
+		/*
+		 * It's possible we race, and for whatever reason the extent now
+		 * has fewer replicas than when we last looked at it - meaning
+		 * we need to get a disk reservation here:
+		 */
+		nr_dirty = bch2_extent_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i));
+		if (m->nr_ptrs_reserved < nr_dirty) {
+			unsigned sectors = (nr_dirty - m->nr_ptrs_reserved) *
+				keylist_sectors(keys);
+
+			/*
+			 * can't call bch2_disk_reservation_add() with btree
+			 * locks held, at least not without a song and dance
+			 */
+			bch2_btree_iter_unlock(&iter);
+
+			ret = bch2_disk_reservation_add(c, &op->res, sectors, 0);
+			if (ret)
+				goto out;
+
+			m->nr_ptrs_reserved = nr_dirty;
+			goto next;
+		}
 
-		ret = bch2_check_mark_super(c, BCH_DATA_USER,
-				bch2_extent_devs(extent_i_to_s_c(insert)));
+		ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER,
+					      extent_i_to_s_c(insert).s_c);
 		if (ret)
 			break;
 
@@ -119,7 +151,8 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
 				NULL, op_journal_seq(op),
 				BTREE_INSERT_ATOMIC|
 				BTREE_INSERT_NOFAIL|
-				m->btree_insert_flags,
+				BTREE_INSERT_USE_RESERVE|
+				m->data_opts.btree_insert_flags,
 				BTREE_INSERT_ENTRY(&iter, &insert->k_i));
 		if (!ret)
 			atomic_long_inc(&c->extent_migrate_done);
@@ -150,8 +183,7 @@ out:
 	return ret;
 }
 
-void bch2_migrate_write_init(struct migrate_write *m,
-			     struct bch_read_bio *rbio)
+void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio)
 {
 	/* write bio must own pages: */
 	BUG_ON(!m->op.wbio.bio.bi_vcnt);
@@ -162,16 +194,39 @@ void bch2_migrate_write_init(struct migrate_write *m,
 	m->op.pos	= rbio->pos;
 	m->op.version	= rbio->version;
 	m->op.crc	= rbio->pick.crc;
+	m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
 
 	if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) {
 		m->op.nonce	= m->op.crc.nonce + m->op.crc.offset;
 		m->op.csum_type = m->op.crc.csum_type;
 	}
 
-	if (m->move_dev >= 0)
-		bch2_dev_list_drop_dev(&m->op.devs_have, m->move_dev);
+	if (m->data_cmd == DATA_REWRITE)
+		bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev);
+}
+
+int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
+			    struct bch_devs_mask *devs,
+			    struct write_point_specifier wp,
+			    struct bch_io_opts io_opts,
+			    enum data_cmd data_cmd,
+			    struct data_opts data_opts,
+			    struct bkey_s_c k)
+{
+	int ret;
 
-	if (m->btree_insert_flags & BTREE_INSERT_USE_RESERVE)
+	m->data_cmd	= data_cmd;
+	m->data_opts	= data_opts;
+	m->nr_ptrs_reserved = bch2_extent_nr_dirty_ptrs(k);
+
+	bch2_write_op_init(&m->op, c);
+	m->op.csum_type = bch2_data_checksum_type(c, io_opts.data_checksum);
+	m->op.compression_type =
+		bch2_compression_opt_to_type[io_opts.compression];
+	m->op.devs	= devs;
+	m->op.write_point = wp;
+
+	if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE)
 		m->op.alloc_reserve = RESERVE_MOVINGGC;
 
 	m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS|
@@ -180,10 +235,35 @@ void bch2_migrate_write_init(struct migrate_write *m,
 		BCH_WRITE_DATA_ENCODED|
 		BCH_WRITE_NOMARK_REPLICAS;
 
-	m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9;
 	m->op.nr_replicas	= 1;
 	m->op.nr_replicas_required = 1;
 	m->op.index_update_fn	= bch2_migrate_index_update;
+
+	switch (data_cmd) {
+	case DATA_ADD_REPLICAS:
+		if (m->nr_ptrs_reserved < c->opts.data_replicas) {
+			m->op.nr_replicas = c->opts.data_replicas - m->nr_ptrs_reserved;
+
+			ret = bch2_disk_reservation_get(c, &m->op.res,
+							k.k->size,
+							m->op.nr_replicas, 0);
+			if (ret)
+				return ret;
+
+			m->nr_ptrs_reserved = c->opts.data_replicas;
+		}
+		break;
+	case DATA_REWRITE:
+		break;
+	case DATA_PROMOTE:
+		m->op.flags |= BCH_WRITE_ALLOC_NOWAIT;
+		m->op.flags |= BCH_WRITE_CACHED;
+		break;
+	default:
+		BUG();
+	}
+
+	return 0;
 }
 
 static void move_free(struct closure *cl)
@@ -210,7 +290,7 @@ static void move_write(struct closure *cl)
 	struct moving_io *io = container_of(cl, struct moving_io, cl);
 
 	if (likely(!io->rbio.bio.bi_status)) {
-		bch2_migrate_write_init(&io->write, &io->rbio);
+		bch2_migrate_read_done(&io->write, &io->rbio);
 		closure_call(&io->write.op.cl, bch2_write, NULL, cl);
 	}
 
@@ -238,19 +318,19 @@ static void move_read_endio(struct bio *bio)
 }
 
 static int bch2_move_extent(struct bch_fs *c,
-			    struct moving_context *ctxt,
-			    struct bch_devs_mask *devs,
-			    struct write_point_specifier wp,
-			    int btree_insert_flags,
-			    int move_device,
-			    struct bch_io_opts opts,
-			    struct bkey_s_c_extent e)
+			    struct moving_context *ctxt,
+			    struct bch_devs_mask *devs,
+			    struct write_point_specifier wp,
+			    struct bch_io_opts io_opts,
+			    struct bkey_s_c_extent e,
+			    enum data_cmd data_cmd,
+			    struct data_opts data_opts)
 {
 	struct extent_pick_ptr pick;
 	struct moving_io *io;
 	const struct bch_extent_ptr *ptr;
 	struct bch_extent_crc_unpacked crc;
-	unsigned sectors = e.k->size, pages, nr_good;
+	unsigned sectors = e.k->size, pages;
 	int ret = -ENOMEM;
 
 	bch2_extent_pick_ptr(c, e.s_c, NULL, &pick);
@@ -279,7 +359,7 @@ static int bch2_move_extent(struct bch_fs *c,
 	if (bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL))
 		goto err_free;
 
-	io->rbio.opts = opts;
+	io->rbio.opts = io_opts;
 	bio_init(&io->rbio.bio, io->bi_inline_vecs, pages);
 	bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));
 	io->rbio.bio.bi_iter.bi_size = sectors << 9;
@@ -288,27 +368,10 @@ static int bch2_move_extent(struct bch_fs *c,
 	io->rbio.bio.bi_iter.bi_sector	= bkey_start_offset(e.k);
 	io->rbio.bio.bi_end_io		= move_read_endio;
 
-	io->write.btree_insert_flags = btree_insert_flags;
-	io->write.move_dev	= move_device;
-
-	bch2_write_op_init(&io->write.op, c);
-	io->write.op.csum_type = bch2_data_checksum_type(c, opts.data_checksum);
-	io->write.op.compression_type =
-		bch2_compression_opt_to_type(opts.compression);
-	io->write.op.devs	= devs;
-	io->write.op.write_point = wp;
-
-	if (move_device < 0 &&
-	    ((nr_good = bch2_extent_nr_good_ptrs(c, e)) <
-	     c->opts.data_replicas)) {
-		io->write.op.nr_replicas = c->opts.data_replicas - nr_good;
-
-		ret = bch2_disk_reservation_get(c, &io->write.op.res,
-						e.k->size,
-						io->write.op.nr_replicas, 0);
-		if (ret)
-			goto err_free_pages;
-	}
+	ret = bch2_migrate_write_init(c, &io->write, devs, wp,
+				      io_opts, data_cmd, data_opts, e.s_c);
+	if (ret)
+		goto err_free_pages;
 
 	atomic64_inc(&ctxt->stats->keys_moved);
 	atomic64_add(e.k->size, &ctxt->stats->sectors_moved);
@@ -369,8 +432,6 @@ int bch2_move_data(struct bch_fs *c,
 		   unsigned sectors_in_flight,
 		   struct bch_devs_mask *devs,
 		   struct write_point_specifier wp,
-		   int btree_insert_flags,
-		   int move_device,
 		   struct bpos start,
 		   struct bpos end,
 		   move_pred_fn pred, void *arg,
@@ -378,12 +439,14 @@ int bch2_move_data(struct bch_fs *c,
 {
 	bool kthread = (current->flags & PF_KTHREAD) != 0;
 	struct moving_context ctxt = { .stats = stats };
-	struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts);
+	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
 	BKEY_PADDED(k) tmp;
 	struct bkey_s_c k;
 	struct bkey_s_c_extent e;
+	struct data_opts data_opts;
+	enum data_cmd data_cmd;
 	u64 cur_inum = U64_MAX;
-	int ret = 0;
+	int ret = 0, ret2;
 
 	closure_init_stack(&ctxt.cl);
 	INIT_LIST_HEAD(&ctxt.reads);
@@ -430,28 +493,44 @@ peek:
 			/* don't hold btree locks while looking up inode: */
 			bch2_btree_iter_unlock(&stats->iter);
 
-			opts = bch2_opts_to_inode_opts(c->opts);
+			io_opts = bch2_opts_to_inode_opts(c->opts);
 			if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode))
-				bch2_io_opts_apply(&opts, bch2_inode_opts_get(&inode));
+				bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode));
 			cur_inum = k.k->p.inode;
 			goto peek;
 		}
 
-		if (!pred(arg, e))
+		switch ((data_cmd = pred(c, arg, BKEY_TYPE_EXTENTS, e,
+					 &io_opts, &data_opts))) {
+		case DATA_SKIP:
 			goto next;
+		case DATA_SCRUB:
+			BUG();
+		case DATA_ADD_REPLICAS:
+		case DATA_REWRITE:
+		case DATA_PROMOTE:
+			break;
+		default:
+			BUG();
+		}
 
 		/* unlock before doing IO: */
 		bkey_reassemble(&tmp.k, k);
 		k = bkey_i_to_s_c(&tmp.k);
 		bch2_btree_iter_unlock(&stats->iter);
 
-		if (bch2_move_extent(c, &ctxt, devs, wp,
-				     btree_insert_flags,
-				     move_device, opts,
-				     bkey_s_c_to_extent(k))) {
-			/* memory allocation failure, wait for some IO to finish */
-			bch2_move_ctxt_wait_for_io(&ctxt);
-			continue;
+		ret2 = bch2_move_extent(c, &ctxt, devs, wp, io_opts,
+					bkey_s_c_to_extent(k),
+					data_cmd, data_opts);
+		if (ret2) {
+			if (ret2 == -ENOMEM) {
+				/* memory allocation failure, wait for some IO to finish */
+				bch2_move_ctxt_wait_for_io(&ctxt);
+				continue;
+			}
+
+			/* XXX signal failure */
+			goto next;
 		}
 
 		if (rate)
@@ -486,11 +565,11 @@ static int bch2_gc_data_replicas(struct bch_fs *c)
 	int ret;
 
 	mutex_lock(&c->replicas_gc_lock);
-	bch2_replicas_gc_start(c, 1 << BCH_DATA_USER);
+	bch2_replicas_gc_start(c, (1 << BCH_DATA_USER)|(1 << BCH_DATA_CACHED));
 
 	for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN,
 			   BTREE_ITER_PREFETCH, k) {
-		ret = bch2_check_mark_super(c, BCH_DATA_USER, bch2_bkey_devs(k));
+		ret = bch2_mark_bkey_replicas(c, BCH_DATA_USER, k);
 		if (ret)
 			break;
 	}
@@ -514,8 +593,8 @@ static int bch2_gc_btree_replicas(struct bch_fs *c)
 
 	for (id = 0; id < BTREE_ID_NR; id++) {
 		for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
-			ret = bch2_check_mark_super(c, BCH_DATA_BTREE,
-					bch2_bkey_devs(bkey_i_to_s_c(&b->key)));
+			ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
+						      bkey_i_to_s_c(&b->key));
 
 			bch2_btree_iter_cond_resched(&iter);
 		}
@@ -534,18 +613,35 @@ static int bch2_move_btree(struct bch_fs *c,
 			   void *arg,
 			   struct bch_move_stats *stats)
 {
+	struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
 	struct btree *b;
 	unsigned id;
+	struct data_opts data_opts;
+	enum data_cmd cmd;
 	int ret = 0;
 
 	stats->data_type = BCH_DATA_BTREE;
 
 	for (id = 0; id < BTREE_ID_NR; id++) {
 		for_each_btree_node(&stats->iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) {
-			if (pred(arg, bkey_i_to_s_c_extent(&b->key)))
-				ret = bch2_btree_node_rewrite(c, &stats->iter,
-						b->data->keys.seq, 0) ?: ret;
+			switch ((cmd = pred(c, arg, BKEY_TYPE_BTREE,
+					    bkey_i_to_s_c_extent(&b->key),
+					    &io_opts,
+					    &data_opts))) {
+			case DATA_SKIP:
+				goto next;
+			case DATA_SCRUB:
+				BUG();
+			case DATA_ADD_REPLICAS:
+			case DATA_REWRITE:
+				break;
+			default:
+				BUG();
+			}
 
+			ret = bch2_btree_node_rewrite(c, &stats->iter,
+					b->data->keys.seq, 0) ?: ret;
+next:
 			bch2_btree_iter_cond_resched(&stats->iter);
 		}
 
@@ -556,32 +652,48 @@ static int bch2_move_btree(struct bch_fs *c,
 }
 
 #if 0
-static bool scrub_data_pred(void *arg, struct bkey_s_c_extent e)
+static enum data_cmd scrub_pred(struct bch_fs *c, void *arg,
+				enum bkey_type type,
+				struct bkey_s_c_extent e,
+				struct bch_io_opts *io_opts,
+				struct data_opts *data_opts)
 {
+	return DATA_SCRUB;
 }
 #endif
 
-static bool rereplicate_metadata_pred(void *arg, struct bkey_s_c_extent e)
+static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg,
+				      enum bkey_type type,
+				      struct bkey_s_c_extent e,
+				      struct bch_io_opts *io_opts,
+				      struct data_opts *data_opts)
 {
-	struct bch_fs *c = arg;
 	unsigned nr_good = bch2_extent_nr_good_ptrs(c, e);
+	unsigned replicas = type == BKEY_TYPE_BTREE
+		? c->opts.metadata_replicas
+		: c->opts.data_replicas;
 
-	return nr_good && nr_good < c->opts.metadata_replicas;
-}
+	if (!nr_good || nr_good >= replicas)
+		return DATA_SKIP;
 
-static bool rereplicate_data_pred(void *arg, struct bkey_s_c_extent e)
-{
-	struct bch_fs *c = arg;
-	unsigned nr_good = bch2_extent_nr_good_ptrs(c, e);
-
-	return nr_good && nr_good < c->opts.data_replicas;
+	data_opts->btree_insert_flags = 0;
+	return DATA_ADD_REPLICAS;
 }
 
-static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
+static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
+				  enum bkey_type type,
+				  struct bkey_s_c_extent e,
+				  struct bch_io_opts *io_opts,
+				  struct data_opts *data_opts)
 {
 	struct bch_ioctl_data *op = arg;
 
-	return bch2_extent_has_device(e, op->migrate.dev);
+	if (!bch2_extent_has_device(e, op->migrate.dev))
+		return DATA_SKIP;
+
+	data_opts->btree_insert_flags	= 0;
+	data_opts->rewrite_dev		= op->migrate.dev;
+	return DATA_REWRITE;
 }
 
 int bch2_data_job(struct bch_fs *c,
@@ -595,16 +707,15 @@ int bch2_data_job(struct bch_fs *c,
 		stats->data_type = BCH_DATA_JOURNAL;
 		ret = bch2_journal_flush_device(&c->journal, -1);
 
-		ret = bch2_move_btree(c, rereplicate_metadata_pred, c, stats) ?: ret;
+		ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
 		ret = bch2_gc_btree_replicas(c) ?: ret;
 
 		ret = bch2_move_data(c, NULL,
 				     SECTORS_IN_FLIGHT_PER_DEVICE,
 				     NULL,
 				     writepoint_hashed((unsigned long) current),
-				     0, -1,
 				     op.start,
 				     op.end,
-				     rereplicate_data_pred, c, stats) ?: ret;
+				     rereplicate_pred, c, stats) ?: ret;
 		ret = bch2_gc_data_replicas(c) ?: ret;
 		break;
 	case BCH_DATA_OP_MIGRATE:
@@ -620,7 +731,6 @@ int bch2_data_job(struct bch_fs *c,
 		ret = bch2_move_data(c, NULL,
 				     SECTORS_IN_FLIGHT_PER_DEVICE,
 				     NULL,
 				     writepoint_hashed((unsigned long) current),
-				     0, -1,
 				     op.start,
 				     op.end,
 				     migrate_pred, &op, stats) ?: ret;
+                      struct bkey_s_c_extent e,
+                      struct bch_io_opts *io_opts,
+                      struct data_opts *data_opts)
 {
-    struct bch_fs *c = arg;
     unsigned nr_good = bch2_extent_nr_good_ptrs(c, e);
+    unsigned replicas = type == BKEY_TYPE_BTREE
+        ? c->opts.metadata_replicas
+        : c->opts.data_replicas;
 
-    return nr_good && nr_good < c->opts.metadata_replicas;
-}
+    if (!nr_good || nr_good >= replicas)
+        return DATA_SKIP;
 
-static bool rereplicate_data_pred(void *arg, struct bkey_s_c_extent e)
-{
-    struct bch_fs *c = arg;
-    unsigned nr_good = bch2_extent_nr_good_ptrs(c, e);
-
-    return nr_good && nr_good < c->opts.data_replicas;
+    data_opts->btree_insert_flags = 0;
+    return DATA_ADD_REPLICAS;
 }
 
-static bool migrate_pred(void *arg, struct bkey_s_c_extent e)
+static enum data_cmd migrate_pred(struct bch_fs *c, void *arg,
+                  enum bkey_type type,
+                  struct bkey_s_c_extent e,
+                  struct bch_io_opts *io_opts,
+                  struct data_opts *data_opts)
 {
     struct bch_ioctl_data *op = arg;
 
-    return bch2_extent_has_device(e, op->migrate.dev);
+    if (!bch2_extent_has_device(e, op->migrate.dev))
+        return DATA_SKIP;
+
+    data_opts->btree_insert_flags = 0;
+    data_opts->rewrite_dev = op->migrate.dev;
+    return DATA_REWRITE;
 }
 
 int bch2_data_job(struct bch_fs *c,
@@ -595,16 +707,15 @@ int bch2_data_job(struct bch_fs *c,
         stats->data_type = BCH_DATA_JOURNAL;
         ret = bch2_journal_flush_device(&c->journal, -1);
 
-        ret = bch2_move_btree(c, rereplicate_metadata_pred, c, stats) ?: ret;
+        ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
         ret = bch2_gc_btree_replicas(c) ?: ret;
 
         ret = bch2_move_data(c, NULL,
                      SECTORS_IN_FLIGHT_PER_DEVICE,
                      NULL,
                      writepoint_hashed((unsigned long) current),
-                     0, -1,
                      op.start,
                      op.end,
-                     rereplicate_data_pred, c, stats) ?: ret;
+                     rereplicate_pred, c, stats) ?: ret;
         ret = bch2_gc_data_replicas(c) ?: ret;
         break;
     case BCH_DATA_OP_MIGRATE:
@@ -620,7 +731,6 @@ int bch2_data_job(struct bch_fs *c,
         ret = bch2_move_data(c, NULL,
                      SECTORS_IN_FLIGHT_PER_DEVICE,
                      NULL,
                      writepoint_hashed((unsigned long) current),
-                     0, -1,
                      op.start,
                      op.end,
                      migrate_pred, &op, stats) ?: ret;
diff --git a/libbcachefs/move.h b/libbcachefs/move.h
index 07aa566..819e5d9 100644
--- a/libbcachefs/move.h
+++ b/libbcachefs/move.h
@@ -8,23 +8,47 @@
 struct bch_read_bio;
 struct moving_context;
 
+enum data_cmd {
+    DATA_SKIP,
+    DATA_SCRUB,
+    DATA_ADD_REPLICAS,
+    DATA_REWRITE,
+    DATA_PROMOTE,
+};
+
+struct data_opts {
+    unsigned        rewrite_dev;
+    int             btree_insert_flags;
+};
+
 struct migrate_write {
+    enum data_cmd           data_cmd;
+    struct data_opts        data_opts;
+
+    unsigned                nr_ptrs_reserved;
+    struct moving_context   *ctxt;
     /* what we read: */
     struct bch_extent_ptr   ptr;
     u64                     offset;
 
-    int                     move_dev;
-    int                     btree_insert_flags;
     struct bch_write_op     op;
 };
 
-void bch2_migrate_write_init(struct migrate_write *, struct bch_read_bio *);
+void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *);
+int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *,
+                struct bch_devs_mask *,
+                struct write_point_specifier,
+                struct bch_io_opts,
+                enum data_cmd, struct data_opts,
+                struct bkey_s_c);
 
 #define SECTORS_IN_FLIGHT_PER_DEVICE    2048
 
-typedef bool (*move_pred_fn)(void *, struct bkey_s_c_extent);
+typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *,
+                enum bkey_type, struct bkey_s_c_extent,
+                struct bch_io_opts *, struct data_opts *);
 
 struct bch_move_stats {
     enum bch_data_type      data_type;
@@ -39,7 +63,7 @@ struct bch_move_stats {
 
 int bch2_move_data(struct bch_fs *, struct bch_ratelimit *,
            unsigned, struct bch_devs_mask *,
           struct write_point_specifier,
-           int, int, struct bpos, struct bpos,
+           struct bpos, struct bpos,
            move_pred_fn, void *, struct bch_move_stats *);
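
The hunks above are the heart of this refactoring: a move_pred_fn no longer answers "should this extent move?" with a bool, it returns an enum data_cmd and fills in *data_opts, so per-extent decisions such as btree_insert_flags and rewrite_dev travel with the verdict instead of being fixed arguments to bch2_move_data(). A minimal sketch of a predicate written against the new interface (evacuate_pred and its dev_idx argument are hypothetical, not part of this patch):

static enum data_cmd evacuate_pred(struct bch_fs *c, void *arg,
                                   enum bkey_type type,
                                   struct bkey_s_c_extent e,
                                   struct bch_io_opts *io_opts,
                                   struct data_opts *data_opts)
{
    unsigned *dev_idx = arg;

    /* nothing on the device being drained? leave the extent alone */
    if (!bch2_extent_has_device(e, *dev_idx))
        return DATA_SKIP;

    data_opts->btree_insert_flags = 0;
    data_opts->rewrite_dev = *dev_idx;
    return DATA_REWRITE;
}

Callers then pass only the key range and the predicate; note how the bch2_data_job() hunks above drop the old "0, -1" (btree_insert_flags, move_device) argument pair.
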
diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c
index 515d500..c306a89 100644
--- a/libbcachefs/movinggc.c
+++ b/libbcachefs/movinggc.c
@@ -61,9 +61,9 @@ static int bucket_offset_cmp(const void *_l, const void *_r, size_t size)
     return (l->offset > r->offset) - (l->offset < r->offset);
 }
 
-static bool copygc_pred(void *arg, struct bkey_s_c_extent e)
+static bool __copygc_pred(struct bch_dev *ca,
+              struct bkey_s_c_extent e)
 {
-    struct bch_dev *ca = arg;
     copygc_heap *h = &ca->copygc_heap;
     const struct bch_extent_ptr *ptr =
         bch2_extent_has_device(e, ca->dev_idx);
@@ -83,6 +83,22 @@ static bool copygc_pred(void *arg, struct bkey_s_c_extent e)
     return false;
 }
 
+static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
+                 enum bkey_type type,
+                 struct bkey_s_c_extent e,
+                 struct bch_io_opts *io_opts,
+                 struct data_opts *data_opts)
+{
+    struct bch_dev *ca = arg;
+
+    if (!__copygc_pred(ca, e))
+        return DATA_SKIP;
+
+    data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE;
+    data_opts->rewrite_dev = ca->dev_idx;
+    return DATA_REWRITE;
+}
+
 static bool have_copygc_reserve(struct bch_dev *ca)
 {
     bool ret;
@@ -165,8 +181,6 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca)
              SECTORS_IN_FLIGHT_PER_DEVICE,
              &ca->self,
              writepoint_ptr(&ca->copygc_write_point),
-             BTREE_INSERT_USE_RESERVE,
-             ca->dev_idx,
              POS_MIN, POS_MAX,
              copygc_pred, ca,
              &move_stats);
diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c
index eae63cf..ec50345 100644
--- a/libbcachefs/opts.c
+++ b/libbcachefs/opts.c
@@ -22,6 +22,7 @@ const char * const bch2_compression_types[] = {
     "none",
     "lz4",
     "gzip",
+    "zstd",
     NULL
 };
diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h
index 5d42dd5..8a3ac66 100644
--- a/libbcachefs/opts.h
+++ b/libbcachefs/opts.h
@@ -73,10 +73,10 @@ enum opt_type {
     BCH_OPT(errors,                 u8,     OPT_RUNTIME,            \
         OPT_STR(bch2_error_actions),                                \
         BCH_SB_ERROR_ACTION,        BCH_ON_ERROR_RO)                \
-    BCH_OPT(metadata_replicas,      u8,     OPT_MOUNT,              \
+    BCH_OPT(metadata_replicas,      u8,     OPT_RUNTIME,            \
         OPT_UINT(1, BCH_REPLICAS_MAX),                              \
         BCH_SB_META_REPLICAS_WANT,  1)                              \
-    BCH_OPT(data_replicas,          u8,     OPT_MOUNT,              \
+    BCH_OPT(data_replicas,          u8,     OPT_RUNTIME,            \
         OPT_UINT(1, BCH_REPLICAS_MAX),                              \
         BCH_SB_DATA_REPLICAS_WANT,  1)                              \
     BCH_OPT(metadata_replicas_required, u8, OPT_MOUNT,              \
@@ -127,6 +127,9 @@ enum opt_type {
     BCH_OPT(verbose_recovery,       u8,     OPT_MOUNT,              \
         OPT_BOOL(),                                                 \
         NO_SB_OPT,                  false)                          \
+    BCH_OPT(verbose_init,           u8,     OPT_MOUNT,              \
+        OPT_BOOL(),                                                 \
+        NO_SB_OPT,                  false)                          \
     BCH_OPT(journal_flush_disabled, u8,     OPT_RUNTIME,            \
         OPT_BOOL(),                                                 \
         NO_SB_OPT,                  false)                          \
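
Two option changes ride along here: "zstd" joins the NULL-terminated bch2_compression_types name table, and metadata_replicas/data_replicas are promoted from OPT_MOUNT to OPT_RUNTIME so they can be changed on a mounted filesystem. For OPT_STR options the index into the name table is the stored option value, so "zstd" parses to 3; roughly (compression_type_by_name is illustrative, not a function from the tree):

#include <string.h>

/* Illustrative lookup over a NULL-terminated name table such as
 * bch2_compression_types[]: the matching index is the option value. */
static int compression_type_by_name(const char * const names[],
                                    const char *name)
{
    int i;

    for (i = 0; names[i]; i++)
        if (!strcmp(names[i], name))
            return i;

    return -1;  /* unknown name */
}
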
diff --git a/libbcachefs/quota.c b/libbcachefs/quota.c
index 6ab2c86..d28f133 100644
--- a/libbcachefs/quota.c
+++ b/libbcachefs/quota.c
@@ -74,13 +74,6 @@ static inline unsigned __next_qtype(unsigned i, unsigned qtypes)
          _i < QTYP_NR);                                             \
          _i++)
 
-static inline unsigned enabled_qtypes(struct bch_fs *c)
-{
-    return ((c->opts.usrquota << QTYP_USR)|
-        (c->opts.grpquota << QTYP_GRP)|
-        (c->opts.prjquota << QTYP_PRJ));
-}
-
 static bool ignore_hardlimit(struct bch_memquota_type *q)
 {
     if (capable(CAP_SYS_RESOURCE))
@@ -478,7 +471,7 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
     if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota)
         return -EINVAL;
 
-    if (uflags & FS_QUOTA_PDQ_ENFD)
+    if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota)
         return -EINVAL;
 
     mutex_lock(&c->sb_lock);
@@ -487,10 +480,9 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags)
     if (uflags & FS_QUOTA_GDQ_ENFD)
         SET_BCH_SB_GRPQUOTA(c->disk_sb, true);
 
-#if 0
+    if (uflags & FS_QUOTA_PDQ_ENFD)
         SET_BCH_SB_PRJQUOTA(c->disk_sb, true);
-#endif
 
     bch2_write_super(c);
     mutex_unlock(&c->sb_lock);
diff --git a/libbcachefs/quota.h b/libbcachefs/quota.h
index b5536be..509b7f0 100644
--- a/libbcachefs/quota.h
+++ b/libbcachefs/quota.h
@@ -20,6 +20,13 @@ static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u)
     };
 }
 
+static inline unsigned enabled_qtypes(struct bch_fs *c)
+{
+    return ((c->opts.usrquota << QTYP_USR)|
+        (c->opts.grpquota << QTYP_GRP)|
+        (c->opts.prjquota << QTYP_PRJ));
+}
+
 #ifdef CONFIG_BCACHEFS_QUOTA
 
 int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters,
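
enabled_qtypes() moves into quota.h so code outside quota.c can ask "is any quota type enabled?" with a single bitmask test, and the project-quota path in bch2_quota_enable() is enabled instead of being stubbed out under #if 0. A sketch of the bitmask in use (the usage shown is illustrative):

    unsigned qtypes = enabled_qtypes(c);    /* one bit per QTYP_USR/GRP/PRJ */

    if (qtypes & (1U << QTYP_PRJ))
        pr_info("project quotas enabled\n");

The super.c hunks further down switch their "usrquota || grpquota" tests to this helper, which also fixes them to notice prjquota.
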
diff --git a/libbcachefs/siphash.c b/libbcachefs/siphash.c
index d689a7b..3a6c9c8 100644
--- a/libbcachefs/siphash.c
+++ b/libbcachefs/siphash.c
@@ -43,7 +43,6 @@
  * https://131002.net/siphash/
  */
 
-#include
 #include
 #include
 #include
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index f333b8f..c747391 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -546,6 +546,8 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
     __le64 *i;
     int ret;
 
+    pr_verbose_init(*opts, "");
+
     memset(sb, 0, sizeof(*sb));
     sb->mode = FMODE_READ;
 
@@ -566,8 +568,10 @@ int bch2_read_super(const char *path, struct bch_opts *opts,
         opt_set(*opts, nochanges, true);
     }
 
-    if (IS_ERR(sb->bdev))
-        return PTR_ERR(sb->bdev);
+    if (IS_ERR(sb->bdev)) {
+        ret = PTR_ERR(sb->bdev);
+        goto out;
+    }
 
     err = "cannot allocate memory";
     ret = __bch2_super_realloc(sb, 0);
@@ -638,12 +642,14 @@ got_super:
     if (sb->mode & FMODE_WRITE)
         bdev_get_queue(sb->bdev)->backing_dev_info->capabilities
             |= BDI_CAP_STABLE_WRITES;
-
-    return 0;
+    ret = 0;
+out:
+    pr_verbose_init(*opts, "ret %i", ret);
+    return ret;
 err:
     bch2_free_super(sb);
     pr_err("error reading superblock: %s", err);
-    return ret;
+    goto out;
 }
 
 /* write superblock: */
@@ -744,17 +750,15 @@ void bch2_write_super(struct bch_fs *c)
     nr_wrote = dev_mask_nr(&sb_written);
 
     can_mount_with_written =
-        bch2_have_enough_devs(c,
-                      __bch2_replicas_status(c, sb_written),
-                      BCH_FORCE_IF_DEGRADED);
+        bch2_have_enough_devs(__bch2_replicas_status(c, sb_written),
+                      BCH_FORCE_IF_DEGRADED);
 
     for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
         sb_written.d[i] = ~sb_written.d[i];
 
     can_mount_without_written =
-        bch2_have_enough_devs(c,
-                      __bch2_replicas_status(c, sb_written),
-                      BCH_FORCE_IF_DEGRADED);
+        bch2_have_enough_devs(__bch2_replicas_status(c, sb_written),
+                      BCH_FORCE_IF_DEGRADED);
 
     /*
      * If we would be able to mount _without_ the devices we successfully
@@ -1052,7 +1056,7 @@ static bool replicas_has_entry(struct bch_replicas_cpu *r,
 }
 
 noinline
-static int bch2_check_mark_super_slowpath(struct bch_fs *c,
+static int bch2_mark_replicas_slowpath(struct bch_fs *c,
                 struct bch_replicas_cpu_entry new_entry,
                 unsigned max_dev)
 {
@@ -1109,9 +1113,9 @@ err:
     return ret;
 }
 
-int bch2_check_mark_super(struct bch_fs *c,
-              enum bch_data_type data_type,
-              struct bch_devs_list devs)
+int bch2_mark_replicas(struct bch_fs *c,
+               enum bch_data_type data_type,
+               struct bch_devs_list devs)
 {
     struct bch_replicas_cpu_entry search;
     struct bch_replicas_cpu *r, *gc_r;
@@ -1121,6 +1125,8 @@ int bch2_check_mark_super(struct bch_fs *c,
     if (!devs.nr)
         return 0;
 
+    BUG_ON(devs.nr >= BCH_REPLICAS_MAX);
+
     devlist_to_replicas(devs, data_type, &search, &max_dev);
 
     rcu_read_lock();
@@ -1131,7 +1137,23 @@ int bch2_check_mark_super(struct bch_fs *c,
     rcu_read_unlock();
 
     return likely(marked) ? 0
-        : bch2_check_mark_super_slowpath(c, search, max_dev);
+        : bch2_mark_replicas_slowpath(c, search, max_dev);
+}
+
+int bch2_mark_bkey_replicas(struct bch_fs *c,
+                enum bch_data_type data_type,
+                struct bkey_s_c k)
+{
+    struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+    unsigned i;
+    int ret;
+
+    for (i = 0; i < cached.nr; i++)
+        if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
+                bch2_dev_list_single(cached.devs[i]))))
+            return ret;
+
+    return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k));
 }
 
 int bch2_replicas_gc_end(struct bch_fs *c, int err)
@@ -1417,7 +1439,7 @@ int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *r, char *buf, size_t
 
 /* Query replicas: */
 
-bool bch2_sb_has_replicas(struct bch_fs *c,
+bool bch2_replicas_marked(struct bch_fs *c,
               enum bch_data_type data_type,
               struct bch_devs_list devs)
 {
@@ -1438,6 +1460,21 @@ bool bch2_sb_has_replicas(struct bch_fs *c,
     return ret;
 }
 
+bool bch2_bkey_replicas_marked(struct bch_fs *c,
+                   enum bch_data_type data_type,
+                   struct bkey_s_c k)
+{
+    struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+    unsigned i;
+
+    for (i = 0; i < cached.nr; i++)
+        if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
+                      bch2_dev_list_single(cached.devs[i])))
+            return false;
+
+    return bch2_replicas_marked(c, data_type, bch2_bkey_dirty_devs(k));
+}
+
 struct replicas_status __bch2_replicas_status(struct bch_fs *c,
                           struct bch_devs_mask online_devs)
 {
@@ -1495,29 +1532,26 @@ struct replicas_status bch2_replicas_status(struct bch_fs *c)
     return __bch2_replicas_status(c, bch2_online_devs(c));
 }
 
-bool bch2_have_enough_devs(struct bch_fs *c,
-               struct replicas_status s,
-               unsigned flags)
+static bool have_enough_devs(struct replicas_status s,
+                 enum bch_data_type type,
+                 bool force_if_degraded,
+                 bool force_if_lost)
 {
-    if ((s.replicas[BCH_DATA_JOURNAL].nr_offline ||
-         s.replicas[BCH_DATA_BTREE].nr_offline) &&
-        !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
-        return false;
-
-    if ((!s.replicas[BCH_DATA_JOURNAL].nr_online ||
-         !s.replicas[BCH_DATA_BTREE].nr_online) &&
-        !(flags & BCH_FORCE_IF_METADATA_LOST))
-        return false;
-
-    if (s.replicas[BCH_DATA_USER].nr_offline &&
-        !(flags & BCH_FORCE_IF_DATA_DEGRADED))
-        return false;
-
-    if (!s.replicas[BCH_DATA_USER].nr_online &&
-        !(flags & BCH_FORCE_IF_DATA_LOST))
-        return false;
+    return (!s.replicas[type].nr_offline || force_if_degraded) &&
+        (s.replicas[type].nr_online || force_if_lost);
+}
 
-    return true;
+bool bch2_have_enough_devs(struct replicas_status s, unsigned flags)
+{
+    return (have_enough_devs(s, BCH_DATA_JOURNAL,
+                 flags & BCH_FORCE_IF_METADATA_DEGRADED,
+                 flags & BCH_FORCE_IF_METADATA_LOST) &&
+        have_enough_devs(s, BCH_DATA_BTREE,
+                 flags & BCH_FORCE_IF_METADATA_DEGRADED,
+                 flags & BCH_FORCE_IF_METADATA_LOST) &&
+        have_enough_devs(s, BCH_DATA_USER,
+                 flags & BCH_FORCE_IF_DATA_DEGRADED,
+                 flags & BCH_FORCE_IF_DATA_LOST));
 }
 
 unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
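
Two things to note in the super-io.c changes. First, replica tracking now records cached copies separately: each cached pointer becomes its own single-device BCH_DATA_CACHED entry, while dirty pointers are recorded together under the key's data type (which is why the replicas GC earlier in this patch also keeps BCH_DATA_CACHED entries). Second, the four open-coded degraded/lost checks collapse into one have_enough_devs() helper, and bch2_have_enough_devs() drops its unused bch_fs argument. A sketch of how a caller uses the result (may_mount_degraded is hypothetical, not from the patch):

static bool may_mount_degraded(struct bch_fs *c)
{
    struct replicas_status s = bch2_replicas_status(c);

    /* tolerate offline data replicas, but not offline metadata: */
    return bch2_have_enough_devs(s, BCH_FORCE_IF_DATA_DEGRADED);
}
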
diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h
index eb85410..d7fecf0 100644
--- a/libbcachefs/super-io.h
+++ b/libbcachefs/super-io.h
@@ -139,10 +139,14 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
 
 /* BCH_SB_FIELD_replicas: */
 
-bool bch2_sb_has_replicas(struct bch_fs *, enum bch_data_type,
-              struct bch_devs_list);
-int bch2_check_mark_super(struct bch_fs *, enum bch_data_type,
+bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type,
               struct bch_devs_list);
+bool bch2_bkey_replicas_marked(struct bch_fs *, enum bch_data_type,
+                   struct bkey_s_c);
+int bch2_mark_replicas(struct bch_fs *, enum bch_data_type,
+               struct bch_devs_list);
+int bch2_mark_bkey_replicas(struct bch_fs *, enum bch_data_type,
+                struct bkey_s_c);
 
 int bch2_cpu_replicas_to_text(struct bch_replicas_cpu *, char *, size_t);
 int bch2_sb_replicas_to_text(struct bch_sb_field_replicas *, char *, size_t);
@@ -157,7 +161,7 @@ struct replicas_status {
 
 struct replicas_status __bch2_replicas_status(struct bch_fs *,
                           struct bch_devs_mask);
 struct replicas_status bch2_replicas_status(struct bch_fs *);
-bool bch2_have_enough_devs(struct bch_fs *, struct replicas_status, unsigned);
+bool bch2_have_enough_devs(struct replicas_status, unsigned);
 
 unsigned bch2_replicas_online(struct bch_fs *, bool);
 unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *);
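
The header now exposes the marking API at two granularities: the bkey-level helpers (bch2_bkey_replicas_marked() / bch2_mark_bkey_replicas()) for code holding a key, and the devlist-level ones for callers that only have a list of devices. A sketch of the devlist flavor (mark_journal_devs is hypothetical):

static int mark_journal_devs(struct bch_fs *c, struct bch_devs_list devs)
{
    /* record that journal data lives on this set of devices */
    return bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
}
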
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index f836c19..58bcd7d 100644
--- a/libbcachefs/super.c
+++ b/libbcachefs/super.c
@@ -507,9 +507,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
     struct bch_fs *c;
     unsigned i, iter_size;
 
+    pr_verbose_init(opts, "");
+
     c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
     if (!c)
-        return NULL;
+        goto out;
 
     __module_get(THIS_MODULE);
 
@@ -539,7 +541,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
     mutex_init(&c->btree_interior_update_lock);
 
     mutex_init(&c->bio_bounce_pages_lock);
-    mutex_init(&c->zlib_workspace_lock);
 
     bio_list_init(&c->btree_write_error_list);
     spin_lock_init(&c->btree_write_error_lock);
@@ -646,10 +647,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
     kobject_init(&c->internal, &bch2_fs_internal_ktype);
     kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype);
     kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype);
+out:
+    pr_verbose_init(opts, "ret %i", c ? 0 : -ENOMEM);
     return c;
 err:
     bch2_fs_free(c);
-    return NULL;
+    c = NULL;
+    goto out;
 }
 
 static const char *__bch2_fs_online(struct bch_fs *c)
@@ -809,7 +813,7 @@ static const char *__bch2_fs_start(struct bch_fs *c)
             goto err;
         bch_verbose(c, "fsck done");
 
-        if (c->opts.usrquota || c->opts.grpquota) {
+        if (enabled_qtypes(c)) {
             bch_verbose(c, "reading quotas:");
             ret = bch2_fs_quota_read(c);
             if (ret)
@@ -864,7 +868,7 @@ static const char *__bch2_fs_start(struct bch_fs *c)
                   NULL, NULL, NULL, 0))
             goto err;
 
-        if (c->opts.usrquota || c->opts.grpquota) {
+        if (enabled_qtypes(c)) {
             ret = bch2_fs_quota_read(c);
             if (ret)
                 goto err;
@@ -1084,14 +1088,17 @@ static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
 static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
 {
     struct bch_member *member;
-    struct bch_dev *ca;
+    struct bch_dev *ca = NULL;
+    int ret = 0;
+
+    pr_verbose_init(c->opts, "");
 
     if (bch2_fs_init_fault("dev_alloc"))
-        return -ENOMEM;
+        goto err;
 
     ca = kzalloc(sizeof(*ca), GFP_KERNEL);
     if (!ca)
-        return -ENOMEM;
+        goto err;
 
     kobject_init(&ca->kobj, &bch2_dev_ktype);
     init_completion(&ca->ref_completion);
@@ -1133,11 +1140,14 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
 
     if (bch2_dev_sysfs_online(c, ca))
         pr_warn("error creating sysfs objects");
-
-    return 0;
+out:
+    pr_verbose_init(c->opts, "ret %i", ret);
+    return ret;
 err:
-    bch2_dev_free(ca);
-    return -ENOMEM;
+    if (ca)
+        bch2_dev_free(ca);
+    ret = -ENOMEM;
+    goto out;
 }
 
 static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb)
@@ -1240,7 +1250,8 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
         /* do we have enough devices to write to? */
         for_each_member_device(ca2, c, i)
-            nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW;
+            if (ca2 != ca)
+                nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW;
 
         required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
                    ? c->opts.metadata_replicas
@@ -1249,7 +1260,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
                    ? c->opts.data_replicas
                    : c->opts.data_replicas_required);
 
-        return nr_rw - 1 <= required;
+        return nr_rw >= required;
     case BCH_MEMBER_STATE_FAILED:
     case BCH_MEMBER_STATE_SPARE:
         if (ca->mi.state != BCH_MEMBER_STATE_RW &&
@@ -1262,7 +1273,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
 
         s = __bch2_replicas_status(c, new_online_devs);
 
-        return bch2_have_enough_devs(c, s, flags);
+        return bch2_have_enough_devs(s, flags);
     default:
         BUG();
     }
@@ -1299,7 +1310,7 @@ static bool bch2_fs_may_start(struct bch_fs *c)
 
     s = bch2_replicas_status(c);
 
-    return bch2_have_enough_devs(c, s, flags);
+    return bch2_have_enough_devs(s, flags);
 }
 
 static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
@@ -1346,12 +1357,8 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
     if (!bch2_dev_state_allowed(c, ca, new_state, flags))
         return -EINVAL;
 
-    if (new_state == BCH_MEMBER_STATE_RW) {
-        if (__bch2_dev_read_write(c, ca))
-            return -ENOMEM;
-    } else {
+    if (new_state != BCH_MEMBER_STATE_RW)
         __bch2_dev_read_only(c, ca);
-    }
 
     bch_notice(ca, "%s", bch2_dev_state[new_state]);
 
@@ -1361,6 +1368,9 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
     bch2_write_super(c);
     mutex_unlock(&c->sb_lock);
 
+    if (new_state == BCH_MEMBER_STATE_RW)
+        return __bch2_dev_read_write(c, ca) ? -ENOMEM : 0;
+
     return 0;
 }
 
@@ -1701,11 +1711,17 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
     const char *err;
     int ret = -ENOMEM;
 
-    if (!nr_devices)
-        return ERR_PTR(-EINVAL);
+    pr_verbose_init(opts, "");
 
-    if (!try_module_get(THIS_MODULE))
-        return ERR_PTR(-ENODEV);
+    if (!nr_devices) {
+        c = ERR_PTR(-EINVAL);
+        goto out2;
+    }
+
+    if (!try_module_get(THIS_MODULE)) {
+        c = ERR_PTR(-ENODEV);
+        goto out2;
+    }
 
     sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL);
     if (!sb)
@@ -1760,8 +1776,11 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
     if (err)
         goto err_print;
 
+out:
     kfree(sb);
     module_put(THIS_MODULE);
+out2:
+    pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c));
     return c;
 err_print:
     pr_err("bch_fs_open err opening %s: %s",
@@ -1770,12 +1789,10 @@ err_print:
 err:
     if (c)
         bch2_fs_stop(c);
-
     for (i = 0; i < nr_devices; i++)
         bch2_free_super(&sb[i]);
-
     kfree(sb);
-    module_put(THIS_MODULE);
-    return ERR_PTR(ret);
+    c = ERR_PTR(ret);
+    goto out;
 }
 
 static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb,
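
The init paths above all gain entry/exit tracing behind the new verbose_init mount option, which is why their error paths are restructured into single goto out exits where the return value can be logged once. The pr_verbose_init() helper itself is defined elsewhere in this update; something along these lines (this definition is an assumption for illustration, not the actual macro):

/* ASSUMED definition, for illustration only: log function entry/exit
 * when the verbose_init option is set. */
#define pr_verbose_init(opts, fmt, ...)                                 \
do {                                                                    \
    if ((opts).verbose_init)                                            \
        pr_info("%s: " fmt "\n", __func__, ##__VA_ARGS__);              \
} while (0)
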
diff --git a/libbcachefs/super.h b/libbcachefs/super.h
index d0a38cf..1718f5c 100644
--- a/libbcachefs/super.h
+++ b/libbcachefs/super.h
@@ -67,6 +67,11 @@ static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
     devs->devs[devs->nr++] = dev;
 }
 
+static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
+{
+    return (struct bch_devs_list) { .nr = 1, .devs[0] = dev };
+}
+
 static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
                           const struct bch_devs_mask *mask)
 {
diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h
index 966da4a..d76d917 100644
--- a/libbcachefs/super_types.h
+++ b/libbcachefs/super_types.h
@@ -15,7 +15,7 @@ struct bch_devs_mask {
 
 struct bch_devs_list {
     u8          nr;
-    u8          devs[BCH_REPLICAS_MAX];
+    u8          devs[BCH_REPLICAS_MAX + 1];
 };
 
 struct bch_member_cpu {
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 597e1f0..2e958a8 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -164,6 +164,8 @@ read_attribute(extent_migrate_raced);
 rw_attribute(journal_write_delay_ms);
 rw_attribute(journal_reclaim_delay_ms);
 
+rw_attribute(writeback_pages_max);
+
 rw_attribute(discard);
 rw_attribute(cache_replacement_policy);
 
@@ -310,6 +312,8 @@ SHOW(bch2_fs)
     sysfs_print(journal_write_delay_ms,     c->journal.write_delay_ms);
     sysfs_print(journal_reclaim_delay_ms,   c->journal.reclaim_delay_ms);
 
+    sysfs_print(writeback_pages_max,        c->writeback_pages_max);
+
     sysfs_print(block_size,                 block_bytes(c));
     sysfs_print(btree_node_size,            btree_bytes(c));
     sysfs_hprint(btree_cache_size,          bch2_btree_cache_size(c));
@@ -370,6 +374,9 @@ STORE(__bch2_fs)
     sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms);
     sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms);
 
+    if (attr == &sysfs_writeback_pages_max)
+        c->writeback_pages_max = strtoul_restrict_or_return(buf, 1, UINT_MAX);
+
     if (attr == &sysfs_btree_gc_periodic) {
         ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic)
             ?: (ssize_t) size;
@@ -459,6 +466,8 @@ struct attribute *bch2_fs_files[] = {
     &sysfs_journal_write_delay_ms,
     &sysfs_journal_reclaim_delay_ms,
 
+    &sysfs_writeback_pages_max,
+
     &sysfs_tiering_percent,
 
     &sysfs_compression_stats,
diff --git a/libbcachefs/tier.c b/libbcachefs/tier.c
index c4625c8..775c2e2 100644
--- a/libbcachefs/tier.c
+++ b/libbcachefs/tier.c
@@ -14,10 +14,9 @@
 #include
 #include
 
-static bool tiering_pred(void *arg, struct bkey_s_c_extent e)
+static bool __tiering_pred(struct bch_fs *c, struct bch_tier *tier,
+               struct bkey_s_c_extent e)
 {
-    struct bch_tier *tier = arg;
-    struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]);
     const struct bch_extent_ptr *ptr;
     unsigned replicas = 0;
 
@@ -33,6 +32,21 @@ static bool tiering_pred(void *arg, struct bkey_s_c_extent e)
     return replicas < c->opts.data_replicas;
 }
 
+static enum data_cmd tiering_pred(struct bch_fs *c, void *arg,
+                  enum bkey_type type,
+                  struct bkey_s_c_extent e,
+                  struct bch_io_opts *io_opts,
+                  struct data_opts *data_opts)
+{
+    struct bch_tier *tier = arg;
+
+    if (!__tiering_pred(c, tier, e))
+        return DATA_SKIP;
+
+    data_opts->btree_insert_flags = 0;
+    return DATA_ADD_REPLICAS;
+}
+
 static int bch2_tiering_thread(void *arg)
 {
     struct bch_tier *tier = arg;
@@ -90,8 +104,6 @@ static int bch2_tiering_thread(void *arg)
                 SECTORS_IN_FLIGHT_PER_DEVICE * nr_devices,
                 &tier->devs,
                 writepoint_ptr(&tier->wp),
-                0,
-                -1,
                 POS_MIN, POS_MAX,
                 tiering_pred, tier,
                 &move_stats);
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index 6e97e83..d475f98 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -817,4 +817,19 @@ do {                                                                   \
 #define array_remove_item(_array, _nr, _pos)                            \
     array_remove_items(_array, _nr, _pos, 1)
 
+#define bubble_sort(_base, _nr, _cmp)                                   \
+do {                                                                    \
+    ssize_t _i, _end;                                                   \
+    bool _swapped = true;                                               \
+                                                                        \
+    for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {    \
+        _swapped = false;                                               \
+        for (_i = 0; _i < _end; _i++)                                   \
+            if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) {               \
+                swap((_base)[_i], (_base)[_i + 1]);                     \
+                _swapped = true;                                        \
+            }                                                           \
+    }                                                                   \
+} while (0)
+
 #endif /* _BCACHEFS_UTIL_H */
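
util.h finishes the series with a bubble_sort() macro: unlike sort(), the comparator is a macro or expression rather than a function pointer, so it can close over local variables, and element exchange goes through the type-generic swap(). It is intended for the small arrays it gets used on, where the O(n^2) worst case is irrelevant. A usage sketch (sort_u64s and u64_cmp are illustrative, not part of the patch):

#define u64_cmp(_l, _r) (((_l) > (_r)) - ((_l) < (_r)))

/* sorts v[0..nr) in place, ascending */
static void sort_u64s(u64 *v, size_t nr)
{
    bubble_sort(v, nr, u64_cmp);
}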