diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index 3f55c24..3903b73 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -1,9 +1,18 @@
+// SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
 #include "checksum.h"
+#include "disk_groups.h"
+#include "ec.h"
 #include "error.h"
 #include "io.h"
 #include "journal.h"
+#include "journal_io.h"
+#include "journal_seq_blacklist.h"
+#include "replicas.h"
+#include "quota.h"
 #include "super-io.h"
 #include "super.h"
 #include "vstructs.h"
 
@@ -11,15 +20,15 @@
 #include <linux/backing-dev.h>
 #include <linux/sort.h>
 
-static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
-static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
-					    struct bch_replicas_cpu *);
-static const char *bch2_sb_validate_replicas(struct bch_sb *);
+const char * const bch2_sb_fields[] = {
+#define x(name, nr)	#name,
+	BCH_SB_FIELDS()
+#undef x
+	NULL
+};
 
-static inline void __bch2_sb_layout_size_assert(void)
-{
-	BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
-}
+static const char *bch2_sb_field_validate(struct bch_sb *,
+					  struct bch_sb_field *);
 
 struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb,
 				       enum bch_sb_field_type type)
@@ -34,117 +43,119 @@ struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb,
 	return NULL;
 }
 
-void bch2_free_super(struct bch_sb_handle *sb)
+static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb,
+						   struct bch_sb_field *f,
+						   unsigned u64s)
 {
-	if (sb->bio)
-		bio_put(sb->bio);
-	if (!IS_ERR_OR_NULL(sb->bdev))
-		blkdev_put(sb->bdev, sb->mode);
-
-	free_pages((unsigned long) sb->sb, sb->page_order);
-	memset(sb, 0, sizeof(*sb));
-}
-
-static int __bch2_super_realloc(struct bch_sb_handle *sb, unsigned order)
-{
-	struct bch_sb *new_sb;
-	struct bio *bio;
-
-	if (sb->page_order >= order && sb->sb)
-		return 0;
+	unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+	unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s;
 
-	if (dynamic_fault("bcachefs:add:super_realloc"))
-		return -ENOMEM;
+	BUG_ON(__vstruct_bytes(struct bch_sb, sb_u64s) > sb->buffer_size);
 
-	bio = bio_kmalloc(GFP_KERNEL, 1 << order);
-	if (!bio)
-		return -ENOMEM;
+	if (!f && !u64s) {
+		/* nothing to do: */
+	} else if (!f) {
+		f = vstruct_last(sb->sb);
+		memset(f, 0, sizeof(u64) * u64s);
+		f->u64s = cpu_to_le32(u64s);
+		f->type = 0;
+	} else {
+		void *src, *dst;
 
-	if (sb->bio)
-		bio_put(sb->bio);
-	sb->bio = bio;
+		src = vstruct_end(f);
 
-	new_sb = (void *) __get_free_pages(GFP_KERNEL, order);
-	if (!new_sb)
-		return -ENOMEM;
+		if (u64s) {
+			f->u64s = cpu_to_le32(u64s);
+			dst = vstruct_end(f);
+		} else {
+			dst = f;
+		}
 
-	if (sb->sb)
-		memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order);
+		memmove(dst, src, vstruct_end(sb->sb) - src);
 
-	free_pages((unsigned long) sb->sb, sb->page_order);
-	sb->sb = new_sb;
+		if (dst > src)
+			memset(src, 0, dst - src);
+	}
 
-	sb->page_order = order;
+	sb->sb->u64s = cpu_to_le32(sb_u64s);
 
-	return 0;
+	return u64s ? f : NULL;
 }
 
-static int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
+void bch2_sb_field_delete(struct bch_sb_handle *sb,
+			  enum bch_sb_field_type type)
 {
-	u64 new_bytes = __vstruct_bytes(struct bch_sb, u64s);
-	u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
+	struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type);
 
-	if (new_bytes > max_bytes) {
-		char buf[BDEVNAME_SIZE];
+	if (f)
+		__bch2_sb_field_resize(sb, f, 0);
+}
 
-		pr_err("%s: superblock too big: want %llu but have %llu",
-		       bdevname(sb->bdev, buf), new_bytes, max_bytes);
-		return -ENOSPC;
-	}
+/* Superblock realloc/free: */
 
-	return __bch2_super_realloc(sb, get_order(new_bytes));
+void bch2_free_super(struct bch_sb_handle *sb)
+{
+	if (sb->bio)
+		bio_put(sb->bio);
+	if (!IS_ERR_OR_NULL(sb->bdev))
+		blkdev_put(sb->bdev, sb->mode);
+
+	kfree(sb->sb);
+	memset(sb, 0, sizeof(*sb));
 }
 
-static int bch2_fs_sb_realloc(struct bch_fs *c, unsigned u64s)
+int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
 {
-	u64 bytes = __vstruct_bytes(struct bch_sb, u64s);
-	struct bch_sb *sb;
-	unsigned order = get_order(bytes);
+	size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s);
+	size_t new_buffer_size;
+	struct bch_sb *new_sb;
+	struct bio *bio;
 
-	if (c->disk_sb && order <= c->disk_sb_order)
-		return 0;
+	if (sb->bdev)
+		new_bytes = max_t(size_t, new_bytes, bdev_logical_block_size(sb->bdev));
 
-	sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
-	if (!sb)
-		return -ENOMEM;
+	new_buffer_size = roundup_pow_of_two(new_bytes);
 
-	if (c->disk_sb)
-		memcpy(sb, c->disk_sb, PAGE_SIZE << c->disk_sb_order);
+	if (sb->sb && sb->buffer_size >= new_buffer_size)
+		return 0;
 
-	free_pages((unsigned long) c->disk_sb, c->disk_sb_order);
+	if (sb->have_layout) {
+		u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
 
-	c->disk_sb = sb;
-	c->disk_sb_order = order;
-	return 0;
-}
+		if (new_bytes > max_bytes) {
+			char buf[BDEVNAME_SIZE];
 
-static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb *sb,
-						   struct bch_sb_field *f,
-						   unsigned u64s)
-{
-	unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0;
+			pr_err("%s: superblock too big: want %zu but have %llu",
+			       bdevname(sb->bdev, buf), new_bytes, max_bytes);
+			return -ENOSPC;
+		}
+	}
 
-	if (!f) {
-		f = vstruct_last(sb);
-		memset(f, 0, sizeof(u64) * u64s);
-		f->u64s = cpu_to_le32(u64s);
-		f->type = 0;
-	} else {
-		void *src, *dst;
+	if (sb->buffer_size >= new_buffer_size && sb->sb)
+		return 0;
 
-		src = vstruct_end(f);
-		f->u64s = cpu_to_le32(u64s);
-		dst = vstruct_end(f);
+	if (dynamic_fault("bcachefs:add:super_realloc"))
+		return -ENOMEM;
 
-		memmove(dst, src, vstruct_end(sb) - src);
+	if (sb->have_bio) {
+		bio = bio_kmalloc(GFP_KERNEL,
+				  DIV_ROUND_UP(new_buffer_size, PAGE_SIZE));
+		if (!bio)
+			return -ENOMEM;
 
-		if (dst > src)
-			memset(src, 0, dst - src);
+		if (sb->bio)
+			bio_put(sb->bio);
+		sb->bio = bio;
 	}
 
-	le32_add_cpu(&sb->u64s, u64s - old_u64s);
+	new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO);
+	if (!new_sb)
+		return -ENOMEM;
+
+	sb->sb = new_sb;
+	sb->buffer_size = new_buffer_size;
 
-	return f;
+	return 0;
 }
 
 struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
@@ -158,42 +169,39 @@ struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb,
 	if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d))
 		return NULL;
 
-	f = __bch2_sb_field_resize(sb->sb, f, u64s);
-	f->type = cpu_to_le32(type);
-	return f;
-}
+	if (sb->fs_sb) {
+		struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb);
+		struct bch_dev *ca;
+		unsigned i;
 
-struct bch_sb_field *bch2_fs_sb_field_resize(struct bch_fs *c,
-					     enum bch_sb_field_type type,
-					     unsigned u64s)
-{
-	struct bch_sb_field *f = bch2_sb_field_get(c->disk_sb, type);
-	ssize_t old_u64s = f ? le32_to_cpu(f->u64s) : 0;
-	ssize_t d = -old_u64s + u64s;
-	struct bch_dev *ca;
-	unsigned i;
+		lockdep_assert_held(&c->sb_lock);
 
-	lockdep_assert_held(&c->sb_lock);
+		/* XXX: we're not checking that offline device have enough space */
 
-	if (bch2_fs_sb_realloc(c, le32_to_cpu(c->disk_sb->u64s) + d))
-		return NULL;
-
-	/* XXX: we're not checking that offline device have enough space */
+		for_each_online_member(ca, c, i) {
+			struct bch_sb_handle *sb = &ca->disk_sb;
 
-	for_each_online_member(ca, c, i) {
-		struct bch_sb_handle *sb = &ca->disk_sb;
-
-		if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
-			percpu_ref_put(&ca->ref);
-			return NULL;
+			if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) {
+				percpu_ref_put(&ca->ref);
+				return NULL;
+			}
 		}
 	}
 
-	f = __bch2_sb_field_resize(c->disk_sb, f, u64s);
-	f->type = cpu_to_le32(type);
+	f = bch2_sb_field_get(sb->sb, type);
+	f = __bch2_sb_field_resize(sb, f, u64s);
+	if (f)
+		f->type = cpu_to_le32(type);
 	return f;
 }
 
+/* Superblock validate: */
+
+static inline void __bch2_sb_layout_size_assert(void)
+{
+	BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512);
+}
+
 static const char *validate_sb_layout(struct bch_sb_layout *layout)
 {
 	u64 offset, prev_offset, max_sectors;
@@ -226,104 +234,30 @@ static const char *validate_sb_layout(struct bch_sb_layout *layout)
 	return NULL;
 }
 
-static int u64_cmp(const void *_l, const void *_r)
-{
-	u64 l = *((const u64 *) _l), r = *((const u64 *) _r);
-
-	return l < r ? -1 : l > r ? 1 : 0;
-}
-
-const char *bch2_sb_validate_journal(struct bch_sb *sb,
-				     struct bch_member_cpu mi)
-{
-	struct bch_sb_field_journal *journal;
-	const char *err;
-	unsigned nr;
-	unsigned i;
-	u64 *b;
-
-	journal = bch2_sb_get_journal(sb);
-	if (!journal)
-		return NULL;
-
-	nr = bch2_nr_journal_buckets(journal);
-	if (!nr)
-		return NULL;
-
-	b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL);
-	if (!b)
-		return "cannot allocate memory";
-
-	for (i = 0; i < nr; i++)
-		b[i] = le64_to_cpu(journal->buckets[i]);
-
-	sort(b, nr, sizeof(u64), u64_cmp, NULL);
-
-	err = "journal bucket at sector 0";
-	if (!b[0])
-		goto err;
-
-	err = "journal bucket before first bucket";
-	if (b[0] < mi.first_bucket)
-		goto err;
-
-	err = "journal bucket past end of device";
-	if (b[nr - 1] >= mi.nbuckets)
-		goto err;
-
-	err = "duplicate journal buckets";
-	for (i = 0; i + 1 < nr; i++)
-		if (b[i] == b[i + 1])
-			goto err;
-
-	err = NULL;
-err:
-	kfree(b);
-	return err;
-}
-
-static const char *bch2_sb_validate_members(struct bch_sb *sb)
-{
-	struct bch_sb_field_members *mi;
-	unsigned i;
-
-	mi = bch2_sb_get_members(sb);
-	if (!mi)
-		return "Invalid superblock: member info area missing";
-
-	if ((void *) (mi->members + sb->nr_devices) >
-	    vstruct_end(&mi->field))
-		return "Invalid superblock: bad member info";
-
-	for (i = 0; i < sb->nr_devices; i++) {
-		if (!bch2_dev_exists(sb, mi, i))
-			continue;
-
-		if (le16_to_cpu(mi->members[i].bucket_size) <
-		    BCH_SB_BTREE_NODE_SIZE(sb))
-			return "bucket size smaller than btree node size";
-	}
-
-	return NULL;
-}
-
 const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
 {
 	struct bch_sb *sb = disk_sb->sb;
 	struct bch_sb_field *f;
-	struct bch_sb_field_members *sb_mi;
-	struct bch_member_cpu mi;
+	struct bch_sb_field_members *mi;
 	const char *err;
+	u32 version, version_min;
 	u16 block_size;
 
-	if (le64_to_cpu(sb->version) < BCH_SB_VERSION_MIN ||
-	    le64_to_cpu(sb->version) > BCH_SB_VERSION_MAX)
-		return"Unsupported superblock version";
+	version = le16_to_cpu(sb->version);
+	version_min = version >= bcachefs_metadata_version_new_versioning
+		? le16_to_cpu(sb->version_min)
+		: version;
 
-	if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX) {
-		SET_BCH_SB_ENCODED_EXTENT_MAX_BITS(sb, 7);
-		SET_BCH_SB_POSIX_ACL(sb, 1);
-	}
+	if (version >= bcachefs_metadata_version_max ||
+	    version_min < bcachefs_metadata_version_min)
+		return "Unsupported superblock version";
+
+	if (version_min > version)
+		return "Bad minimum version";
+
+	if (sb->features[1] ||
+	    (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR)))
+		return "Filesystem has incompatible features";
 
 	block_size = le16_to_cpu(sb->block_size);
 
@@ -340,22 +274,22 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
 	if (!sb->nr_devices ||
 	    sb->nr_devices <= sb->dev_idx ||
 	    sb->nr_devices > BCH_SB_MEMBERS_MAX)
-		return "Bad cache device number in set";
+		return "Bad number of member devices";
 
 	if (!BCH_SB_META_REPLICAS_WANT(sb) ||
-	    BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
+	    BCH_SB_META_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX)
 		return "Invalid number of metadata replicas";
 
 	if (!BCH_SB_META_REPLICAS_REQ(sb) ||
-	    BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
+	    BCH_SB_META_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX)
 		return "Invalid number of metadata replicas";
 
 	if (!BCH_SB_DATA_REPLICAS_WANT(sb) ||
-	    BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
+	    BCH_SB_DATA_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX)
 		return "Invalid number of data replicas";
 
 	if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
-	    BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
+	    BCH_SB_DATA_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX)
 		return "Invalid number of data replicas";
 
 	if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
@@ -373,9 +307,6 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
 	if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb)))
 		return "Btree node size not a power of two";
 
-	if (BCH_SB_BTREE_NODE_SIZE(sb) > BTREE_NODE_SIZE_MAX)
-		return "Btree node size too large";
-
 	if (BCH_SB_GC_RESERVE(sb) < 5)
 		return "gc reserve percentage too small";
 
@@ -394,84 +325,34 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
 
 		if (vstruct_next(f) > vstruct_last(sb))
 			return "Invalid superblock: invalid optional field";
-
-		if (le32_to_cpu(f->type) >= BCH_SB_FIELD_NR)
-			return "Invalid superblock: unknown optional field type";
 	}
 
-	err = bch2_sb_validate_members(sb);
-	if (err)
-		return err;
-
-	sb_mi = bch2_sb_get_members(sb);
-	mi = bch2_mi_to_cpu(sb_mi->members + sb->dev_idx);
-
-	if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_MAX) {
-		struct bch_member *m;
-
-		for (m = sb_mi->members;
-		     m < sb_mi->members + sb->nr_devices;
-		     m++)
-			SET_BCH_MEMBER_DATA_ALLOWED(m, ~0);
-	}
-
-	if (mi.nbuckets > LONG_MAX)
-		return "Too many buckets";
-
-	if (mi.nbuckets - mi.first_bucket < 1 << 10)
-		return "Not enough buckets";
-
-	if (mi.bucket_size < block_size)
-		return "Bad bucket size";
-
-	if (get_capacity(disk_sb->bdev->bd_disk) <
-	    mi.bucket_size * mi.nbuckets)
-		return "Invalid superblock: device too small";
-
-	err = bch2_sb_validate_journal(sb, mi);
-	if (err)
-		return err;
+	/* members must be validated first: */
+	mi = bch2_sb_get_members(sb);
+	if (!mi)
+		return "Invalid superblock: member info area missing";
 
-	err = bch2_sb_validate_replicas(sb);
+	err = bch2_sb_field_validate(sb, &mi->field);
 	if (err)
 		return err;
 
-	if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_NONCE_V1 &&
-	    bch2_sb_get_crypt(sb) &&
-	    BCH_SB_INITIALIZED(sb))
-		return "Incompatible extent nonces";
+	vstruct_for_each(sb, f) {
+		if (le32_to_cpu(f->type) == BCH_SB_FIELD_members)
+			continue;
 
-	sb->version = cpu_to_le64(BCH_SB_VERSION_MAX);
+		err = bch2_sb_field_validate(sb, f);
+		if (err)
+			return err;
+	}
 
 	return NULL;
 }
 
 /* device open: */
 
-static const char *bch2_blkdev_open(const char *path, fmode_t mode,
-				    void *holder, struct block_device **ret)
-{
-	struct block_device *bdev;
-
-	*ret = NULL;
-	bdev = blkdev_get_by_path(path, mode, holder);
-	if (bdev == ERR_PTR(-EBUSY))
-		return "device busy";
-
-	if (IS_ERR(bdev))
-		return "failed to open device";
-
-	if (mode & FMODE_WRITE)
-		bdev_get_queue(bdev)->backing_dev_info->capabilities
-			|= BDI_CAP_STABLE_WRITES;
-
-	*ret = bdev;
-	return NULL;
-}
-
 static void bch2_sb_update(struct bch_fs *c)
 {
-	struct bch_sb *src = c->disk_sb;
+	struct bch_sb *src = c->disk_sb.sb;
 	struct bch_sb_field_members *mi = bch2_sb_get_members(src);
 	struct bch_dev *ca;
 	unsigned i;
@@ -480,24 +361,36 @@ static void bch2_sb_update(struct bch_fs *c)
 	c->sb.uuid		= src->uuid;
 	c->sb.user_uuid		= src->user_uuid;
+	c->sb.version		= le16_to_cpu(src->version);
+	c->sb.version_min	= le16_to_cpu(src->version_min);
 	c->sb.nr_devices	= src->nr_devices;
 	c->sb.clean		= BCH_SB_CLEAN(src);
 	c->sb.encryption_type	= BCH_SB_ENCRYPTION_TYPE(src);
 	c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src);
-	c->sb.time_base_lo	= le64_to_cpu(src->time_base_lo);
+
+	c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision);
+	c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit;
+
+	/* XXX this is wrong, we need a 96 or 128 bit integer type */
+	c->sb.time_base_lo	= div_u64(le64_to_cpu(src->time_base_lo),
+					  c->sb.nsec_per_time_unit);
 	c->sb.time_base_hi	= le32_to_cpu(src->time_base_hi);
-	c->sb.time_precision	= le32_to_cpu(src->time_precision);
+
+	c->sb.features		= le64_to_cpu(src->features[0]);
+	c->sb.compat		= le64_to_cpu(src->compat[0]);
 
 	for_each_member_device(ca, c, i)
 		ca->mi = bch2_mi_to_cpu(mi->members + i);
 }
 
-/* doesn't copy member info */
-static void __copy_super(struct bch_sb *dst, struct bch_sb *src)
+static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
 {
 	struct bch_sb_field *src_f, *dst_f;
+	struct bch_sb *dst = dst_handle->sb;
+	unsigned i;
 
 	dst->version		= src->version;
+	dst->version_min	= src->version_min;
 	dst->seq		= src->seq;
 	dst->uuid		= src->uuid;
 	dst->user_uuid		= src->user_uuid;
@@ -514,15 +407,17 @@
 	memcpy(dst->features,	src->features,	sizeof(dst->features));
 	memcpy(dst->compat,	src->compat,	sizeof(dst->compat));
 
-	vstruct_for_each(src, src_f) {
-		if (src_f->type == BCH_SB_FIELD_journal)
+	for (i = 0; i < BCH_SB_FIELD_NR; i++) {
+		if (i == BCH_SB_FIELD_journal)
 			continue;
 
-		dst_f = bch2_sb_field_get(dst, le32_to_cpu(src_f->type));
-		dst_f = __bch2_sb_field_resize(dst, dst_f,
-				le32_to_cpu(src_f->u64s));
+		src_f = bch2_sb_field_get(src, i);
+		dst_f = bch2_sb_field_get(dst, i);
+		dst_f = __bch2_sb_field_resize(dst_handle, dst_f,
+				src_f ? le32_to_cpu(src_f->u64s) : 0);
 
-		memcpy(dst_f, src_f, vstruct_bytes(src_f));
+		if (src_f)
+			memcpy(dst_f, src_f, vstruct_bytes(src_f));
 	}
 }
 
@@ -537,22 +432,33 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
 
 	lockdep_assert_held(&c->sb_lock);
 
-	if (bch2_fs_sb_realloc(c, le32_to_cpu(src->u64s) - journal_u64s))
-		return -ENOMEM;
+	ret = bch2_sb_realloc(&c->disk_sb,
+			      le32_to_cpu(src->u64s) - journal_u64s);
+	if (ret)
+		return ret;
+
+	__copy_super(&c->disk_sb, src);
 
-	__copy_super(c->disk_sb, src);
+	if (BCH_SB_HAS_ERRORS(c->disk_sb.sb))
+		set_bit(BCH_FS_ERROR, &c->flags);
+	if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb))
+		set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags);
 
 	ret = bch2_sb_replicas_to_cpu_replicas(c);
 	if (ret)
 		return ret;
 
+	ret = bch2_sb_disk_groups_to_cpu(c);
+	if (ret)
+		return ret;
+
 	bch2_sb_update(c);
 	return 0;
 }
 
 int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
 {
-	struct bch_sb *src = c->disk_sb, *dst = ca->disk_sb.sb;
+	struct bch_sb *src = c->disk_sb.sb, *dst = ca->disk_sb.sb;
 	struct bch_sb_field_journal *journal_buckets =
 		bch2_sb_get_journal(dst);
 	unsigned journal_u64s = journal_buckets
@@ -565,8 +471,7 @@ int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
 	if (ret)
 		return ret;
 
-	__copy_super(dst, src);
-
+	__copy_super(&ca->disk_sb, src);
 	return 0;
 }
 
@@ -576,14 +481,12 @@
 static const char *read_one_super(struct bch_sb_handle *sb, u64 offset)
 {
 	struct bch_csum csum;
 	size_t bytes;
-	unsigned order;
 reread:
 	bio_reset(sb->bio);
-	sb->bio->bi_bdev = sb->bdev;
+	bio_set_dev(sb->bio, sb->bdev);
 	sb->bio->bi_iter.bi_sector = offset;
-	sb->bio->bi_iter.bi_size = PAGE_SIZE << sb->page_order;
 	bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
-	bch2_bio_map(sb->bio, sb->sb);
+	bch2_bio_map(sb->bio, sb->sb, sb->buffer_size);
 
 	if (submit_bio_wait(sb->bio))
 		return "IO error";
@@ -591,18 +494,17 @@ reread:
 	if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC))
 		return "Not a bcachefs superblock";
 
-	if (le64_to_cpu(sb->sb->version) < BCH_SB_VERSION_MIN ||
-	    le64_to_cpu(sb->sb->version) > BCH_SB_VERSION_MAX)
-		return"Unsupported superblock version";
+	if (le16_to_cpu(sb->sb->version) <  bcachefs_metadata_version_min ||
+	    le16_to_cpu(sb->sb->version) >= bcachefs_metadata_version_max)
		return "Unsupported superblock version";
 
 	bytes = vstruct_bytes(sb->sb);
 
 	if (bytes > 512 << sb->sb->layout.sb_max_size_bits)
 		return "Bad superblock: too big";
 
-	order = get_order(bytes);
-	if (order > sb->page_order) {
-		if (__bch2_super_realloc(sb, order))
+	if (bytes > sb->buffer_size) {
+		if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s)))
 			return "cannot allocate memory";
 		goto reread;
 	}
@@ -617,47 +519,65 @@ reread:
 	if (bch2_crc_cmp(csum, sb->sb->csum))
 		return "bad checksum reading superblock";
 
+	sb->seq = le64_to_cpu(sb->sb->seq);
+
 	return NULL;
 }
 
-const char *bch2_read_super(const char *path,
-			    struct bch_opts opts,
-			    struct bch_sb_handle *ret)
+int bch2_read_super(const char *path, struct bch_opts *opts,
+		    struct bch_sb_handle *sb)
 {
-	u64 offset = opt_get(opts, sb);
+	u64 offset = opt_get(*opts, sb);
 	struct bch_sb_layout layout;
 	const char *err;
-	unsigned i;
+	__le64 *i;
+	int ret;
 
-	memset(ret, 0, sizeof(*ret));
-	ret->mode = FMODE_READ;
+	pr_verbose_init(*opts, "");
 
-	if (!opt_get(opts, noexcl))
-		ret->mode |= FMODE_EXCL;
+	memset(sb, 0, sizeof(*sb));
+	sb->mode	= FMODE_READ;
+	sb->have_bio	= true;
 
-	if (!opt_get(opts, nochanges))
-		ret->mode |= FMODE_WRITE;
+	if (!opt_get(*opts, noexcl))
+		sb->mode |= FMODE_EXCL;
 
-	err = bch2_blkdev_open(path, ret->mode, ret, &ret->bdev);
-	if (err)
-		return err;
+	if (!opt_get(*opts, nochanges))
+		sb->mode |= FMODE_WRITE;
+
+	sb->bdev = blkdev_get_by_path(path, sb->mode, sb);
+	if (IS_ERR(sb->bdev) &&
+	    PTR_ERR(sb->bdev) == -EACCES &&
+	    opt_get(*opts, read_only)) {
+		sb->mode &= ~FMODE_WRITE;
+
+		sb->bdev = blkdev_get_by_path(path, sb->mode, sb);
+		if (!IS_ERR(sb->bdev))
+			opt_set(*opts, nochanges, true);
+	}
+
+	if (IS_ERR(sb->bdev)) {
+		ret = PTR_ERR(sb->bdev);
+		goto out;
+	}
 
 	err = "cannot allocate memory";
-	if (__bch2_super_realloc(ret, 0))
+	ret = bch2_sb_realloc(sb, 0);
+	if (ret)
 		goto err;
 
+	ret = -EFAULT;
 	err = "dynamic fault";
 	if (bch2_fs_init_fault("read_super"))
 		goto err;
 
-	err = read_one_super(ret, offset);
+	ret = -EINVAL;
+	err = read_one_super(sb, offset);
 	if (!err)
 		goto got_super;
 
-	if (offset != BCH_SB_SECTOR) {
-		pr_err("error reading superblock: %s", err);
+	if (opt_defined(*opts, sb))
 		goto err;
-	}
 
 	pr_err("error reading default superblock: %s", err);
 
@@ -665,53 +585,56 @@ const char *bch2_read_super(const char *path,
 	 * Error reading primary superblock - read location of backup
 	 * superblocks:
 	 */
-	bio_reset(ret->bio);
-	ret->bio->bi_bdev = ret->bdev;
-	ret->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
-	ret->bio->bi_iter.bi_size = sizeof(struct bch_sb_layout);
-	bio_set_op_attrs(ret->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
+	bio_reset(sb->bio);
+	bio_set_dev(sb->bio, sb->bdev);
+	sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR;
+	bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META);
 	/*
 	 * use sb buffer to read layout, since sb buffer is page aligned but
 	 * layout won't be:
 	 */
-	bch2_bio_map(ret->bio, ret->sb);
+	bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout));
 
 	err = "IO error";
-	if (submit_bio_wait(ret->bio))
+	if (submit_bio_wait(sb->bio))
 		goto err;
 
-	memcpy(&layout, ret->sb, sizeof(layout));
+	memcpy(&layout, sb->sb, sizeof(layout));
 	err = validate_sb_layout(&layout);
 	if (err)
 		goto err;
 
-	for (i = 0; i < layout.nr_superblocks; i++) {
-		u64 offset = le64_to_cpu(layout.sb_offset[i]);
+	for (i = layout.sb_offset;
+	     i < layout.sb_offset + layout.nr_superblocks; i++) {
+		offset = le64_to_cpu(*i);
 
-		if (offset == BCH_SB_SECTOR)
+		if (offset == opt_get(*opts, sb))
 			continue;
 
-		err = read_one_super(ret, offset);
+		err = read_one_super(sb, offset);
 		if (!err)
 			goto got_super;
 	}
+
+	ret = -EINVAL;
 	goto err;
-got_super:
-	pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
-		 le64_to_cpu(ret->sb->version),
-		 le64_to_cpu(ret->sb->flags[0]),
-		 le64_to_cpu(ret->sb->seq),
-		 le32_to_cpu(ret->sb->u64s));
 
+got_super:
 	err = "Superblock block size smaller than device block size";
-	if (le16_to_cpu(ret->sb->block_size) << 9 <
-	    bdev_logical_block_size(ret->bdev))
+	ret = -EINVAL;
+	if (le16_to_cpu(sb->sb->block_size) << 9 <
+	    bdev_logical_block_size(sb->bdev))
 		goto err;
 
-	return NULL;
+	ret = 0;
+	sb->have_layout = true;
+out:
+	pr_verbose_init(*opts, "ret %i", ret);
+	return ret;
 err:
-	bch2_free_super(ret);
-	return err;
+	bch2_free_super(sb);
+	pr_err("error reading superblock: %s", err);
+	goto out;
 }
 
 /* write superblock: */
@@ -722,13 +645,34 @@ static void write_super_endio(struct bio *bio)
 
 	/* XXX: return errors directly */
 
-	if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write"))
+	if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write error: %s",
+			       bch2_blk_status_to_str(bio->bi_status)))
 		ca->sb_write_error = 1;
 
 	closure_put(&ca->fs->sb_write);
 	percpu_ref_put(&ca->io_ref);
 }
 
+static void read_back_super(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct bch_sb *sb = ca->disk_sb.sb;
+	struct bio *bio = ca->disk_sb.bio;
+
+	bio_reset(bio);
+	bio_set_dev(bio, ca->disk_sb.bdev);
+	bio->bi_iter.bi_sector	= le64_to_cpu(sb->layout.sb_offset[0]);
+	bio->bi_end_io		= write_super_endio;
+	bio->bi_private		= ca;
+	bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC|REQ_META);
+	bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE);
+
+	this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb],
+		     bio_sectors(bio));
+
+	percpu_ref_get(&ca->io_ref);
+	closure_bio_submit(bio, &c->sb_write);
+}
+
 static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
 {
 	struct bch_sb *sb = ca->disk_sb.sb;
@@ -736,29 +680,28 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
 
 	sb->offset = sb->layout.sb_offset[idx];
 
-	SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
+	SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false));
 	sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb),
 				null_nonce(), sb);
 
 	bio_reset(bio);
-	bio->bi_bdev		= ca->disk_sb.bdev;
+	bio_set_dev(bio, ca->disk_sb.bdev);
 	bio->bi_iter.bi_sector	= le64_to_cpu(sb->offset);
-	bio->bi_iter.bi_size	=
-		roundup(vstruct_bytes(sb),
-			bdev_logical_block_size(ca->disk_sb.bdev));
 	bio->bi_end_io		= write_super_endio;
 	bio->bi_private		= ca;
 	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
-	bch2_bio_map(bio, sb);
+	bch2_bio_map(bio, sb,
+		     roundup((size_t) vstruct_bytes(sb),
+			     bdev_logical_block_size(ca->disk_sb.bdev)));
 
-	this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_SB],
+	this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb],
 		     bio_sectors(bio));
 
 	percpu_ref_get(&ca->io_ref);
 	closure_bio_submit(bio, &c->sb_write);
 }
 
-void bch2_write_super(struct bch_fs *c)
+int bch2_write_super(struct bch_fs *c)
 {
 	struct closure *cl = &c->sb_write;
 	struct bch_dev *ca;
@@ -766,13 +709,25 @@ void bch2_write_super(struct bch_fs *c)
 	const char *err;
 	struct bch_devs_mask sb_written;
 	bool wrote, can_mount_without_written, can_mount_with_written;
+	unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
+	int ret = 0;
+
+	if (c->opts.very_degraded)
+		degraded_flags |= BCH_FORCE_IF_LOST;
 
 	lockdep_assert_held(&c->sb_lock);
 
 	closure_init_stack(cl);
 	memset(&sb_written, 0, sizeof(sb_written));
 
-	le64_add_cpu(&c->disk_sb->seq, 1);
+	le64_add_cpu(&c->disk_sb.sb->seq, 1);
+
+	if (test_bit(BCH_FS_ERROR, &c->flags))
+		SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1);
+	if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags))
+		SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1);
+
+	SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
 
 	for_each_online_member(ca, c, i)
 		bch2_sb_from_fs(c, ca);
@@ -781,12 +736,12 @@ void bch2_write_super(struct bch_fs *c)
 		err = bch2_sb_validate(&ca->disk_sb);
 		if (err) {
 			bch2_fs_inconsistent(c, "sb invalid before write: %s", err);
+			ret = -1;
 			goto out;
 		}
 	}
 
-	if (c->opts.nochanges ||
-	    test_bit(BCH_FS_ERROR, &c->flags))
+	if (c->opts.nochanges)
 		goto out;
 
 	for_each_online_member(ca, c, i) {
@@ -794,10 +749,27 @@ void bch2_write_super(struct bch_fs *c)
 		ca->sb_write_error = 0;
 	}
 
+	for_each_online_member(ca, c, i)
+		read_back_super(c, ca);
+	closure_sync(cl);
+
+	for_each_online_member(ca, c, i) {
+		if (!ca->sb_write_error &&
+		    ca->disk_sb.seq !=
+		    le64_to_cpu(ca->sb_read_scratch->seq)) {
+			bch2_fs_fatal_error(c,
+				"Superblock modified by another process");
+			percpu_ref_put(&ca->io_ref);
+			ret = -EROFS;
+			goto out;
+		}
+	}
+
 	do {
 		wrote = false;
 		for_each_online_member(ca, c, i)
-			if (sb < ca->disk_sb.sb->layout.nr_superblocks) {
+			if (!ca->sb_write_error &&
+			    sb < ca->disk_sb.sb->layout.nr_superblocks) {
 				write_one_super(c, ca, sb);
 				wrote = true;
 			}
@@ -805,24 +777,23 @@ void bch2_write_super(struct bch_fs *c)
 		sb++;
 	} while (wrote);
 
-	for_each_online_member(ca, c, i)
+	for_each_online_member(ca, c, i) {
 		if (ca->sb_write_error)
 			__clear_bit(ca->dev_idx, sb_written.d);
+		else
+			ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq);
+	}
 
 	nr_wrote = dev_mask_nr(&sb_written);
 
 	can_mount_with_written =
-		bch2_have_enough_devs(c,
-			__bch2_replicas_status(c, sb_written),
-			BCH_FORCE_IF_DEGRADED);
+		bch2_have_enough_devs(c, sb_written, degraded_flags, false);
 
 	for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
 		sb_written.d[i] = ~sb_written.d[i];
 
 	can_mount_without_written =
-		bch2_have_enough_devs(c,
-			__bch2_replicas_status(c, sb_written),
-			BCH_FORCE_IF_DEGRADED);
+		bch2_have_enough_devs(c, sb_written, degraded_flags, false);
 
 	/*
 	 * If we would be able to mount _without_ the devices we successfully
@@ -832,605 +803,401 @@
 	 * written anything (new filesystem), we continue if we'd be able to
 	 * mount with the devices we did successfully write to:
 	 */
-	bch2_fs_fatal_err_on(!nr_wrote ||
-			     (can_mount_without_written &&
-			      !can_mount_with_written), c,
-		"Unable to write superblock to sufficient devices");
+	if (bch2_fs_fatal_err_on(!nr_wrote ||
+				 !can_mount_with_written ||
+				 (can_mount_without_written &&
+				  !can_mount_with_written), c,
+		"Unable to write superblock to sufficient devices"))
+		ret = -1;
 out:
 	/* Make new options visible after they're persistent: */
 	bch2_sb_update(c);
+	return ret;
 }
 
-/* Replicas tracking - in memory: */
-
-#define for_each_cpu_replicas_entry(_r, _i)				\
-	for (_i = (_r)->entries;					\
-	     (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
-	     _i = (void *) (_i) + (_r)->entry_size)
-
-static inline struct bch_replicas_cpu_entry *
-cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
-{
-	return (void *) r->entries + r->entry_size * i;
-}
-
-static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
-{
-	eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
-}
-
-static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e,
-				     unsigned dev)
+void __bch2_check_set_feature(struct bch_fs *c, unsigned feat)
 {
-	return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0;
-}
+	mutex_lock(&c->sb_lock);
+	if (!(c->sb.features & (1ULL << feat))) {
+		c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat);
 
-static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e,
-				    unsigned dev)
-{
-	e->devs[dev >> 3] |= 1 << (dev & 7);
+		bch2_write_super(c);
+	}
+	mutex_unlock(&c->sb_lock);
 }
 
-static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r)
-{
-	return (r->entry_size -
-		offsetof(struct bch_replicas_cpu_entry, devs)) * 8;
-}
+/* BCH_SB_FIELD_journal: */
 
-static unsigned bkey_to_replicas(struct bkey_s_c_extent e,
-				 enum bch_data_type data_type,
-				 struct bch_replicas_cpu_entry *r,
-				 unsigned *max_dev)
+static int u64_cmp(const void *_l, const void *_r)
 {
-	const struct bch_extent_ptr *ptr;
-	unsigned nr = 0;
-
-	BUG_ON(!data_type ||
-	       data_type == BCH_DATA_SB ||
-	       data_type >= BCH_DATA_NR);
-
-	memset(r, 0, sizeof(*r));
-	r->data_type = data_type;
-
-	*max_dev = 0;
+	u64 l = *((const u64 *) _l), r = *((const u64 *) _r);
 
-	extent_for_each_ptr(e, ptr)
-		if (!ptr->cached) {
-			*max_dev = max_t(unsigned, *max_dev, ptr->dev);
-			replicas_set_dev(r, ptr->dev);
-			nr++;
-		}
-	return nr;
+	return l < r ? -1 : l > r ? 1 : 0;
 }
 
-static struct bch_replicas_cpu *
-cpu_replicas_add_entry(struct bch_replicas_cpu *old,
-		       struct bch_replicas_cpu_entry new_entry,
-		       unsigned max_dev)
+static const char *bch2_sb_validate_journal(struct bch_sb *sb,
+					    struct bch_sb_field *f)
 {
-	struct bch_replicas_cpu *new;
-	unsigned i, nr, entry_size;
-
-	entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
-		DIV_ROUND_UP(max_dev + 1, 8);
-	entry_size = max(entry_size, old->entry_size);
-	nr = old->nr + 1;
+	struct bch_sb_field_journal *journal = field_to_type(f, journal);
+	struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
+	const char *err;
+	unsigned nr;
+	unsigned i;
+	u64 *b;
 
-	new = kzalloc(sizeof(struct bch_replicas_cpu) +
-		      nr * entry_size, GFP_NOIO);
-	if (!new)
+	journal = bch2_sb_get_journal(sb);
+	if (!journal)
 		return NULL;
 
-	new->nr		= nr;
-	new->entry_size	= entry_size;
-
-	for (i = 0; i < old->nr; i++)
-		memcpy(cpu_replicas_entry(new, i),
-		       cpu_replicas_entry(old, i),
-		       min(new->entry_size, old->entry_size));
-
-	memcpy(cpu_replicas_entry(new, old->nr),
-	       &new_entry,
-	       new->entry_size);
-
-	bch2_cpu_replicas_sort(new);
-	return new;
-}
-
-static bool replicas_has_entry(struct bch_replicas_cpu *r,
-			       struct bch_replicas_cpu_entry search,
-			       unsigned max_dev)
-{
-	return max_dev < replicas_dev_slots(r) &&
-		eytzinger0_find(r->entries, r->nr,
-				r->entry_size,
-				memcmp, &search) < r->nr;
-}
-
-noinline
-static int bch2_check_mark_super_slowpath(struct bch_fs *c,
-				struct bch_replicas_cpu_entry new_entry,
-				unsigned max_dev)
-{
-	struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r;
-	int ret = -ENOMEM;
+	nr = bch2_nr_journal_buckets(journal);
+	if (!nr)
+		return NULL;
 
-	mutex_lock(&c->sb_lock);
+	b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL);
+	if (!b)
+		return "cannot allocate memory";
 
-	old_gc = rcu_dereference_protected(c->replicas_gc,
-					   lockdep_is_held(&c->sb_lock));
-	if (old_gc && !replicas_has_entry(old_gc, new_entry, max_dev)) {
-		new_gc = cpu_replicas_add_entry(old_gc, new_entry, max_dev);
-		if (!new_gc)
-			goto err;
-	}
+	for (i = 0; i < nr; i++)
		b[i] = le64_to_cpu(journal->buckets[i]);
 
-	old_r = rcu_dereference_protected(c->replicas,
-					  lockdep_is_held(&c->sb_lock));
-	/* recheck, might have raced */
-	if (replicas_has_entry(old_r, new_entry, max_dev))
-		goto out;
+	sort(b, nr, sizeof(u64), u64_cmp, NULL);
 
-	new_r = cpu_replicas_add_entry(old_r, new_entry, max_dev);
-	if (!new_r)
+	err = "journal bucket at sector 0";
+	if (!b[0])
 		goto err;
 
-	ret = bch2_cpu_replicas_to_sb_replicas(c, new_r);
-	if (ret)
+	err = "journal bucket before first bucket";
+	if (m && b[0] < le16_to_cpu(m->first_bucket))
 		goto err;
 
-	if (new_gc) {
-		rcu_assign_pointer(c->replicas_gc, new_gc);
-		kfree_rcu(old_gc, rcu);
-	}
+	err = "journal bucket past end of device";
+	if (m && b[nr - 1] >= le64_to_cpu(m->nbuckets))
+		goto err;
 
-	rcu_assign_pointer(c->replicas, new_r);
-	kfree_rcu(old_r, rcu);
+	err = "duplicate journal buckets";
+	for (i = 0; i + 1 < nr; i++)
+		if (b[i] == b[i + 1])
+			goto err;
 
-	bch2_write_super(c);
-out:
-	ret = 0;
+	err = NULL;
 err:
-	mutex_unlock(&c->sb_lock);
-	return ret;
-}
-
-static inline int __bch2_check_mark_super(struct bch_fs *c,
-				struct bch_replicas_cpu_entry search,
-				unsigned max_dev)
-{
-	struct bch_replicas_cpu *r, *gc_r;
-	bool marked;
-
-	rcu_read_lock();
-	r = rcu_dereference(c->replicas);
-	gc_r = rcu_dereference(c->replicas_gc);
-	marked = replicas_has_entry(r, search, max_dev) &&
-		(!likely(gc_r) || replicas_has_entry(gc_r, search, max_dev));
-	rcu_read_unlock();
-
-	return likely(marked) ? 0
-		: bch2_check_mark_super_slowpath(c, search, max_dev);
+	kfree(b);
+	return err;
 }
 
-int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
-			  enum bch_data_type data_type)
-{
-	struct bch_replicas_cpu_entry search;
-	unsigned max_dev;
+static const struct bch_sb_field_ops bch_sb_field_ops_journal = {
+	.validate	= bch2_sb_validate_journal,
+};
 
-	if (!bkey_to_replicas(e, data_type, &search, &max_dev))
-		return 0;
+/* BCH_SB_FIELD_members: */
 
-	return __bch2_check_mark_super(c, search, max_dev);
-}
-
-int bch2_check_mark_super_devlist(struct bch_fs *c,
-				  struct bch_devs_list *devs,
-				  enum bch_data_type data_type)
+static const char *bch2_sb_validate_members(struct bch_sb *sb,
+					    struct bch_sb_field *f)
 {
-	struct bch_replicas_cpu_entry search = { .data_type = data_type };
-	unsigned i, max_dev = 0;
-
-	if (!devs->nr)
-		return 0;
-
-	for (i = 0; i < devs->nr; i++) {
-		max_dev = max_t(unsigned, max_dev, devs->devs[i]);
-		replicas_set_dev(&search, devs->devs[i]);
-	}
+	struct bch_sb_field_members *mi = field_to_type(f, members);
+	struct bch_member *m;
 
-	return __bch2_check_mark_super(c, search, max_dev);
-}
-
-int bch2_replicas_gc_end(struct bch_fs *c, int err)
-{
-	struct bch_replicas_cpu *new_r, *old_r;
-	int ret = 0;
+	if ((void *) (mi->members + sb->nr_devices) >
+	    vstruct_end(&mi->field))
+		return "Invalid superblock: bad member info";
 
-	lockdep_assert_held(&c->replicas_gc_lock);
+	for (m = mi->members;
+	     m < mi->members + sb->nr_devices;
+	     m++) {
+		if (!bch2_member_exists(m))
+			continue;
 
-	mutex_lock(&c->sb_lock);
+		if (le64_to_cpu(m->nbuckets) > LONG_MAX)
+			return "Too many buckets";
 
-	new_r = rcu_dereference_protected(c->replicas_gc,
-					  lockdep_is_held(&c->sb_lock));
+		if (le64_to_cpu(m->nbuckets) -
+		    le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS)
+			return "Not enough buckets";
 
-	if (err) {
-		rcu_assign_pointer(c->replicas_gc, NULL);
-		kfree_rcu(new_r, rcu);
-		goto err;
-	}
+		if (le16_to_cpu(m->bucket_size) <
+		    le16_to_cpu(sb->block_size))
+			return "bucket size smaller than block size";
 
-	if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
-		ret = -ENOSPC;
-		goto err;
+		if (le16_to_cpu(m->bucket_size) <
+		    BCH_SB_BTREE_NODE_SIZE(sb))
+			return "bucket size smaller than btree node size";
 	}
 
-	old_r = rcu_dereference_protected(c->replicas,
-					  lockdep_is_held(&c->sb_lock));
-
-	rcu_assign_pointer(c->replicas, new_r);
-	rcu_assign_pointer(c->replicas_gc, NULL);
-	kfree_rcu(old_r, rcu);
-
-	bch2_write_super(c);
-err:
-	mutex_unlock(&c->sb_lock);
-	return ret;
+	return NULL;
 }
 
-int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
-{
-	struct bch_replicas_cpu *dst, *src;
-	struct bch_replicas_cpu_entry *e;
-
-	lockdep_assert_held(&c->replicas_gc_lock);
+static const struct bch_sb_field_ops bch_sb_field_ops_members = {
	.validate	= bch2_sb_validate_members,
+};
 
-	mutex_lock(&c->sb_lock);
-	BUG_ON(c->replicas_gc);
-
-	src = rcu_dereference_protected(c->replicas,
-					lockdep_is_held(&c->sb_lock));
+/* BCH_SB_FIELD_crypt: */
 
-	dst = kzalloc(sizeof(struct bch_replicas_cpu) +
-		      src->nr * src->entry_size, GFP_NOIO);
-	if (!dst) {
-		mutex_unlock(&c->sb_lock);
-		return -ENOMEM;
-	}
-
-	dst->nr		= 0;
-	dst->entry_size = src->entry_size;
-
-	for_each_cpu_replicas_entry(src, e)
-		if (!((1 << e->data_type) & typemask))
-			memcpy(cpu_replicas_entry(dst, dst->nr++),
-			       e, dst->entry_size);
+static const char *bch2_sb_validate_crypt(struct bch_sb *sb,
+					  struct bch_sb_field *f)
+{
+	struct bch_sb_field_crypt *crypt = field_to_type(f, crypt);
-	bch2_cpu_replicas_sort(dst);
+	if (vstruct_bytes(&crypt->field) != sizeof(*crypt))
+		return "invalid field crypt: wrong size";
 
-	rcu_assign_pointer(c->replicas_gc, dst);
-	mutex_unlock(&c->sb_lock);
+	if (BCH_CRYPT_KDF_TYPE(crypt))
+		return "invalid field crypt: bad kdf type";
 
-	return 0;
+	return NULL;
 }
 
-/* Replicas tracking - superblock: */
+static const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
+	.validate	= bch2_sb_validate_crypt,
+};
 
-static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r,
-					unsigned *nr,
-					unsigned *bytes,
-					unsigned *max_dev)
-{
-	struct bch_replicas_entry *i;
-	unsigned j;
+/* BCH_SB_FIELD_clean: */
 
-	*nr	= 0;
-	*bytes	= sizeof(*r);
-	*max_dev = 0;
-
-	if (!r)
-		return;
+int bch2_sb_clean_validate(struct bch_fs *c, struct bch_sb_field_clean *clean, int write)
+{
+	struct jset_entry *entry;
+	int ret;
 
-	for_each_replicas_entry(r, i) {
-		for (j = 0; j < i->nr; j++)
-			*max_dev = max_t(unsigned, *max_dev, i->devs[j]);
-		(*nr)++;
+	for (entry = clean->start;
+	     entry < (struct jset_entry *) vstruct_end(&clean->field);
+	     entry = vstruct_next(entry)) {
+		ret = bch2_journal_entry_validate(c, "superblock", entry,
+						  le16_to_cpu(c->disk_sb.sb->version),
+						  BCH_SB_BIG_ENDIAN(c->disk_sb.sb),
+						  write);
+		if (ret)
+			return ret;
 	}
 
-	*bytes = (void *) i - (void *) r;
+	return 0;
 }
 
-static struct bch_replicas_cpu *
-__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r)
+int bch2_fs_mark_dirty(struct bch_fs *c)
 {
-	struct bch_replicas_cpu *cpu_r;
-	unsigned i, nr, bytes, max_dev, entry_size;
-
-	bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev);
-
-	entry_size = offsetof(struct bch_replicas_cpu_entry, devs) +
-		DIV_ROUND_UP(max_dev + 1, 8);
-
-	cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) +
-			nr * entry_size, GFP_NOIO);
-	if (!cpu_r)
-		return NULL;
-
-	cpu_r->nr		= nr;
-	cpu_r->entry_size	= entry_size;
-
-	if (nr) {
-		struct bch_replicas_cpu_entry *dst =
-			cpu_replicas_entry(cpu_r, 0);
-		struct bch_replicas_entry *src = sb_r->entries;
+	int ret;
 
-		while (dst < cpu_replicas_entry(cpu_r, nr)) {
-			dst->data_type = src->data_type;
-			for (i = 0; i < src->nr; i++)
-				replicas_set_dev(dst, src->devs[i]);
+	/*
+	 * Unconditionally write superblock, to verify it hasn't changed before
+	 * we go rw:
+	 */
 
-			src = replicas_entry_next(src);
-			dst = (void *) dst + entry_size;
-		}
-	}
+	mutex_lock(&c->sb_lock);
+	SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+	c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS);
+	c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1);
+	ret = bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
 
-	bch2_cpu_replicas_sort(cpu_r);
-	return cpu_r;
+	return ret;
 }
 
-static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
+static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
 {
-	struct bch_sb_field_replicas *sb_r;
-	struct bch_replicas_cpu *cpu_r, *old_r;
+	struct jset_entry *entry = *end;
+	unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
 
-	sb_r	= bch2_sb_get_replicas(c->disk_sb);
-	cpu_r	= __bch2_sb_replicas_to_cpu_replicas(sb_r);
-	if (!cpu_r)
-		return -ENOMEM;
-
-	old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock));
-	rcu_assign_pointer(c->replicas, cpu_r);
-	if (old_r)
-		kfree_rcu(old_r, rcu);
+	memset(entry, 0, u64s * sizeof(u64));
+	/*
+	 * The u64s field counts from the start of data, ignoring the shared
+	 * fields.
+ */ + entry->u64s = cpu_to_le16(u64s - 1); - return 0; + *end = vstruct_next(*end); + return entry; } -static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, - struct bch_replicas_cpu *r) +void bch2_journal_super_entries_add_common(struct bch_fs *c, + struct jset_entry **end, + u64 journal_seq) { - struct bch_sb_field_replicas *sb_r; - struct bch_replicas_entry *sb_e; - struct bch_replicas_cpu_entry *e; - size_t i, bytes; + struct bch_dev *ca; + unsigned i, dev; - bytes = sizeof(struct bch_sb_field_replicas); + percpu_down_read(&c->mark_lock); - for_each_cpu_replicas_entry(r, e) { - bytes += sizeof(struct bch_replicas_entry); - for (i = 0; i < r->entry_size - 1; i++) - bytes += hweight8(e->devs[i]); + if (!journal_seq) { + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); + } else { + bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); } - sb_r = bch2_fs_sb_resize_replicas(c, - DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64))); - if (!sb_r) - return -ENOSPC; - - memset(&sb_r->entries, 0, - vstruct_end(&sb_r->field) - - (void *) &sb_r->entries); - - sb_e = sb_r->entries; - for_each_cpu_replicas_entry(r, e) { - sb_e->data_type = e->data_type; - - for (i = 0; i < replicas_dev_slots(r); i++) - if (replicas_test_dev(e, i)) - sb_e->devs[sb_e->nr++] = i; - - sb_e = replicas_entry_next(sb_e); + { + struct jset_entry_usage *u = + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); - BUG_ON((void *) sb_e > vstruct_end(&sb_r->field)); + u->entry.type = BCH_JSET_ENTRY_usage; + u->entry.btree_id = FS_USAGE_INODES; + u->v = cpu_to_le64(c->usage_base->nr_inodes); } - return 0; -} - -static const char *bch2_sb_validate_replicas(struct bch_sb *sb) -{ - struct bch_sb_field_members *mi; - struct bch_sb_field_replicas *sb_r; - struct bch_replicas_cpu *cpu_r = NULL; - struct bch_replicas_entry *e; - const char *err; - unsigned i; - - mi = bch2_sb_get_members(sb); - sb_r = bch2_sb_get_replicas(sb); - if (!sb_r) - return NULL; - - for_each_replicas_entry(sb_r, e) { - err = "invalid replicas entry: invalid data type"; - if (e->data_type >= BCH_DATA_NR) - goto err; - - err = "invalid replicas entry: no devices"; - if (!e->nr) - goto err; - - err = "invalid replicas entry: too many devices"; - if (e->nr >= BCH_REPLICAS_MAX) - goto err; + { + struct jset_entry_usage *u = + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); - err = "invalid replicas entry: invalid device"; - for (i = 0; i < e->nr; i++) - if (!bch2_dev_exists(sb, mi, e->devs[i])) - goto err; + u->entry.type = BCH_JSET_ENTRY_usage; + u->entry.btree_id = FS_USAGE_KEY_VERSION; + u->v = cpu_to_le64(atomic64_read(&c->key_version)); } - err = "cannot allocate memory"; - cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r); - if (!cpu_r) - goto err; - - sort_cmp_size(cpu_r->entries, - cpu_r->nr, - cpu_r->entry_size, - memcmp, NULL); - - for (i = 0; i + 1 < cpu_r->nr; i++) { - struct bch_replicas_cpu_entry *l = - cpu_replicas_entry(cpu_r, i); - struct bch_replicas_cpu_entry *r = - cpu_replicas_entry(cpu_r, i + 1); + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + struct jset_entry_usage *u = + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); - BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0); - - err = "duplicate replicas entry"; - if (!memcmp(l, r, cpu_r->entry_size)) - goto err; + u->entry.type = BCH_JSET_ENTRY_usage; + u->entry.btree_id = FS_USAGE_RESERVED; + u->entry.level = i; + u->v = 
+	}
 
-	err = NULL;
-err:
-	kfree(cpu_r);
-	return err;
-}
+	for (i = 0; i < c->replicas.nr; i++) {
+		struct bch_replicas_entry *e =
+			cpu_replicas_entry(&c->replicas, i);
+		struct jset_entry_data_usage *u =
+			container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
+				     struct jset_entry_data_usage, entry);
 
-/* Query replicas: */
+		u->entry.type	= BCH_JSET_ENTRY_data_usage;
+		u->v		= cpu_to_le64(c->usage_base->replicas[i]);
+		memcpy(&u->r, e, replicas_entry_bytes(e));
+	}
 
-bool bch2_sb_has_replicas(struct bch_fs *c, struct bkey_s_c_extent e,
-			  enum bch_data_type data_type)
-{
-	struct bch_replicas_cpu_entry search;
-	unsigned max_dev;
-	bool ret;
+	for_each_member_device(ca, c, dev) {
+		unsigned b = sizeof(struct jset_entry_dev_usage) +
+			sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
+		struct jset_entry_dev_usage *u =
+			container_of(jset_entry_init(end, b),
+				     struct jset_entry_dev_usage, entry);
+
+		u->entry.type = BCH_JSET_ENTRY_dev_usage;
+		u->dev = cpu_to_le32(dev);
+		u->buckets_ec		= cpu_to_le64(ca->usage_base->buckets_ec);
+		u->buckets_unavailable	= cpu_to_le64(ca->usage_base->buckets_unavailable);
+
+		for (i = 0; i < BCH_DATA_NR; i++) {
+			u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
+			u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
+			u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
+		}
+	}
 
-	if (!bkey_to_replicas(e, data_type, &search, &max_dev))
-		return true;
+	percpu_up_read(&c->mark_lock);
 
-	rcu_read_lock();
-	ret = replicas_has_entry(rcu_dereference(c->replicas),
-				 search, max_dev);
-	rcu_read_unlock();
+	for (i = 0; i < 2; i++) {
+		struct jset_entry_clock *clock =
+			container_of(jset_entry_init(end, sizeof(*clock)),
+				     struct jset_entry_clock, entry);
 
-	return ret;
+		clock->entry.type = BCH_JSET_ENTRY_clock;
+		clock->rw	= i;
+		clock->time	= cpu_to_le64(atomic64_read(&c->io_clock[i].now));
+	}
 }
 
-struct replicas_status __bch2_replicas_status(struct bch_fs *c,
-					      struct bch_devs_mask online_devs)
+void bch2_fs_mark_clean(struct bch_fs *c)
 {
-	struct bch_sb_field_members *mi;
-	struct bch_replicas_cpu_entry *e;
-	struct bch_replicas_cpu *r;
-	unsigned i, dev, dev_slots, nr_online, nr_offline;
-	struct replicas_status ret;
-
-	memset(&ret, 0, sizeof(ret));
+	struct bch_sb_field_clean *sb_clean;
+	struct jset_entry *entry;
+	unsigned u64s;
+	int ret;
 
-	for (i = 0; i < ARRAY_SIZE(ret.replicas); i++)
-		ret.replicas[i].nr_online = UINT_MAX;
+	mutex_lock(&c->sb_lock);
+	if (BCH_SB_CLEAN(c->disk_sb.sb))
+		goto out;
 
-	mi = bch2_sb_get_members(c->disk_sb);
-	rcu_read_lock();
+	SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
 
-	r = rcu_dereference(c->replicas);
-	dev_slots = replicas_dev_slots(r);
+	c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
+	c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata);
+	c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates));
+	c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled));
 
-	for_each_cpu_replicas_entry(r, e) {
-		if (e->data_type >= ARRAY_SIZE(ret.replicas))
-			panic("e %p data_type %u\n", e, e->data_type);
+	u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
 
-		nr_online = nr_offline = 0;
+	sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s);
+	if (!sb_clean) {
+		bch_err(c, "error resizing superblock while setting filesystem clean");
+		goto out;
+	}
 
-		for (dev = 0; dev < dev_slots; dev++) {
-			if (!replicas_test_dev(e, dev))
-				continue;
+	sb_clean->flags		= 0;
+	sb_clean->journal_seq	= cpu_to_le64(journal_cur_seq(&c->journal) - 1);
 
-			BUG_ON(!bch2_dev_exists(c->disk_sb, mi, dev));
+	/* Trying to catch outstanding bug: */
+	BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
 
-			if (test_bit(dev, online_devs.d))
-				nr_online++;
-			else
-				nr_offline++;
-		}
+	entry = sb_clean->start;
+	bch2_journal_super_entries_add_common(c, &entry, 0);
+	entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
+	BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
 
-		ret.replicas[e->data_type].nr_online =
-			min(ret.replicas[e->data_type].nr_online,
-			    nr_online);
+	memset(entry, 0,
+	       vstruct_end(&sb_clean->field) - (void *) entry);
 
-		ret.replicas[e->data_type].nr_offline =
-			max(ret.replicas[e->data_type].nr_offline,
-			    nr_offline);
+	/*
+	 * this should be in the write path, and we should be validating every
+	 * superblock section:
+	 */
+	ret = bch2_sb_clean_validate(c, sb_clean, WRITE);
+	if (ret) {
+		bch_err(c, "error writing marking filesystem clean: validate error");
+		goto out;
 	}
 
-	rcu_read_unlock();
-
-	return ret;
-}
-
-struct replicas_status bch2_replicas_status(struct bch_fs *c)
-{
-	return __bch2_replicas_status(c, bch2_online_devs(c));
+	bch2_write_super(c);
+out:
+	mutex_unlock(&c->sb_lock);
 }
 
-bool bch2_have_enough_devs(struct bch_fs *c,
-			   struct replicas_status s,
-			   unsigned flags)
+static const char *bch2_sb_validate_clean(struct bch_sb *sb,
+					  struct bch_sb_field *f)
 {
-	if ((s.replicas[BCH_DATA_JOURNAL].nr_offline ||
-	     s.replicas[BCH_DATA_BTREE].nr_offline) &&
-	    !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
-		return false;
+	struct bch_sb_field_clean *clean = field_to_type(f, clean);
 
-	if ((!s.replicas[BCH_DATA_JOURNAL].nr_online ||
-	     !s.replicas[BCH_DATA_BTREE].nr_online) &&
-	    !(flags & BCH_FORCE_IF_METADATA_LOST))
-		return false;
+	if (vstruct_bytes(&clean->field) < sizeof(*clean))
+		return "invalid field crypt: wrong size";
 
-	if (s.replicas[BCH_DATA_USER].nr_offline &&
-	    !(flags & BCH_FORCE_IF_DATA_DEGRADED))
-		return false;
+	return NULL;
+}
 
-	if (!s.replicas[BCH_DATA_USER].nr_online &&
-	    !(flags & BCH_FORCE_IF_DATA_LOST))
-		return false;
+static const struct bch_sb_field_ops bch_sb_field_ops_clean = {
+	.validate	= bch2_sb_validate_clean,
+};
 
-	return true;
-}
+static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
+#define x(f, nr)					\
+	[BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,
+	BCH_SB_FIELDS()
+#undef x
+};
 
-unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
+static const char *bch2_sb_field_validate(struct bch_sb *sb,
+					  struct bch_sb_field *f)
 {
-	struct replicas_status s = bch2_replicas_status(c);
+	unsigned type = le32_to_cpu(f->type);
 
-	return meta
-		? min(s.replicas[BCH_DATA_JOURNAL].nr_online,
-		      s.replicas[BCH_DATA_BTREE].nr_online)
-		: s.replicas[BCH_DATA_USER].nr_online;
+	return type < BCH_SB_FIELD_NR
+		? bch2_sb_field_ops[type]->validate(sb, f)
+		: NULL;
 }
 
-unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
+void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
+			   struct bch_sb_field *f)
 {
-	struct bch_replicas_cpu_entry *e;
-	struct bch_replicas_cpu *r;
-	unsigned ret = 0;
+	unsigned type = le32_to_cpu(f->type);
+	const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR
+		? bch2_sb_field_ops[type] : NULL;
 
-	rcu_read_lock();
-	r = rcu_dereference(c->replicas);
+	if (ops)
+		pr_buf(out, "%s", bch2_sb_fields[type]);
+	else
+		pr_buf(out, "(unknown field %u)", type);
 
-	if (ca->dev_idx >= replicas_dev_slots(r))
-		goto out;
+	pr_buf(out, " (size %llu):", vstruct_bytes(f));
 
-	for_each_cpu_replicas_entry(r, e)
-		if (replicas_test_dev(e, ca->dev_idx)) {
-			ret |= 1 << e->data_type;
-			break;
-		}
-out:
-	rcu_read_unlock();
-
-	return ret;
+	if (ops && ops->to_text)
+		bch2_sb_field_ops[type]->to_text(out, sb, f);
 }
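
/*
 * Editor's note: the new bch2_sb_fields[] / bch2_sb_field_ops[] code in this
 * patch uses a single x-macro field list to generate the name strings and the
 * per-field validate dispatch table, so the two can never drift out of sync.
 * Below is a minimal, self-contained sketch of that pattern, NOT bcachefs
 * code: all EXAMPLE_* names are illustrative assumptions made up for this
 * demonstration.
 */
#include <stdio.h>

#define EXAMPLE_SB_FIELDS()	\
	x(journal, 0)		\
	x(members, 1)		\
	x(crypt, 2)

enum example_field_type {
#define x(name, nr)	EXAMPLE_FIELD_##name = nr,
	EXAMPLE_SB_FIELDS()
#undef x
	EXAMPLE_FIELD_NR
};

/* Name strings, generated from the same list: */
static const char * const example_field_names[] = {
#define x(name, nr)	#name,
	EXAMPLE_SB_FIELDS()
#undef x
	NULL
};

struct example_field {
	unsigned	type;
	unsigned	u64s;	/* size of the field body, in u64s */
};

struct example_field_ops {
	const char	*(*validate)(const struct example_field *);
};

/* Toy validators standing in for bch2_sb_validate_{journal,members,crypt}: */
static const char *example_validate_journal(const struct example_field *f)
{
	return f->u64s ? NULL : "journal field may not be empty";
}

static const char *example_validate_members(const struct example_field *f)
{
	return f->u64s >= 2 ? NULL : "member info too small";
}

static const char *example_validate_crypt(const struct example_field *f)
{
	return f->u64s == 4 ? NULL : "wrong size";
}

/* One ops struct per field, generated from the same list: */
#define x(name, nr)							\
static const struct example_field_ops example_field_ops_##name = {	\
	.validate = example_validate_##name,				\
};
EXAMPLE_SB_FIELDS()
#undef x

/* Dispatch table indexed by field type, generated from the same list: */
static const struct example_field_ops *example_field_ops[] = {
#define x(name, nr)	[EXAMPLE_FIELD_##name] = &example_field_ops_##name,
	EXAMPLE_SB_FIELDS()
#undef x
};

/* Unknown field types validate as NULL (no error), for forward compat: */
static const char *example_field_validate(const struct example_field *f)
{
	return f->type < EXAMPLE_FIELD_NR
		? example_field_ops[f->type]->validate(f)
		: NULL;
}

int main(void)
{
	struct example_field f = { .type = EXAMPLE_FIELD_crypt, .u64s = 3 };
	const char *err = example_field_validate(&f);

	/* prints "crypt: wrong size" */
	printf("%s: %s\n", example_field_names[f.type], err ? err : "ok");
	return 0;
}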