From 63065c01285601afbe2457e92729efc11581e37d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 8 May 2017 02:28:15 -0800 Subject: [PATCH] Update bcachefs sources to 9ceb982d77 bcachefs: Store bucket gens in a btree --- .bcachefs_revision | 2 +- cmd_debug.c | 8 +- cmd_fsck.c | 2 + cmd_migrate.c | 2 +- include/linux/bitops.h | 5 + libbcachefs.c | 10 +- libbcachefs/alloc.c | 898 ++++++++++++++++++---------------- libbcachefs/alloc.h | 25 +- libbcachefs/alloc_types.h | 17 +- libbcachefs/bcachefs.h | 43 +- libbcachefs/bcachefs_format.h | 194 +++----- libbcachefs/bcachefs_ioctl.h | 14 +- libbcachefs/bkey.h | 2 + libbcachefs/bkey_methods.c | 2 + libbcachefs/btree_gc.c | 60 ++- libbcachefs/btree_io.c | 2 +- libbcachefs/btree_update.c | 137 +++++- libbcachefs/btree_update.h | 10 +- libbcachefs/buckets.c | 55 ++- libbcachefs/buckets.h | 4 +- libbcachefs/buckets_types.h | 24 +- libbcachefs/extents.c | 19 +- libbcachefs/io.c | 3 +- libbcachefs/journal.c | 151 ++---- libbcachefs/journal.h | 22 + libbcachefs/journal_types.h | 15 - libbcachefs/migrate.c | 69 ++- libbcachefs/opts.h | 2 + libbcachefs/str_hash.h | 3 - libbcachefs/super-io.c | 530 ++++++++++++++++++-- libbcachefs/super-io.h | 131 +++-- libbcachefs/super.c | 322 ++++++------ libbcachefs/sysfs.c | 13 +- libbcachefs/util.c | 44 ++ libbcachefs/util.h | 4 + 35 files changed, 1734 insertions(+), 1110 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index 81d9f67..c5ef773 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -4231dd5cf0f04dd61b0b8bae44a357da8331c0e2 +9ceb982d7790f552e2f5c96bebeab176516cf144 diff --git a/cmd_debug.c b/cmd_debug.c index 974e862..d4613ec 100644 --- a/cmd_debug.c +++ b/cmd_debug.c @@ -55,12 +55,6 @@ static void dump_one_device(struct bch_fs *c, struct bch_dev *ca, int fd) bucket_bytes(ca)); } - /* Prios/gens: */ - for (i = 0; i < prio_buckets(ca); i++) - range_add(&data, - bucket_bytes(ca) * ca->prio_last_buckets[i], - bucket_bytes(ca)); - /* Btree: */ for (i = 0; i < BTREE_ID_NR; i++) { const struct bch_extent_ptr *ptr; @@ -97,6 +91,7 @@ int cmd_dump(int argc, char *argv[]) opts.nochanges = true; opts.noreplay = true; opts.errors = BCH_ON_ERROR_CONTINUE; + opts.degraded = true; while ((opt = getopt(argc, argv, "o:fh")) != -1) switch (opt) { @@ -273,6 +268,7 @@ int cmd_list(int argc, char *argv[]) opts.nochanges = true; opts.norecovery = true; opts.errors = BCH_ON_ERROR_CONTINUE; + opts.degraded = true; while ((opt = getopt(argc, argv, "b:s:e:i:m:fvh")) != -1) switch (opt) { diff --git a/cmd_fsck.c b/cmd_fsck.c index 1775067..5ca9b82 100644 --- a/cmd_fsck.c +++ b/cmd_fsck.c @@ -27,6 +27,8 @@ int cmd_fsck(int argc, char *argv[]) const char *err; int opt; + opts.degraded = true; + while ((opt = getopt(argc, argv, "pynfvh")) != -1) switch (opt) { case 'p': diff --git a/cmd_migrate.c b/cmd_migrate.c index 72cc004..bf8f0be 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -333,7 +333,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst, die("error reserving space in new filesystem: %s", strerror(-ret)); - bch2_check_mark_super(c, &e->k_i, false); + bch2_check_mark_super(c, extent_i_to_s_c(e), false); ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &e->k_i, &res, NULL, NULL, 0); diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 47fffb7..239574c 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -112,6 +112,11 @@ static inline unsigned long hweight_long(unsigned long w) return __builtin_popcountl(w); } +static inline unsigned long 
hweight8(unsigned long w) +{ + return __builtin_popcountl(w); +} + /** * rol64 - rotate a 64-bit value left * @word: value to rotate diff --git a/libbcachefs.c b/libbcachefs.c index 73ea2d1..f68a45f 100644 --- a/libbcachefs.c +++ b/libbcachefs.c @@ -176,10 +176,8 @@ struct bch_sb *bch2_format(struct format_opts opts, SET_BCH_SB_BTREE_NODE_SIZE(sb, opts.btree_node_size); SET_BCH_SB_GC_RESERVE(sb, 8); SET_BCH_SB_META_REPLICAS_WANT(sb, opts.meta_replicas); - SET_BCH_SB_META_REPLICAS_HAVE(sb, opts.meta_replicas); SET_BCH_SB_META_REPLICAS_REQ(sb, opts.meta_replicas_required); SET_BCH_SB_DATA_REPLICAS_WANT(sb, opts.data_replicas); - SET_BCH_SB_DATA_REPLICAS_HAVE(sb, opts.data_replicas); SET_BCH_SB_DATA_REPLICAS_REQ(sb, opts.data_replicas_required); SET_BCH_SB_ERROR_ACTION(sb, opts.on_error_action); SET_BCH_SB_STR_HASH_TYPE(sb, BCH_STR_HASH_SIPHASH); @@ -339,9 +337,9 @@ void bch2_super_print(struct bch_sb *sb, int units) BCH_SB_CLEAN(sb), - BCH_SB_META_REPLICAS_HAVE(sb), + 0LLU, //BCH_SB_META_REPLICAS_HAVE(sb), BCH_SB_META_REPLICAS_WANT(sb), - BCH_SB_DATA_REPLICAS_HAVE(sb), + 0LLU, //BCH_SB_DATA_REPLICAS_HAVE(sb), BCH_SB_DATA_REPLICAS_WANT(sb), BCH_SB_META_CSUM_TYPE(sb) < BCH_CSUM_NR @@ -405,8 +403,8 @@ void bch2_super_print(struct bch_sb *sb, int units) : "unknown", BCH_MEMBER_TIER(m), - BCH_MEMBER_HAS_METADATA(m), - BCH_MEMBER_HAS_DATA(m), + 0LLU, //BCH_MEMBER_HAS_METADATA(m), + 0LLU, //BCH_MEMBER_HAS_DATA(m), BCH_MEMBER_REPLACEMENT(m) < CACHE_REPLACEMENT_NR ? bch2_cache_replacement_policies[BCH_MEMBER_REPLACEMENT(m)] diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c index 9d54dd8..5a258cb 100644 --- a/libbcachefs/alloc.c +++ b/libbcachefs/alloc.c @@ -75,7 +75,6 @@ #include #include -static void __bch2_bucket_free(struct bch_dev *, struct bucket *); static void bch2_recalc_min_prio(struct bch_dev *, int); /* Allocation groups: */ @@ -206,268 +205,244 @@ static void pd_controllers_update(struct work_struct *work) c->pd_controllers_update_seconds * HZ); } -/* - * Bucket priorities/gens: - * - * For each bucket, we store on disk its - * 8 bit gen - * 16 bit priority - * - * See alloc.c for an explanation of the gen. The priority is used to implement - * lru (and in the future other) cache replacement policies; for most purposes - * it's just an opaque integer. - * - * The gens and the priorities don't have a whole lot to do with each other, and - * it's actually the gens that must be written out at specific times - it's no - * big deal if the priorities don't get written, if we lose them we just reuse - * buckets in suboptimal order. - * - * On disk they're stored in a packed array, and in as many buckets are required - * to fit them all. The buckets we use to store them form a list; the journal - * header points to the first bucket, the first bucket points to the second - * bucket, et cetera. - * - * This code is used by the allocation code; periodically (whenever it runs out - * of buckets to allocate from) the allocation code will invalidate some - * buckets, but it can't use those buckets until their new gens are safely on - * disk. 
- */ +static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) +{ + unsigned bytes = offsetof(struct bch_alloc, data); + + if (a->fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) + bytes += 2; + if (a->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) + bytes += 2; + + return DIV_ROUND_UP(bytes, sizeof(u64)); +} -static int prio_io(struct bch_dev *ca, uint64_t bucket, int op) +static const char *bch2_alloc_invalid(const struct bch_fs *c, + struct bkey_s_c k) { - bio_init(ca->bio_prio, ca->bio_prio->bi_inline_vecs, bucket_pages(ca)); - ca->bio_prio->bi_opf = op|REQ_SYNC|REQ_META; - ca->bio_prio->bi_iter.bi_sector = bucket * ca->mi.bucket_size; - ca->bio_prio->bi_bdev = ca->disk_sb.bdev; - ca->bio_prio->bi_iter.bi_size = bucket_bytes(ca); - bch2_bio_map(ca->bio_prio, ca->disk_buckets); - - return submit_bio_wait(ca->bio_prio); + if (k.k->p.inode >= c->sb.nr_devices || + !c->devs[k.k->p.inode]) + return "invalid device"; + + switch (k.k->type) { + case BCH_ALLOC: { + struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); + + if (bch_alloc_val_u64s(a.v) != bkey_val_u64s(a.k)) + return "incorrect value size"; + break; + } + default: + return "invalid type"; + } + + return NULL; } -static struct nonce prio_nonce(struct prio_set *p) +static void bch2_alloc_to_text(struct bch_fs *c, char *buf, + size_t size, struct bkey_s_c k) { - return (struct nonce) {{ - [0] = 0, - [1] = p->nonce[0], - [2] = p->nonce[1], - [3] = p->nonce[2]^BCH_NONCE_PRIO, - }}; + buf[0] = '\0'; + + switch (k.k->type) { + case BCH_ALLOC: + break; + } } -int bch2_prio_write(struct bch_dev *ca) +const struct bkey_ops bch2_bkey_alloc_ops = { + .key_invalid = bch2_alloc_invalid, + .val_to_text = bch2_alloc_to_text, +}; + +static inline unsigned get_alloc_field(const u8 **p, unsigned bytes) { - struct bch_fs *c = ca->fs; - struct journal *j = &c->journal; - struct journal_res res = { 0 }; - bool need_new_journal_entry; - int i, ret = 0; + unsigned v; - if (c->opts.nochanges) - return 0; + switch (bytes) { + case 1: + v = **p; + break; + case 2: + v = le16_to_cpup((void *) *p); + break; + case 4: + v = le32_to_cpup((void *) *p); + break; + default: + BUG(); + } - mutex_lock(&ca->prio_write_lock); - trace_prio_write_start(ca); + *p += bytes; + return v; +} - ca->need_prio_write = false; +static inline void put_alloc_field(u8 **p, unsigned bytes, unsigned v) +{ + switch (bytes) { + case 1: + **p = v; + break; + case 2: + *((__le16 *) *p) = cpu_to_le16(v); + break; + case 4: + *((__le32 *) *p) = cpu_to_le32(v); + break; + default: + BUG(); + } - atomic64_add(ca->mi.bucket_size * prio_buckets(ca), - &ca->meta_sectors_written); + *p += bytes; +} - for (i = prio_buckets(ca) - 1; i >= 0; --i) { - struct bucket *g; - struct prio_set *p = ca->disk_buckets; - struct bucket_disk *d = p->data; - struct bucket_disk *end = d + prios_per_bucket(ca); - size_t r; +static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k) +{ + struct bch_dev *ca; + struct bkey_s_c_alloc a; + struct bucket_mark new; + struct bucket *g; + const u8 *d; - for (r = i * prios_per_bucket(ca); - r < ca->mi.nbuckets && d < end; - r++, d++) { - g = ca->buckets + r; - d->prio[READ] = cpu_to_le16(g->prio[READ]); - d->prio[WRITE] = cpu_to_le16(g->prio[WRITE]); - d->gen = ca->buckets[r].mark.gen; - } + if (k.k->type != BCH_ALLOC) + return; - p->next_bucket = cpu_to_le64(ca->prio_buckets[i + 1]); - p->magic = cpu_to_le64(pset_magic(c)); - get_random_bytes(&p->nonce, sizeof(p->nonce)); + a = bkey_s_c_to_alloc(k); + ca = c->devs[a.k->p.inode]; - spin_lock(&ca->prio_buckets_lock); - r = 
bch2_bucket_alloc(ca, RESERVE_PRIO); - BUG_ON(!r); + if (a.k->p.offset >= ca->mi.nbuckets) + return; - /* - * goes here before dropping prio_buckets_lock to guard against - * it getting gc'd from under us - */ - ca->prio_buckets[i] = r; - bch2_mark_metadata_bucket(ca, ca->buckets + r, - BUCKET_PRIOS, false); - spin_unlock(&ca->prio_buckets_lock); - - SET_PSET_CSUM_TYPE(p, bch2_meta_checksum_type(c)); - - bch2_encrypt(c, PSET_CSUM_TYPE(p), - prio_nonce(p), - p->encrypted_start, - bucket_bytes(ca) - - offsetof(struct prio_set, encrypted_start)); - - p->csum = bch2_checksum(c, PSET_CSUM_TYPE(p), - prio_nonce(p), - (void *) p + sizeof(p->csum), - bucket_bytes(ca) - sizeof(p->csum)); - - ret = prio_io(ca, r, REQ_OP_WRITE); - if (bch2_dev_fatal_io_err_on(ret, ca, - "prio write to bucket %zu", r) || - bch2_meta_write_fault("prio")) - goto err; - } + g = ca->buckets + a.k->p.offset; + bucket_cmpxchg(g, new, ({ + new.gen = a.v->gen; + new.gen_valid = 1; + })); + + d = a.v->data; + if (a.v->fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) + g->prio[READ] = get_alloc_field(&d, 2); + if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) + g->prio[WRITE] = get_alloc_field(&d, 2); +} - spin_lock(&j->lock); - j->prio_buckets[ca->dev_idx] = cpu_to_le64(ca->prio_buckets[0]); - j->nr_prio_buckets = max_t(unsigned, - ca->dev_idx + 1, - j->nr_prio_buckets); - spin_unlock(&j->lock); +int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) +{ + struct journal_replay *r; + struct btree_iter iter; + struct bkey_s_c k; + int ret; - do { - unsigned u64s = jset_u64s(0); + if (!c->btree_roots[BTREE_ID_ALLOC].b) + return 0; - if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) - break; + for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS_MIN, 0, k) { + bch2_alloc_read_key(c, k); + bch2_btree_iter_cond_resched(&iter); + } - ret = bch2_journal_res_get(j, &res, u64s, u64s); - if (ret) - goto err; + ret = bch2_btree_iter_unlock(&iter); + if (ret) + return ret; - need_new_journal_entry = j->buf[res.idx].nr_prio_buckets < - ca->dev_idx + 1; - bch2_journal_res_put(j, &res); + list_for_each_entry(r, journal_replay_list, list) { + struct bkey_i *k, *n; + struct jset_entry *entry; - ret = bch2_journal_flush_seq(j, res.seq); - if (ret) - goto err; - } while (need_new_journal_entry); + for_each_jset_key(k, n, entry, &r->j) + if (entry->btree_id == BTREE_ID_ALLOC) + bch2_alloc_read_key(c, bkey_i_to_s_c(k)); + } - /* - * Don't want the old priorities to get garbage collected until after we - * finish writing the new ones, and they're journalled - */ + return 0; +} - spin_lock(&ca->prio_buckets_lock); +static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, + struct bucket *g, struct btree_iter *iter, + u64 *journal_seq) +{ + struct bucket_mark m = READ_ONCE(g->mark); + __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key; + struct bkey_i_alloc *a; + u8 *d; + int ret; - for (i = 0; i < prio_buckets(ca); i++) { - if (ca->prio_last_buckets[i]) - __bch2_bucket_free(ca, - &ca->buckets[ca->prio_last_buckets[i]]); + bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, g - ca->buckets)); - ca->prio_last_buckets[i] = ca->prio_buckets[i]; - } + do { + ret = bch2_btree_iter_traverse(iter); + if (ret) + break; - spin_unlock(&ca->prio_buckets_lock); + a = bkey_alloc_init(&alloc_key.k); + a->k.p = iter->pos; + a->v.fields = 0; + a->v.gen = m.gen; + set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v)); + + d = a->v.data; + if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) + put_alloc_field(&d, 2, 
g->prio[READ]); + if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) + put_alloc_field(&d, 2, g->prio[WRITE]); + + bch2_btree_iter_set_pos(iter, a->k.p); + ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_USE_ALLOC_RESERVE| + BTREE_INSERT_NOWAIT, + BTREE_INSERT_ENTRY(iter, &a->k_i)); + bch2_btree_iter_cond_resched(iter); + } while (ret == -EINTR); - trace_prio_write_end(ca); -err: - mutex_unlock(&ca->prio_write_lock); return ret; } -int bch2_prio_read(struct bch_dev *ca) +int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos) { - struct bch_fs *c = ca->fs; - struct prio_set *p = ca->disk_buckets; - struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d; - struct bucket_mark new; - struct bch_csum csum; - unsigned bucket_nr = 0; - u64 bucket, expect, got; - size_t b; - int ret = 0; + struct bch_dev *ca; + struct bucket *g; + struct btree_iter iter; + int ret; - if (ca->prio_read_done) - return 0; + lockdep_assert_held(&c->state_lock); - ca->prio_read_done = true; + if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode]) + return 0; - spin_lock(&c->journal.lock); - bucket = le64_to_cpu(c->journal.prio_buckets[ca->dev_idx]); - spin_unlock(&c->journal.lock); + ca = c->devs[pos.inode]; - /* - * If the device hasn't been used yet, there won't be a prio bucket ptr - */ - if (!bucket) + if (pos.offset >= ca->mi.nbuckets) return 0; - if (mustfix_fsck_err_on(bucket < ca->mi.first_bucket || - bucket >= ca->mi.nbuckets, c, - "bad prio bucket %llu", bucket)) - return 0; + g = ca->buckets + pos.offset; - for (b = 0; b < ca->mi.nbuckets; b++, d++) { - if (d == end) { - ca->prio_last_buckets[bucket_nr] = bucket; - bucket_nr++; - - ret = prio_io(ca, bucket, REQ_OP_READ) || - bch2_meta_read_fault("prio"); - - if (mustfix_fsck_err_on(ret, c, - "IO error reading bucket gens (%i)", - ret)) - return 0; - - got = le64_to_cpu(p->magic); - expect = pset_magic(c); - if (mustfix_fsck_err_on(got != expect, c, - "bad magic (got %llu expect %llu) while reading prios from bucket %llu", - got, expect, bucket)) - return 0; - - if (mustfix_fsck_err_on(PSET_CSUM_TYPE(p) >= BCH_CSUM_NR, c, - "prio bucket with unknown csum type %llu bucket %lluu", - PSET_CSUM_TYPE(p), bucket)) - return 0; - - csum = bch2_checksum(c, PSET_CSUM_TYPE(p), - prio_nonce(p), - (void *) p + sizeof(p->csum), - bucket_bytes(ca) - sizeof(p->csum)); - if (fsck_err_on(bch2_crc_cmp(csum, p->csum), c, - "bad checksum reading prios from bucket %llu", - bucket)) - return 0; - - bch2_encrypt(c, PSET_CSUM_TYPE(p), - prio_nonce(p), - p->encrypted_start, - bucket_bytes(ca) - - offsetof(struct prio_set, encrypted_start)); - - bucket = le64_to_cpu(p->next_bucket); - d = p->data; - } + bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN, + BTREE_ITER_INTENT); - ca->buckets[b].prio[READ] = le16_to_cpu(d->prio[READ]); - ca->buckets[b].prio[WRITE] = le16_to_cpu(d->prio[WRITE]); + ret = __bch2_alloc_write_key(c, ca, g, &iter, NULL); + bch2_btree_iter_unlock(&iter); + return ret; +} - bucket_cmpxchg(&ca->buckets[b], new, ({ - new.gen = d->gen; - new.gen_valid = 1; - })); - } +int bch2_alloc_write(struct bch_fs *c, struct bch_dev *ca, u64 *journal_seq) +{ + struct btree_iter iter; + struct bucket *g; + int ret = 0; - mutex_lock(&c->bucket_lock); - bch2_recalc_min_prio(ca, READ); - bch2_recalc_min_prio(ca, WRITE); - mutex_unlock(&c->bucket_lock); + bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN, + BTREE_ITER_INTENT); + + for_each_bucket(g, ca) { + 
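		/*
		 * Each bucket is written out as its own BCH_ALLOC key at
		 * POS(device index, bucket number): __bch2_alloc_write_key()
		 * packs the bucket's current gen (plus any optional fields
		 * flagged in bch_alloc.fields) into the value, and inserts it
		 * with NOFAIL/USE_ALLOC_RESERVE so the alloc btree is updated
		 * out of its own btree node reserve (cf. __bch2_btree_node_alloc()).
		 */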
ret = __bch2_alloc_write_key(c, ca, g, &iter, journal_seq); + if (ret) + break; + } - ret = 0; -fsck_err: + bch2_btree_iter_unlock(&iter); return ret; } @@ -516,9 +491,6 @@ static void verify_not_on_freelist(struct bch_dev *ca, size_t bucket) long i; unsigned j; - for (iter = 0; iter < prio_buckets(ca) * 2; iter++) - BUG_ON(ca->prio_buckets[iter] == bucket); - for (j = 0; j < RESERVE_NR; j++) fifo_for_each_entry(i, &ca->free[j], iter) BUG_ON(i == bucket); @@ -651,17 +623,37 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, struct bucket *g, static void bch2_invalidate_one_bucket(struct bch_dev *ca, struct bucket *g) { - spin_lock(&ca->freelist_lock); - - bch2_invalidate_bucket(ca, g); + struct bch_fs *c = ca->fs; + struct bucket_mark m; - g->prio[READ] = ca->fs->prio_clock[READ].hand; - g->prio[WRITE] = ca->fs->prio_clock[WRITE].hand; + spin_lock(&ca->freelist_lock); + if (!bch2_invalidate_bucket(ca, g, &m)) { + spin_unlock(&ca->freelist_lock); + return; + } verify_not_on_freelist(ca, g - ca->buckets); BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets)); - spin_unlock(&ca->freelist_lock); + + g->prio[READ] = c->prio_clock[READ].hand; + g->prio[WRITE] = c->prio_clock[WRITE].hand; + + if (m.cached_sectors) { + ca->allocator_invalidating_data = true; + } else if (m.journal_seq_valid) { + u64 journal_seq = atomic64_read(&c->journal.seq); + u64 bucket_seq = journal_seq; + + bucket_seq &= ~((u64) U16_MAX); + bucket_seq |= m.journal_seq; + + if (bucket_seq > journal_seq) + bucket_seq -= 1 << 16; + + ca->allocator_journal_seq_flush = + max(ca->allocator_journal_seq_flush, bucket_seq); + } } /* @@ -686,11 +678,23 @@ static unsigned long bucket_sort_key(struct bch_dev *ca, struct bucket *g, struct bucket_mark m) { + /* + * Time since last read, scaled to [0, 8) where larger value indicates + * more recently read data: + */ unsigned long hotness = (g->prio[READ] - ca->min_prio[READ]) * 7 / (ca->fs->prio_clock[READ].hand - ca->min_prio[READ]); - return (((hotness + 1) * bucket_sectors_used(m)) << 8) | + /* How much we want to keep the data in this bucket: */ + unsigned long data_wantness = + (hotness + 1) * bucket_sectors_used(m); + + unsigned long needs_journal_commit = + bucket_needs_journal_commit(m, ca->fs->journal.last_seq_ondisk); + + return (data_wantness << 9) | + (needs_journal_commit << 8) | bucket_gc_gen(ca, g); } @@ -790,8 +794,8 @@ static void invalidate_buckets_random(struct bch_dev *ca) static void invalidate_buckets(struct bch_dev *ca) { - ca->inc_gen_needs_gc = 0; - ca->inc_gen_really_needs_gc = 0; + ca->inc_gen_needs_gc = 0; + ca->inc_gen_really_needs_gc = 0; switch (ca->mi.replacement) { case CACHE_REPLACEMENT_LRU: @@ -806,73 +810,82 @@ static void invalidate_buckets(struct bch_dev *ca) } } -static bool __bch2_allocator_push(struct bch_dev *ca, long bucket) +static int size_t_cmp(const void *_l, const void *_r) { - if (fifo_push(&ca->free[RESERVE_PRIO], bucket)) - goto success; - - if (fifo_push(&ca->free[RESERVE_MOVINGGC], bucket)) - goto success; - - if (fifo_push(&ca->free[RESERVE_BTREE], bucket)) - goto success; - - if (fifo_push(&ca->free[RESERVE_NONE], bucket)) - goto success; + const size_t *l = _l, *r = _r; - return false; -success: - closure_wake_up(&ca->fs->freelist_wait); - return true; + return (*l > *r) - (*l < *r); } -static bool bch2_allocator_push(struct bch_dev *ca, long bucket) +static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca, + u64 *journal_seq) { - bool ret; + struct btree_iter iter; + unsigned nr_invalidated = 0; + size_t b, 
i; + int ret = 0; - spin_lock(&ca->freelist_lock); - ret = __bch2_allocator_push(ca, bucket); - if (ret) - fifo_pop(&ca->free_inc, bucket); - spin_unlock(&ca->freelist_lock); + bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), + BTREE_ITER_INTENT); - return ret; + fifo_for_each_entry(b, &ca->free_inc, i) { + ret = __bch2_alloc_write_key(c, ca, ca->buckets + b, + &iter, journal_seq); + if (ret) + break; + + nr_invalidated++; + } + + bch2_btree_iter_unlock(&iter); + return nr_invalidated ?: ret; } -static void bch2_find_empty_buckets(struct bch_fs *c, struct bch_dev *ca) +/* + * Given an invalidated, ready to use bucket: issue a discard to it if enabled, + * then add it to the freelist, waiting until there's room if necessary: + */ +static void discard_invalidated_bucket(struct bch_dev *ca, long bucket) { - u16 last_seq_ondisk = c->journal.last_seq_ondisk; - struct bucket *g; + if (ca->mi.discard && + blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, bucket), + ca->mi.bucket_size, GFP_NOIO, 0); - for_each_bucket(g, ca) { - struct bucket_mark m = READ_ONCE(g->mark); - if (is_available_bucket(m) && - !m.cached_sectors && - !m.had_metadata && - !bucket_needs_journal_commit(m, last_seq_ondisk)) { - spin_lock(&ca->freelist_lock); + while (1) { + bool pushed = false; + unsigned i; - bch2_mark_alloc_bucket(ca, g, true); - g->prio[READ] = c->prio_clock[READ].hand; - g->prio[WRITE] = c->prio_clock[WRITE].hand; + set_current_state(TASK_INTERRUPTIBLE); - verify_not_on_freelist(ca, g - ca->buckets); - BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets)); + /* + * Don't remove from free_inc until after it's added to + * freelist, so gc can find it: + */ + spin_lock(&ca->freelist_lock); + for (i = 0; i < RESERVE_NR; i++) + if (fifo_push(&ca->free[i], bucket)) { + fifo_pop(&ca->free_inc, bucket); + closure_wake_up(&ca->fs->freelist_wait); + pushed = true; + break; + } + spin_unlock(&ca->freelist_lock); - spin_unlock(&ca->freelist_lock); + if (pushed) + break; - if (fifo_full(&ca->free_inc)) - break; + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + break; } + schedule(); + try_to_freeze(); } -} - -static int size_t_cmp(const void *_l, const void *_r) -{ - const size_t *l = _l, *r = _r; - return (*l > *r) - (*l < *r); + __set_current_state(TASK_RUNNING); } /** @@ -887,57 +900,26 @@ static int bch2_allocator_thread(void *arg) { struct bch_dev *ca = arg; struct bch_fs *c = ca->fs; - long bucket; + size_t bucket; int ret; set_freezable(); - bch2_find_empty_buckets(c, ca); - - while (1) { - /* - * First, we pull buckets off of the free_inc list, possibly - * issue discards to them, then we add the bucket to a - * free list: - */ - - while (!fifo_empty(&ca->free_inc)) { - bucket = fifo_peek(&ca->free_inc); - - /* - * Don't remove from free_inc until after it's added - * to freelist, so gc doesn't miss it while we've - * dropped bucket lock - */ - - if (ca->mi.discard && - blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, bucket), - ca->mi.bucket_size, GFP_NOIO, 0); - - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - if (bch2_allocator_push(ca, bucket)) - break; - - if (kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - goto out; - } - schedule(); - try_to_freeze(); - } - - __set_current_state(TASK_RUNNING); - } - - /* We've run out of free buckets! 
*/ + while (!kthread_should_stop()) { + u64 journal_seq = 0; + /* Reset front/back so we can easily sort fifo entries later: */ BUG_ON(fifo_used(&ca->free_inc)); - ca->free_inc.front = ca->free_inc.back = 0; + ca->free_inc.front = ca->free_inc.back = 0; + ca->allocator_journal_seq_flush = 0; + ca->allocator_invalidating_data = false; down_read(&c->gc_lock); + if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) { + up_read(&c->gc_lock); + goto out; + } + while (1) { /* * Find some buckets that we can invalidate, either @@ -947,7 +929,6 @@ static int bch2_allocator_thread(void *arg) */ invalidate_buckets(ca); - trace_alloc_batch(ca, fifo_used(&ca->free_inc), ca->free_inc.size); @@ -980,28 +961,32 @@ static int bch2_allocator_thread(void *arg) spin_unlock(&ca->freelist_lock); /* - * free_inc is full of newly-invalidated buckets, must write out - * prios and gens before they can be re-used + * free_inc is now full of newly-invalidated buckets: next, + * write out the new bucket gens: */ - ret = bch2_prio_write(ca); - if (ret) { - /* - * Emergency read only - allocator thread has to - * shutdown. - * - * N.B. we better be going into RO mode, else - * allocations would hang indefinitely - whatever - * generated the error will have sent us into RO mode. - * - * Clear out the free_inc freelist so things are - * consistent-ish: - */ - spin_lock(&ca->freelist_lock); - while (fifo_pop(&ca->free_inc, bucket)) - bch2_mark_free_bucket(ca, ca->buckets + bucket); - spin_unlock(&ca->freelist_lock); - goto out; + + while (!fifo_empty(&ca->free_inc) && !kthread_should_stop()) { + ret = bch2_invalidate_free_inc(c, ca, &journal_seq); + if (bch2_fs_fatal_err_on(ret < 0, c, + "error invalidating buckets: %i", ret)) + goto err; + + if (ca->allocator_invalidating_data) + bch2_journal_flush_seq(&c->journal, journal_seq); + else if (ca->allocator_journal_seq_flush) + bch2_journal_flush_seq(&c->journal, + ca->allocator_journal_seq_flush); + + while (ret && !kthread_should_stop()) { + BUG_ON(fifo_empty(&ca->free_inc)); + + bucket = fifo_peek(&ca->free_inc); + discard_invalidated_bucket(ca, bucket); + --ret; + } } + + ca->alloc_thread_started = true; } out: /* @@ -1010,50 +995,104 @@ out: */ synchronize_rcu(); return 0; +err: + /* + * Emergency read only - allocator thread has to shutdown. + * + * N.B. we better be going into RO mode, else allocations would hang + * indefinitely - whatever generated the error will have sent us into RO + * mode. 
+ * + * Clear out the free_inc freelist so things are consistent-ish: + */ + spin_lock(&ca->freelist_lock); + while (fifo_pop(&ca->free_inc, bucket)) + bch2_mark_free_bucket(ca, ca->buckets + bucket); + spin_unlock(&ca->freelist_lock); + goto out; } /* Allocation */ +static long bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca) +{ + struct bucket *g; + long r = -1; + + if (!down_read_trylock(&c->gc_lock)) + return r; + + if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) + goto out; + + for_each_bucket(g, ca) + if (!g->mark.touched_this_mount && + is_available_bucket(g->mark) && + bch2_mark_alloc_bucket_startup(ca, g)) { + r = g - ca->buckets; + break; + } +out: + up_read(&c->gc_lock); + return r; +} + /** * bch_bucket_alloc - allocate a single bucket from a specific device * * Returns index of bucket on success, 0 on failure * */ -size_t bch2_bucket_alloc(struct bch_dev *ca, enum alloc_reserve reserve) +long bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, + enum alloc_reserve reserve) { - struct bucket *g; - long r; + size_t r; spin_lock(&ca->freelist_lock); - if (fifo_pop(&ca->free[RESERVE_NONE], r) || - fifo_pop(&ca->free[reserve], r)) + if (likely(fifo_pop(&ca->free[RESERVE_NONE], r))) goto out; + switch (reserve) { + case RESERVE_ALLOC: + if (fifo_pop(&ca->free[RESERVE_BTREE], r)) + goto out; + break; + case RESERVE_BTREE: + if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >= + ca->free[RESERVE_BTREE].size && + fifo_pop(&ca->free[RESERVE_BTREE], r)) + goto out; + break; + case RESERVE_MOVINGGC: + if (fifo_pop(&ca->free[RESERVE_MOVINGGC], r)) + goto out; + break; + default: + break; + } + spin_unlock(&ca->freelist_lock); + if (unlikely(!ca->alloc_thread_started) && + (r = bch2_bucket_alloc_startup(c, ca)) >= 0) { + verify_not_on_freelist(ca, r); + goto out2; + } + trace_bucket_alloc_fail(ca, reserve); - return 0; + return -1; out: verify_not_on_freelist(ca, r); spin_unlock(&ca->freelist_lock); - trace_bucket_alloc(ca, reserve); - bch2_wake_allocator(ca); +out2: + ca->buckets[r].prio[READ] = c->prio_clock[READ].hand; + ca->buckets[r].prio[WRITE] = c->prio_clock[WRITE].hand; - g = ca->buckets + r; - - g->prio[READ] = ca->fs->prio_clock[READ].hand; - g->prio[WRITE] = ca->fs->prio_clock[WRITE].hand; - + trace_bucket_alloc(ca, reserve); return r; } -static void __bch2_bucket_free(struct bch_dev *ca, struct bucket *g) -{ - bch2_mark_free_bucket(ca, g); -} - enum bucket_alloc_ret { ALLOC_SUCCESS, NO_DEVICES, /* -EROFS */ @@ -1116,7 +1155,7 @@ static enum bucket_alloc_ret bch2_bucket_alloc_group(struct bch_fs *c, while (ob->nr_ptrs < nr_replicas) { struct bch_dev *ca; - u64 bucket; + long bucket; if (!available) { ret = NO_DEVICES; @@ -1139,8 +1178,8 @@ static enum bucket_alloc_ret bch2_bucket_alloc_group(struct bch_fs *c, get_random_int() > devs->d[i].weight) continue; - bucket = bch2_bucket_alloc(ca, reserve); - if (!bucket) { + bucket = bch2_bucket_alloc(c, ca, reserve); + if (bucket < 0) { if (fail_idx == -1) fail_idx = i; continue; @@ -1456,7 +1495,6 @@ struct open_bucket *bch2_alloc_sectors_start(struct bch_fs *c, ? 
0 : BTREE_NODE_RESERVE; int ret; - BUG_ON(!reserve); BUG_ON(!nr_replicas); retry: ob = lock_writepoint(c, wp); @@ -1705,7 +1743,9 @@ set_capacity: capacity *= (100 - c->opts.gc_reserve_percent); capacity = div64_u64(capacity, 100); - BUG_ON(capacity + reserved_sectors > total_capacity); + BUG_ON(reserved_sectors > total_capacity); + + capacity = min(capacity, total_capacity - reserved_sectors); c->capacity = capacity; @@ -1725,10 +1765,9 @@ set_capacity: closure_wake_up(&c->freelist_wait); } -static void bch2_stop_write_point(struct bch_dev *ca, - struct write_point *wp) +static void bch2_stop_write_point(struct bch_fs *c, struct bch_dev *ca, + struct write_point *wp) { - struct bch_fs *c = ca->fs; struct open_bucket *ob; struct bch_extent_ptr *ptr; @@ -1750,9 +1789,8 @@ found: bch2_open_bucket_put(c, ob); } -static bool bch2_dev_has_open_write_point(struct bch_dev *ca) +static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) { - struct bch_fs *c = ca->fs; struct bch_extent_ptr *ptr; struct open_bucket *ob; @@ -1773,55 +1811,36 @@ static bool bch2_dev_has_open_write_point(struct bch_dev *ca) } /* device goes ro: */ -void bch2_dev_allocator_stop(struct bch_dev *ca) +void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) { - struct bch_fs *c = ca->fs; struct dev_group *tier = &c->tiers[ca->mi.tier].devs; - struct task_struct *p; struct closure cl; unsigned i; + BUG_ON(ca->alloc_thread); + closure_init_stack(&cl); /* First, remove device from allocation groups: */ + bch2_dev_group_remove(&c->journal.devs, ca); bch2_dev_group_remove(tier, ca); bch2_dev_group_remove(&c->all_devs, ca); - bch2_recalc_capacity(c); - /* - * Stopping the allocator thread comes after removing from allocation - * groups, else pending allocations will hang: - */ - - p = ca->alloc_thread; - ca->alloc_thread = NULL; - smp_wmb(); - - /* - * We need an rcu barrier between setting ca->alloc_thread = NULL and - * the thread shutting down to avoid a race with bch2_usage_update() - - * the allocator thread itself does a synchronize_rcu() on exit. - * - * XXX: it would be better to have the rcu barrier be asynchronous - * instead of blocking us here + * Capacity is calculated based off of devices in allocation groups: */ - if (p) { - kthread_stop(p); - put_task_struct(p); - } + bch2_recalc_capacity(c); /* Next, close write points that point to this device... */ - for (i = 0; i < ARRAY_SIZE(c->write_points); i++) - bch2_stop_write_point(ca, &c->write_points[i]); + bch2_stop_write_point(c, ca, &c->write_points[i]); - bch2_stop_write_point(ca, &ca->copygc_write_point); - bch2_stop_write_point(ca, &c->promote_write_point); - bch2_stop_write_point(ca, &ca->tiering_write_point); - bch2_stop_write_point(ca, &c->migration_write_point); - bch2_stop_write_point(ca, &c->btree_write_point); + bch2_stop_write_point(c, ca, &ca->copygc_write_point); + bch2_stop_write_point(c, ca, &c->promote_write_point); + bch2_stop_write_point(c, ca, &ca->tiering_write_point); + bch2_stop_write_point(c, ca, &c->migration_write_point); + bch2_stop_write_point(c, ca, &c->btree_write_point); mutex_lock(&c->btree_reserve_cache_lock); while (c->btree_reserve_cache_nr) { @@ -1832,9 +1851,16 @@ void bch2_dev_allocator_stop(struct bch_dev *ca) } mutex_unlock(&c->btree_reserve_cache_lock); - /* Avoid deadlocks.. 
*/ - + /* + * Wake up threads that were blocked on allocation, so they can notice + * the device can no longer be removed and the capacity has changed: + */ closure_wake_up(&c->freelist_wait); + + /* + * journal_res_get() can block waiting for free space in the journal - + * it needs to notice there may not be devices to allocate from anymore: + */ wake_up(&c->journal.wait); /* Now wait for any in flight writes: */ @@ -1842,7 +1868,7 @@ void bch2_dev_allocator_stop(struct bch_dev *ca) while (1) { closure_wait(&c->open_buckets_wait, &cl); - if (!bch2_dev_has_open_write_point(ca)) { + if (!bch2_dev_has_open_write_point(c, ca)) { closure_wake_up(&c->open_buckets_wait); break; } @@ -1851,32 +1877,15 @@ void bch2_dev_allocator_stop(struct bch_dev *ca) } } -/* - * Startup the allocator thread for transition to RW mode: - */ -int bch2_dev_allocator_start(struct bch_dev *ca) +/* device goes rw: */ +void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) { - struct bch_fs *c = ca->fs; struct dev_group *tier = &c->tiers[ca->mi.tier].devs; struct bch_sb_field_journal *journal_buckets; bool has_journal; - struct task_struct *k; - /* - * allocator thread already started? - */ - if (ca->alloc_thread) - return 0; - - k = kthread_create(bch2_allocator_thread, ca, "bcache_allocator"); - if (IS_ERR(k)) - return 0; - - get_task_struct(k); - ca->alloc_thread = k; - - bch2_dev_group_add(tier, ca); bch2_dev_group_add(&c->all_devs, ca); + bch2_dev_group_add(tier, ca); mutex_lock(&c->sb_lock); journal_buckets = bch2_sb_get_journal(ca->disk_sb.sb); @@ -1886,15 +1895,44 @@ int bch2_dev_allocator_start(struct bch_dev *ca) if (has_journal) bch2_dev_group_add(&c->journal.devs, ca); +} - bch2_recalc_capacity(c); +/* stop allocator thread: */ +void bch2_dev_allocator_stop(struct bch_dev *ca) +{ + struct task_struct *p = ca->alloc_thread; + + ca->alloc_thread = NULL; + smp_wmb(); + + /* + * We need an rcu barrier between setting ca->alloc_thread = NULL and + * the thread shutting down to avoid a race with bch2_usage_update() - + * the allocator thread itself does a synchronize_rcu() on exit. + * + * XXX: it would be better to have the rcu barrier be asynchronous + * instead of blocking us here + */ + if (p) + kthread_stop(p); +} + +/* start allocator thread: */ +int bch2_dev_allocator_start(struct bch_dev *ca) +{ + struct task_struct *p; /* - * Don't wake up allocator thread until after adding device to - * allocator groups - otherwise, alloc thread could get a spurious - * -EROFS due to prio_write() -> journal_meta() not finding any devices: + * allocator thread already started? 
*/ - wake_up_process(k); + if (ca->alloc_thread) + return 0; + + p = kthread_run(bch2_allocator_thread, ca, "bcache_allocator"); + if (IS_ERR(p)) + return PTR_ERR(p); + + ca->alloc_thread = p; return 0; } diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h index 195108c..cfd1c8e 100644 --- a/libbcachefs/alloc.h +++ b/libbcachefs/alloc.h @@ -10,24 +10,14 @@ struct bch_dev; struct bch_fs; struct dev_group; -static inline size_t prios_per_bucket(const struct bch_dev *ca) -{ - return (bucket_bytes(ca) - sizeof(struct prio_set)) / - sizeof(struct bucket_disk); -} - -static inline size_t prio_buckets(const struct bch_dev *ca) -{ - return DIV_ROUND_UP((size_t) (ca)->mi.nbuckets, prios_per_bucket(ca)); -} - void bch2_dev_group_remove(struct dev_group *, struct bch_dev *); void bch2_dev_group_add(struct dev_group *, struct bch_dev *); -int bch2_prio_read(struct bch_dev *); -int bch2_prio_write(struct bch_dev *); +int bch2_alloc_read(struct bch_fs *, struct list_head *); +int bch2_alloc_write(struct bch_fs *, struct bch_dev *, u64 *); +int bch2_alloc_replay_key(struct bch_fs *, struct bpos); -size_t bch2_bucket_alloc(struct bch_dev *, enum alloc_reserve); +long bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve); void bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); @@ -80,8 +70,15 @@ static inline struct bch_dev *dev_group_next(struct dev_group *devs, (_ptr)++) void bch2_recalc_capacity(struct bch_fs *); + +void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); +void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); + void bch2_dev_allocator_stop(struct bch_dev *); int bch2_dev_allocator_start(struct bch_dev *); + void bch2_fs_allocator_init(struct bch_fs *); +extern const struct bkey_ops bch2_bkey_alloc_ops; + #endif /* _BCACHE_ALLOC_H */ diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h index ae58d08..ce3a919 100644 --- a/libbcachefs/alloc_types.h +++ b/libbcachefs/alloc_types.h @@ -35,20 +35,13 @@ struct prio_clock { /* There is one reserve for each type of btree, one for prios and gens * and one for moving GC */ enum alloc_reserve { - RESERVE_PRIO, - RESERVE_BTREE, - RESERVE_METADATA_LAST = RESERVE_BTREE, - RESERVE_MOVINGGC, - - RESERVE_NONE, - RESERVE_NR, + RESERVE_ALLOC = -1, + RESERVE_BTREE = 0, + RESERVE_MOVINGGC = 1, + RESERVE_NONE = 2, + RESERVE_NR = 3, }; -static inline bool allocation_is_metadata(enum alloc_reserve id) -{ - return id <= RESERVE_METADATA_LAST; -} - struct dev_group { spinlock_t lock; unsigned nr; diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 977ac36..ab99af7 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -305,7 +305,7 @@ do { \ (btree_reserve_required_nodes(BTREE_MAX_DEPTH) + GC_MERGE_NODES) /* Size of the freelist we allocate btree nodes from: */ -#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 2) +#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4) struct btree; struct crypto_blkcipher; @@ -329,13 +329,23 @@ struct bch_member_cpu { u16 bucket_size; /* sectors */ u8 state; u8 tier; - u8 has_metadata; - u8 has_data; u8 replacement; u8 discard; u8 valid; }; +struct bch_replicas_cpu_entry { + u8 data_type; + u8 devs[BCH_SB_MEMBERS_MAX / 8]; +}; + +struct bch_replicas_cpu { + struct rcu_head rcu; + unsigned nr; + unsigned entry_size; + struct bch_replicas_cpu_entry entries[]; +}; + struct bch_dev { struct kobject kobj; struct percpu_ref ref; @@ -363,21 +373,7 @@ struct bch_dev { struct task_struct *alloc_thread; - struct prio_set *disk_buckets; - - /* - * When 
allocating new buckets, prio_write() gets first dibs - since we - * may not be allocate at all without writing priorities and gens. - * prio_last_buckets[] contains the last buckets we wrote priorities to - * (so gc can mark them as metadata). - */ - u64 *prio_buckets; - u64 *prio_last_buckets; - spinlock_t prio_buckets_lock; - struct bio *bio_prio; - bool prio_read_done; - bool need_prio_write; - struct mutex prio_write_lock; + bool need_alloc_write; /* * free: Buckets that are ready to be used @@ -391,6 +387,7 @@ struct bch_dev { DECLARE_FIFO(long, free)[RESERVE_NR]; DECLARE_FIFO(long, free_inc); spinlock_t freelist_lock; + bool alloc_thread_started; size_t fifo_last_bucket; @@ -415,6 +412,8 @@ struct bch_dev { atomic_long_t saturated_count; size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; + u64 allocator_journal_seq_flush; + bool allocator_invalidating_data; alloc_heap alloc_heap; bucket_heap copygc_heap; @@ -458,6 +457,7 @@ enum { BCH_FS_FSCK_FIXED_ERRORS, BCH_FS_FSCK_DONE, BCH_FS_FIXED_GENS, + BCH_FS_REBUILD_REPLICAS, }; struct btree_debug { @@ -507,6 +507,10 @@ struct bch_fs { struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; + struct bch_replicas_cpu __rcu *replicas; + struct bch_replicas_cpu __rcu *replicas_gc; + struct mutex replicas_gc_lock; + struct bch_opts opts; /* Updated by bch2_sb_update():*/ @@ -520,9 +524,6 @@ struct bch_fs { u8 nr_devices; u8 clean; - u8 meta_replicas_have; - u8 data_replicas_have; - u8 str_hash_type; u8 encryption_type; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 2d64bca..3f6d51a 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -2,7 +2,7 @@ #define _BCACHEFS_FORMAT_H /* - * Bcache on disk data structures + * bcachefs on disk data structures */ #include @@ -714,6 +714,25 @@ struct bch_xattr { } __attribute__((packed, aligned(8))); BKEY_VAL_TYPE(xattr, BCH_XATTR); +/* Bucket/allocation information: */ + +enum { + BCH_ALLOC = 128, +}; + +enum { + BCH_ALLOC_FIELD_READ_TIME = 0, + BCH_ALLOC_FIELD_WRITE_TIME = 1, +}; + +struct bch_alloc { + struct bch_val v; + __u8 fields; + __u8 gen; + __u8 data[]; +} __attribute__((packed, aligned(8))); +BKEY_VAL_TYPE(alloc, BCH_ALLOC); + /* Superblock */ /* Version 0: Cache device @@ -752,8 +771,7 @@ struct bch_member { LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) LE64_BITMASK(BCH_MEMBER_TIER, struct bch_member, flags[0], 4, 8) -LE64_BITMASK(BCH_MEMBER_HAS_METADATA, struct bch_member, flags[0], 8, 9) -LE64_BITMASK(BCH_MEMBER_HAS_DATA, struct bch_member, flags[0], 9, 10) +/* 8-10 unused, was HAS_(META)DATA */ LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14) LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15); @@ -800,7 +818,8 @@ enum bch_sb_field_type { BCH_SB_FIELD_journal = 0, BCH_SB_FIELD_members = 1, BCH_SB_FIELD_crypt = 2, - BCH_SB_FIELD_NR = 3, + BCH_SB_FIELD_replicas = 3, + BCH_SB_FIELD_NR = 4, }; struct bch_sb_field_journal { @@ -861,8 +880,24 @@ LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); -struct bch_sb_field_replication { +enum bch_data_types { + BCH_DATA_NONE = 0, + BCH_DATA_SB = 1, + BCH_DATA_JOURNAL = 2, + BCH_DATA_BTREE = 3, + BCH_DATA_USER = 4, + BCH_DATA_NR = 5, +}; + +struct bch_replicas_entry { + u8 data_type; + u8 nr; + u8 devs[0]; +}; + +struct bch_sb_field_replicas { struct 
bch_sb_field field; + struct bch_replicas_entry entries[0]; }; /* @@ -937,8 +972,7 @@ LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48); LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52); LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56); -LE64_BITMASK(BCH_SB_META_REPLICAS_HAVE, struct bch_sb, flags[0], 56, 60); -LE64_BITMASK(BCH_SB_DATA_REPLICAS_HAVE, struct bch_sb, flags[0], 60, 64); +/* 56-64 unused, was REPLICAS_HAVE */ LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8); @@ -946,6 +980,7 @@ LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14); + /* 14-20 unused, was JOURNAL_ENTRY_SIZE */ LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24); @@ -1003,77 +1038,6 @@ enum bch_compression_opts { BCH_COMPRESSION_NR = 3, }; -/* backing device specific stuff: */ - -struct backingdev_sb { - __le64 csum; - __le64 offset; /* sector where this sb was written */ - __le64 version; /* of on disk format */ - - uuid_le magic; /* bcachefs superblock UUID */ - - uuid_le disk_uuid; - - /* - * Internal cache set UUID - xored with various magic numbers and thus - * must never change: - */ - union { - uuid_le set_uuid; - __le64 set_magic; - }; - __u8 label[BCH_SB_LABEL_SIZE]; - - __le64 flags; - - /* Incremented each time superblock is written: */ - __le64 seq; - - /* - * User visible UUID for identifying the cache set the user is allowed - * to change: - * - * XXX hooked up? - */ - uuid_le user_uuid; - __le64 pad1[6]; - - __le64 data_offset; - __le16 block_size; /* sectors */ - __le16 pad2[3]; - - __le32 last_mount; /* time_t */ - __le16 pad3; - /* size of variable length portion - always 0 for backingdev superblock */ - __le16 u64s; - __u64 _data[0]; -}; - -LE64_BITMASK(BDEV_CACHE_MODE, struct backingdev_sb, flags, 0, 4); -#define CACHE_MODE_WRITETHROUGH 0U -#define CACHE_MODE_WRITEBACK 1U -#define CACHE_MODE_WRITEAROUND 2U -#define CACHE_MODE_NONE 3U - -LE64_BITMASK(BDEV_STATE, struct backingdev_sb, flags, 61, 63); -#define BDEV_STATE_NONE 0U -#define BDEV_STATE_CLEAN 1U -#define BDEV_STATE_DIRTY 2U -#define BDEV_STATE_STALE 3U - -#define BDEV_DATA_START_DEFAULT 16 /* sectors */ - -static inline _Bool __SB_IS_BDEV(__u64 version) -{ - return version == BCACHE_SB_VERSION_BDEV - || version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; -} - -static inline _Bool SB_IS_BDEV(const struct bch_sb *sb) -{ - return __SB_IS_BDEV(sb->version); -} - /* * Magic numbers * @@ -1088,7 +1052,6 @@ static inline _Bool SB_IS_BDEV(const struct bch_sb *sb) #define BCACHE_STATFS_MAGIC 0xca451a4e #define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL) -#define PSET_MAGIC __cpu_to_le64(0x6750e15f87337f91ULL) #define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL) static inline __le64 __bch2_sb_magic(struct bch_sb *sb) @@ -1103,11 +1066,6 @@ static inline __u64 __jset_magic(struct bch_sb *sb) return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC); } -static inline __u64 __pset_magic(struct bch_sb *sb) -{ - return __le64_to_cpu(__bch2_sb_magic(sb) ^ PSET_MAGIC); -} - static inline __u64 __bset_magic(struct bch_sb *sb) { return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC); @@ -1136,9 +1094,9 @@ struct jset_entry { LE32_BITMASK(JOURNAL_ENTRY_TYPE, struct jset_entry, flags, 0, 8); enum { - JOURNAL_ENTRY_BTREE_KEYS = 0, - 
JOURNAL_ENTRY_BTREE_ROOT = 1, - JOURNAL_ENTRY_PRIO_PTRS = 2, + JOURNAL_ENTRY_BTREE_KEYS = 0, + JOURNAL_ENTRY_BTREE_ROOT = 1, + JOURNAL_ENTRY_PRIO_PTRS = 2, /* Obsolete */ /* * Journal sequence numbers can be blacklisted: bsets record the max @@ -1150,7 +1108,7 @@ enum { * and then record that we skipped it so that the next time we crash and * recover we don't think there was a missing journal entry. */ - JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED = 3, + JOURNAL_ENTRY_JOURNAL_SEQ_BLACKLISTED = 3, }; /* @@ -1193,35 +1151,14 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); #define BCH_JOURNAL_BUCKETS_MIN 20 -/* Bucket prios/gens */ - -struct prio_set { - struct bch_csum csum; - - __le64 magic; - __le32 nonce[3]; - __le16 version; - __le16 flags; - - __u8 encrypted_start[0]; - - __le64 next_bucket; - - struct bucket_disk { - __le16 prio[2]; - __u8 gen; - } __attribute__((packed)) data[]; -} __attribute__((packed, aligned(8))); - -LE32_BITMASK(PSET_CSUM_TYPE, struct prio_set, flags, 0, 4); - /* Btree: */ #define DEFINE_BCH_BTREE_IDS() \ - DEF_BTREE_ID(EXTENTS, 0, "extents") \ - DEF_BTREE_ID(INODES, 1, "inodes") \ - DEF_BTREE_ID(DIRENTS, 2, "dirents") \ - DEF_BTREE_ID(XATTRS, 3, "xattrs") + DEF_BTREE_ID(EXTENTS, 0, "extents") \ + DEF_BTREE_ID(INODES, 1, "inodes") \ + DEF_BTREE_ID(DIRENTS, 2, "dirents") \ + DEF_BTREE_ID(XATTRS, 3, "xattrs") \ + DEF_BTREE_ID(ALLOC, 4, "alloc") #define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val, @@ -1318,4 +1255,33 @@ struct btree_node_entry { }; } __attribute__((packed, aligned(8))); +/* Obsolete: */ + +struct prio_set { + struct bch_csum csum; + + __le64 magic; + __le32 nonce[3]; + __le16 version; + __le16 flags; + + __u8 encrypted_start[0]; + + __le64 next_bucket; + + struct bucket_disk { + __le16 prio[2]; + __u8 gen; + } __attribute__((packed)) data[]; +} __attribute__((packed, aligned(8))); + +LE32_BITMASK(PSET_CSUM_TYPE, struct prio_set, flags, 0, 4); + +#define PSET_MAGIC __cpu_to_le64(0x6750e15f87337f91ULL) + +static inline __u64 __pset_magic(struct bch_sb *sb) +{ + return __le64_to_cpu(__bch2_sb_magic(sb) ^ PSET_MAGIC); +} + #endif /* _BCACHEFS_FORMAT_H */ diff --git a/libbcachefs/bcachefs_ioctl.h b/libbcachefs/bcachefs_ioctl.h index 22d6845..5bdbbe6 100644 --- a/libbcachefs/bcachefs_ioctl.h +++ b/libbcachefs/bcachefs_ioctl.h @@ -1,13 +1,9 @@ -#ifndef _LINUX_BCACHE_IOCTL_H -#define _LINUX_BCACHE_IOCTL_H +#ifndef _BCACHEFS_IOCTL_H +#define _BCACHEFS_IOCTL_H #include #include "bcachefs_format.h" -#ifdef __cplusplus -extern "C" { -#endif - #define BCH_FORCE_IF_DATA_LOST (1 << 0) #define BCH_FORCE_IF_METADATA_LOST (1 << 1) #define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) @@ -97,8 +93,4 @@ struct bch_ioctl_data { __u64 end_offset; }; -#ifdef __cplusplus -} -#endif - -#endif /* _LINUX_BCACHE_IOCTL_H */ +#endif /* _BCACHEFS_IOCTL_H */ diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index 1383c96..0511e1f 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -580,6 +580,8 @@ BKEY_VAL_ACCESSORS(dirent, BCH_DIRENT); BKEY_VAL_ACCESSORS(xattr, BCH_XATTR); +BKEY_VAL_ACCESSORS(alloc, BCH_ALLOC); + /* byte order helpers */ #if !defined(__LITTLE_ENDIAN) && !defined(__BIG_ENDIAN) diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index cd9a60c..dbec8b3 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "bkey_methods.h" #include "btree_types.h" +#include "alloc.h" #include "dirent.h" #include "error.h" #include "extents.h" @@ -13,6 +14,7 @@ const struct bkey_ops 
*bch2_bkey_ops[] = { [BKEY_TYPE_INODES] = &bch2_bkey_inode_ops, [BKEY_TYPE_DIRENTS] = &bch2_bkey_dirent_ops, [BKEY_TYPE_XATTRS] = &bch2_bkey_xattr_ops, + [BKEY_TYPE_ALLOC] = &bch2_bkey_alloc_ops, [BKEY_TYPE_BTREE] = &bch2_bkey_btree_ops, }; diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 78132e4..815260b 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -129,6 +129,8 @@ static u8 bch2_btree_mark_key(struct bch_fs *c, enum bkey_type type, int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, struct bkey_s_c k) { + enum bch_data_types data_type = type == BKEY_TYPE_BTREE + ? BCH_DATA_BTREE : BCH_DATA_USER; int ret = 0; switch (k.k->type) { @@ -137,6 +139,15 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const struct bch_extent_ptr *ptr; + if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || + (!c->opts.nofsck && + fsck_err_on(!bch2_sb_has_replicas(c, e, data_type), c, + "superblock not marked as containing replicas"))) { + ret = bch2_check_mark_super(c, e, data_type); + if (ret) + return ret; + } + extent_for_each_ptr(e, ptr) { struct bch_dev *ca = c->devs[ptr->dev]; struct bucket *g = PTR_BUCKET(ca, ptr); @@ -147,7 +158,7 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, new.gen = ptr->gen; new.gen_valid = 1; })); - ca->need_prio_write = true; + ca->need_alloc_write = true; } if (fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, @@ -159,7 +170,7 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, new.gen = ptr->gen; new.gen_valid = 1; })); - ca->need_prio_write = true; + ca->need_alloc_write = true; set_bit(BCH_FS_FIXED_GENS, &c->flags); } @@ -168,6 +179,7 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, } } + atomic64_set(&c->key_version, max_t(u64, k.k->version.lo, atomic64_read(&c->key_version))); @@ -348,17 +360,6 @@ void bch2_mark_dev_metadata(struct bch_fs *c, struct bch_dev *ca) } spin_unlock(&c->journal.lock); - - spin_lock(&ca->prio_buckets_lock); - - for (i = 0; i < prio_buckets(ca) * 2; i++) { - b = ca->prio_buckets[i]; - if (b) - bch2_mark_metadata_bucket(ca, ca->buckets + b, - BUCKET_PRIOS, true); - } - - spin_unlock(&ca->prio_buckets_lock); } static void bch2_mark_metadata(struct bch_fs *c) @@ -474,10 +475,6 @@ void bch2_gc(struct bch_fs *c) * move around - if references move backwards in the ordering GC * uses, GC could skip past them */ - - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) - return; - trace_gc_start(c); /* @@ -487,6 +484,8 @@ void bch2_gc(struct bch_fs *c) bch2_recalc_sectors_available(c); down_write(&c->gc_lock); + if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) + goto out; bch2_gc_start(c); @@ -502,8 +501,7 @@ void bch2_gc(struct bch_fs *c) if (ret) { bch_err(c, "btree gc failed: %d", ret); set_bit(BCH_FS_GC_FAILURE, &c->flags); - up_write(&c->gc_lock); - return; + goto out; } gc_pos_set(c, gc_phase(c->gc_pos.phase + 1)); @@ -518,7 +516,7 @@ void bch2_gc(struct bch_fs *c) /* Indicates that gc is no longer in progress: */ gc_pos_set(c, gc_phase(GC_PHASE_DONE)); c->gc_count++; - +out: up_write(&c->gc_lock); trace_gc_end(c); bch2_time_stats_update(&c->btree_gc_time, start_time); @@ -529,6 +527,12 @@ void bch2_gc(struct bch_fs *c) */ for_each_member_device(ca, c, i) bch2_wake_allocator(ca); + + /* + * At startup, allocations can happen directly instead of via the + * allocator thread - issue wakeup in case they blocked on gc_lock: + */ + closure_wake_up(&c->freelist_wait); 
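	/*
	 * (That direct path is bch2_bucket_alloc_startup(), which only
	 * down_read_trylock()s gc_lock and so bails out while gc is running;
	 * this wakeup runs after gc_lock has been dropped above.)
	 */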
} /* Btree coalescing */ @@ -997,6 +1001,14 @@ int bch2_initial_gc(struct bch_fs *c, struct list_head *journal) unsigned iter = 0; enum btree_id id; int ret; + + mutex_lock(&c->sb_lock); + if (!bch2_sb_get_replicas(c->disk_sb)) { + if (BCH_SB_INITIALIZED(c->disk_sb)) + bch_info(c, "building replicas info"); + set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); + } + mutex_unlock(&c->sb_lock); again: bch2_gc_start(c); @@ -1006,11 +1018,9 @@ again: return ret; } - if (journal) { - ret = bch2_journal_mark(c, journal); - if (ret) - return ret; - } + ret = bch2_journal_mark(c, journal); + if (ret) + return ret; bch2_mark_metadata(c); diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 1846948..571a814 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1402,7 +1402,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, ret = validate_bset(c, b, ptr, i, sectors, &whiteout_u64s, WRITE); if (ret) - bch2_fatal_error(c); + bch2_inconsistent_error(c); return ret; } diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c index 8a4ee6d..9794ac3 100644 --- a/libbcachefs/btree_update.c +++ b/libbcachefs/btree_update.c @@ -233,17 +233,29 @@ void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *b) } static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, - bool use_reserve, - struct disk_reservation *res, - struct closure *cl) + struct disk_reservation *res, + struct closure *cl, + unsigned flags) { BKEY_PADDED(k) tmp; struct open_bucket *ob; struct btree *b; - unsigned reserve = use_reserve ? 0 : BTREE_NODE_RESERVE; + unsigned nr_reserve; + enum alloc_reserve alloc_reserve; + + if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) { + nr_reserve = 0; + alloc_reserve = RESERVE_ALLOC; + } else if (flags & BTREE_INSERT_USE_RESERVE) { + nr_reserve = BTREE_NODE_RESERVE / 2; + alloc_reserve = RESERVE_BTREE; + } else { + nr_reserve = BTREE_NODE_RESERVE; + alloc_reserve = RESERVE_NONE; + } mutex_lock(&c->btree_reserve_cache_lock); - if (c->btree_reserve_cache_nr > reserve) { + if (c->btree_reserve_cache_nr > nr_reserve) { struct btree_alloc *a = &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; @@ -263,8 +275,7 @@ retry: bkey_i_to_extent(&tmp.k), res->nr_replicas, c->opts.metadata_replicas_required, - use_reserve ? RESERVE_BTREE : RESERVE_NONE, - cl); + alloc_reserve, cl); if (IS_ERR(ob)) return ERR_CAST(ob); @@ -311,7 +322,7 @@ static struct btree *bch2_btree_node_alloc(struct bch_fs *c, bch2_btree_build_aux_trees(b); - bch2_check_mark_super(c, &b->key, true); + bch2_check_mark_super(c, bkey_i_to_s_c_extent(&b->key), BCH_DATA_BTREE); trace_btree_node_alloc(c, b); return b; @@ -533,9 +544,6 @@ static struct btree_reserve *__bch2_btree_reserve_get(struct bch_fs *c, if (flags & BTREE_INSERT_NOFAIL) disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL; - if (flags & BTREE_INSERT_NOWAIT) - cl = NULL; - /* * This check isn't necessary for correctness - it's just to potentially * prevent us from doing a lot of work that'll end up being wasted: @@ -565,8 +573,9 @@ static struct btree_reserve *__bch2_btree_reserve_get(struct bch_fs *c, reserve->nr = 0; while (reserve->nr < nr_nodes) { - b = __bch2_btree_node_alloc(c, flags & BTREE_INSERT_USE_RESERVE, - &disk_res, cl); + b = __bch2_btree_node_alloc(c, &disk_res, + flags & BTREE_INSERT_NOWAIT + ? 
NULL : cl, flags); if (IS_ERR(b)) { ret = PTR_ERR(b); goto err_free; @@ -793,8 +802,8 @@ void bch2_btree_journal_key(struct btree_insert *trans, struct btree_write *w = btree_current_write(b); EBUG_ON(iter->level || b->level); - EBUG_ON(!trans->journal_res.ref && - test_bit(JOURNAL_REPLAY_DONE, &j->flags)); + EBUG_ON(trans->journal_res.ref != + !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); if (!journal_pin_active(&w->journal)) bch2_journal_pin_add(j, &trans->journal_res, @@ -1026,6 +1035,27 @@ retry: */ six_unlock_read(&b->lock); mutex_unlock(&c->btree_interior_update_lock); + + /* + * Bit of funny circularity going on here we have to break: + * + * We have to drop our journal pin before writing the journal + * entry that points to the new btree root: else, we could + * deadlock if the journal currently happens to be full. + * + * This mean we're dropping the journal pin _before_ the new + * nodes are technically reachable - but this is safe, because + * after the bch2_btree_set_root_ondisk() call above they will + * be reachable as of the very next journal write: + */ + bch2_journal_pin_drop(&c->journal, &as->journal); + + /* + * And, do a journal write to write the pointer to the new root, + * then wait for it to complete before freeing the nodes we + * replaced: + */ + bch2_journal_meta_async(&c->journal, cl); break; } @@ -1051,19 +1081,70 @@ static void btree_interior_update_updated_btree(struct bch_fs *c, mutex_unlock(&c->btree_interior_update_lock); + /* + * In general, when you're staging things in a journal that will later + * be written elsewhere, and you also want to guarantee ordering: that + * is, if you have updates a, b, c, after a crash you should never see c + * and not a or b - there's a problem: + * + * If the final destination of the update(s) (i.e. btree node) can be + * written/flushed _before_ the relevant journal entry - oops, that + * breaks ordering, since the various leaf nodes can be written in any + * order. + * + * Normally we use bset->journal_seq to deal with this - if during + * recovery we find a btree node write that's newer than the newest + * journal entry, we just ignore it - we don't need it, anything we're + * supposed to have (that we reported as completed via fsync()) will + * still be in the journal, and as far as the state of the journal is + * concerned that btree node write never happened. + * + * That breaks when we're rewriting/splitting/merging nodes, since we're + * mixing btree node writes that haven't happened yet with previously + * written data that has been reported as completed to the journal. + * + * Thus, before making the new nodes reachable, we have to wait the + * newest journal sequence number we have data for to be written (if it + * hasn't been yet). 
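+ *
+ * (Concretely: the bch2_journal_wait_on_seq() call below holds off
+ * btree_interior_update_nodes_written() until the journal has been
+ * written up to as->journal_seq, so recovery can never see the new
+ * nodes without the updates that produced them.)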
+ */ bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl); continue_at(&as->cl, btree_interior_update_nodes_written, system_freezable_wq); } -static void btree_interior_update_reparent(struct btree_interior_update *as, +static void interior_update_flush(struct journal *j, + struct journal_entry_pin *pin, u64 seq) +{ + struct btree_interior_update *as = + container_of(pin, struct btree_interior_update, journal); + + bch2_journal_flush_seq_async(j, as->journal_seq, NULL); +} + +static void btree_interior_update_reparent(struct bch_fs *c, + struct btree_interior_update *as, struct btree_interior_update *child) { child->b = NULL; child->mode = BTREE_INTERIOR_UPDATING_AS; child->parent_as = as; closure_get(&as->cl); + + /* + * When we write a new btree root, we have to drop our journal pin + * _before_ the new nodes are technically reachable; see + * btree_interior_update_nodes_written(). + * + * This goes for journal pins that are recursively blocked on us - so, + * just transfer the journal pin to the new interior update so + * btree_interior_update_nodes_written() can drop it. + */ + bch2_journal_pin_add_if_older(&c->journal, &child->journal, + &as->journal, interior_update_flush); + bch2_journal_pin_drop(&c->journal, &child->journal); + + as->journal_seq = max(as->journal_seq, child->journal_seq); } static void btree_interior_update_updated_root(struct bch_fs *c, @@ -1081,7 +1162,7 @@ static void btree_interior_update_updated_root(struct bch_fs *c, * btree_interior_update operation to point to us: */ if (r->as) - btree_interior_update_reparent(as, r->as); + btree_interior_update_reparent(c, as, r->as); as->mode = BTREE_INTERIOR_UPDATING_ROOT; as->b = r->b; @@ -1089,19 +1170,21 @@ static void btree_interior_update_updated_root(struct bch_fs *c, mutex_unlock(&c->btree_interior_update_lock); + /* + * When we're rewriting nodes and updating interior nodes, there's an + * issue with updates that haven't been written in the journal getting + * mixed together with older data - see * btree_interior_update_updated_btree() + * for the explanation. + * + * However, this doesn't affect us when we're writing a new btree root - + * because to make that new root reachable we have to write out a new + * journal entry, which must necessarily be newer than as->journal_seq. 
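+ *
+ * (Hence, unlike btree_interior_update_updated_btree() above, no
+ * bch2_journal_wait_on_seq() is needed here before continue_at().)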
+ */ + continue_at(&as->cl, btree_interior_update_nodes_written, system_freezable_wq); } -static void interior_update_flush(struct journal *j, - struct journal_entry_pin *pin, u64 seq) -{ - struct btree_interior_update *as = - container_of(pin, struct btree_interior_update, journal); - - bch2_journal_flush_seq_async(j, as->journal_seq, NULL); -} - /* * @b is being split/rewritten: it may have pointers to not-yet-written btree * nodes and thus outstanding btree_interior_updates - redirect @b's @@ -1150,7 +1233,7 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c, */ list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { list_del(&p->write_blocked_list); - btree_interior_update_reparent(as, p); + btree_interior_update_reparent(c, as, p); } clear_btree_node_dirty(b); diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index 7c4abe4..b5cfa89 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -373,16 +373,20 @@ int __bch2_btree_insert_at(struct btree_insert *); /* for copygc, or when merging btree nodes */ #define BTREE_INSERT_USE_RESERVE (1 << 2) +#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << 3) /* * Insert is for journal replay: don't get journal reservations, or mark extents * (bch_mark_key) */ -#define BTREE_INSERT_JOURNAL_REPLAY (1 << 3) +#define BTREE_INSERT_JOURNAL_REPLAY (1 << 4) /* Don't block on allocation failure (for new btree nodes: */ -#define BTREE_INSERT_NOWAIT (1 << 4) -#define BTREE_INSERT_GC_LOCK_HELD (1 << 5) +#define BTREE_INSERT_NOWAIT (1 << 5) +#define BTREE_INSERT_GC_LOCK_HELD (1 << 6) + +#define BCH_HASH_SET_MUST_CREATE (1 << 7) +#define BCH_HASH_SET_MUST_REPLACE (1 << 8) int bch2_btree_delete_at(struct btree_iter *, unsigned); diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 1c2f692..e522705 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -306,14 +306,18 @@ static void bch2_dev_usage_update(struct bch_dev *ca, _old; \ }) -void bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g) +bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g, + struct bucket_mark *old) { struct bch_fs_usage stats = { 0 }; - struct bucket_mark old, new; + struct bucket_mark new; + + *old = bucket_data_cmpxchg(ca, g, new, ({ + if (!is_available_bucket(new)) + return false; - old = bucket_data_cmpxchg(ca, g, new, ({ new.owned_by_allocator = 1; - new.had_metadata = 0; + new.touched_this_mount = 1; new.data_type = 0; new.cached_sectors = 0; new.dirty_sectors = 0; @@ -321,11 +325,28 @@ void bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g) })); /* XXX: we're not actually updating fs usage's cached sectors... 
*/ - bch2_fs_usage_update(&stats, old, new); + bch2_fs_usage_update(&stats, *old, new); - if (!old.owned_by_allocator && old.cached_sectors) + if (!old->owned_by_allocator && old->cached_sectors) trace_invalidate(ca, g - ca->buckets, - old.cached_sectors); + old->cached_sectors); + return true; +} + +bool bch2_mark_alloc_bucket_startup(struct bch_dev *ca, struct bucket *g) +{ + struct bucket_mark new, old; + + old = bucket_data_cmpxchg(ca, g, new, ({ + if (new.touched_this_mount || + !is_available_bucket(new)) + return false; + + new.owned_by_allocator = 1; + new.touched_this_mount = 1; + })); + + return true; } void bch2_mark_free_bucket(struct bch_dev *ca, struct bucket *g) @@ -333,6 +354,7 @@ void bch2_mark_free_bucket(struct bch_dev *ca, struct bucket *g) struct bucket_mark old, new; old = bucket_data_cmpxchg(ca, g, new, ({ + new.touched_this_mount = 1; new.owned_by_allocator = 0; new.data_type = 0; new.cached_sectors = 0; @@ -348,7 +370,8 @@ void bch2_mark_alloc_bucket(struct bch_dev *ca, struct bucket *g, struct bucket_mark new; bucket_data_cmpxchg(ca, g, new, ({ - new.owned_by_allocator = owned_by_allocator; + new.touched_this_mount = 1; + new.owned_by_allocator = owned_by_allocator; })); } @@ -376,8 +399,8 @@ void bch2_mark_metadata_bucket(struct bch_dev *ca, struct bucket *g, old = bucket_data_cmpxchg(ca, g, new, ({ saturated_add(ca, new.dirty_sectors, ca->mi.bucket_size, GC_MAX_SECTORS_USED); - new.data_type = type; - new.had_metadata = 1; + new.data_type = type; + new.touched_this_mount = 1; })); if (old.data_type != type && @@ -458,8 +481,9 @@ static void bch2_mark_pointer(struct bch_fs *c, if (gc_will_visit) { if (journal_seq) bucket_cmpxchg(g, new, ({ - new.journal_seq_valid = 1; - new.journal_seq = journal_seq; + new.touched_this_mount = 1; + new.journal_seq_valid = 1; + new.journal_seq = journal_seq; })); goto out; @@ -479,11 +503,6 @@ static void bch2_mark_pointer(struct bch_fs *c, return; } - EBUG_ON(type != S_CACHED && - !may_make_unavailable && - is_available_bucket(new) && - test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)); - if (type != S_CACHED && new.dirty_sectors == GC_MAX_SECTORS_USED && disk_sectors < 0) @@ -508,7 +527,7 @@ static void bch2_mark_pointer(struct bch_fs *c, new.data_type = data_type; } - new.had_metadata |= is_meta_bucket(new); + new.touched_this_mount = 1; })); if (old.data_type != data_type && diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index f99a62b..37eb471 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -191,7 +191,9 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m, void bch2_bucket_seq_cleanup(struct bch_fs *); -void bch2_invalidate_bucket(struct bch_dev *, struct bucket *); +bool bch2_invalidate_bucket(struct bch_dev *, struct bucket *, + struct bucket_mark *); +bool bch2_mark_alloc_bucket_startup(struct bch_dev *, struct bucket *); void bch2_mark_free_bucket(struct bch_dev *, struct bucket *); void bch2_mark_alloc_bucket(struct bch_dev *, struct bucket *, bool); void bch2_mark_metadata_bucket(struct bch_dev *, struct bucket *, diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 3c8b644..c25c9fa 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -3,6 +3,7 @@ #include "util.h" +/* kill, switch to bch_data_types */ enum bucket_data_type { BUCKET_DATA = 0, BUCKET_BTREE, @@ -19,23 +20,12 @@ struct bucket_mark { struct { u8 gen; - - unsigned gen_valid:1; - unsigned journal_seq_valid:1; - - /* - * If this bucket had metadata while at the 
current generation - * number, the allocator must increment its gen before we reuse - * it: - */ - unsigned had_metadata:1; - - unsigned owned_by_allocator:1; - - unsigned data_type:3; - - unsigned nouse:1; - + u8 data_type:3, + gen_valid:1, + owned_by_allocator:1, + nouse:1, + journal_seq_valid:1, + touched_this_mount:1; u16 dirty_sectors; u16 cached_sectors; diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 57bfb4a..74d54ab 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -412,9 +412,6 @@ static const char *extent_ptr_invalid(const struct bch_fs *c, size_ondisk > ca->mi.bucket_size) return "spans multiple buckets"; - if (!(metadata ? ca->mi.has_metadata : ca->mi.has_data)) - return "device not marked as containing data"; - return NULL; } @@ -547,12 +544,12 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, goto err; } - if (replicas < c->sb.meta_replicas_have) { + if (!bch2_sb_has_replicas(c, e, BCH_DATA_BTREE)) { bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k); bch2_fs_bug(c, - "btree key bad (too few replicas, %u < %u): %s", - replicas, c->sb.meta_replicas_have, buf); + "btree key bad (replicas not marked in superblock):\n%s", + buf); return; } @@ -1755,12 +1752,12 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b, } if (!bkey_extent_is_cached(e.k) && - replicas < c->sb.data_replicas_have) { - bch2_bkey_val_to_text(c, btree_node_type(b), buf, - sizeof(buf), e.s_c); + !bch2_sb_has_replicas(c, e, BCH_DATA_USER)) { + bch2_bkey_val_to_text(c, btree_node_type(b), + buf, sizeof(buf), e.s_c); bch2_fs_bug(c, - "extent key bad (too few replicas, %u < %u): %s", - replicas, c->sb.data_replicas_have, buf); + "extent key bad (replicas not marked in superblock):\n%s", + buf); return; } diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 1145a19..54b523d 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -531,7 +531,8 @@ static int bch2_write_extent(struct bch_write_op *op, key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); - bch2_check_mark_super(c, key_to_write, false); + bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write), + BCH_DATA_USER); bch2_submit_wbio_replicas(to_wbio(bio), c, key_to_write); return ret; diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 92364fe..b0011b4 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -53,28 +53,6 @@ static inline u64 journal_pin_seq(struct journal *j, return last_seq(j) + fifo_entry_idx(&j->pin, pin_list); } -static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, - struct jset_entry *entry, unsigned type) -{ - while (entry < vstruct_last(jset)) { - if (JOURNAL_ENTRY_TYPE(entry) == type) - return entry; - - entry = vstruct_next(entry); - } - - return NULL; -} - -#define for_each_jset_entry_type(entry, jset, type) \ - for (entry = (jset)->start; \ - (entry = __jset_entry_type_next(jset, entry, type)); \ - entry = vstruct_next(entry)) - -#define for_each_jset_key(k, _n, entry, jset) \ - for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \ - vstruct_for_each_safe(entry, k, _n) - static inline void bch2_journal_add_entry(struct journal_buf *buf, const void *data, size_t u64s, unsigned type, enum btree_id id, @@ -123,20 +101,6 @@ static void bch2_journal_add_btree_root(struct journal_buf *buf, JOURNAL_ENTRY_BTREE_ROOT, id, level); } -static inline void bch2_journal_add_prios(struct journal *j, - struct journal_buf *buf) -{ - /* - * no prio bucket ptrs yet... 
XXX should change the allocator so this - * can't happen: - */ - if (!buf->nr_prio_buckets) - return; - - bch2_journal_add_entry(buf, j->prio_buckets, buf->nr_prio_buckets, - JOURNAL_ENTRY_PRIO_PTRS, 0, 0); -} - static void journal_seq_blacklist_flush(struct journal *j, struct journal_entry_pin *pin, u64 seq) { @@ -986,7 +950,6 @@ static inline bool journal_has_keys(struct list_head *list) int bch2_journal_read(struct bch_fs *c, struct list_head *list) { struct journal *j = &c->journal; - struct jset_entry *prio_ptrs; struct journal_list jlist; struct journal_replay *i; struct journal_entry_pin_list *p; @@ -1094,15 +1057,6 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) bch_info(c, "journal read done, %i keys in %i entries, seq %llu", keys, entries, (u64) atomic64_read(&j->seq)); - - i = list_last_entry(list, struct journal_replay, list); - prio_ptrs = bch2_journal_find_entry(&i->j, JOURNAL_ENTRY_PRIO_PTRS, 0); - if (prio_ptrs) { - memcpy_u64s(j->prio_buckets, - prio_ptrs->_data, - le16_to_cpu(prio_ptrs->u64s)); - j->nr_prio_buckets = le16_to_cpu(prio_ptrs->u64s); - } fsck_err: return ret; } @@ -1189,12 +1143,7 @@ static void __bch2_journal_next_entry(struct journal *j) static inline size_t journal_entry_u64s_reserve(struct journal_buf *buf) { - unsigned ret = BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX); - - if (buf->nr_prio_buckets) - ret += JSET_KEYS_U64s + buf->nr_prio_buckets; - - return ret; + return BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX); } static enum { @@ -1395,9 +1344,7 @@ static int journal_entry_open(struct journal *j) buf->disk_sectors = sectors; sectors = min_t(unsigned, sectors, buf->size >> 9); - j->cur_buf_sectors = sectors; - buf->nr_prio_buckets = j->nr_prio_buckets; u64s = (sectors << 9) / sizeof(u64); @@ -1510,17 +1457,27 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) for_each_jset_key(k, _n, entry, &i->j) { struct disk_reservation disk_res; - /* - * We might cause compressed extents to be split, so we - * need to pass in a disk_reservation: - */ - BUG_ON(bch2_disk_reservation_get(c, &disk_res, 0, 0)); + if (entry->btree_id == BTREE_ID_ALLOC) { + /* + * allocation code handles replay for + * BTREE_ID_ALLOC keys: + */ + ret = bch2_alloc_replay_key(c, k->k.p); + } else { + + /* + * We might cause compressed extents to be + * split, so we need to pass in a + * disk_reservation: + */ + BUG_ON(bch2_disk_reservation_get(c, &disk_res, 0, 0)); - ret = bch2_btree_insert(c, entry->btree_id, k, - &disk_res, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_REPLAY); - bch2_disk_reservation_put(c, &disk_res); + ret = bch2_btree_insert(c, entry->btree_id, k, + &disk_res, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_REPLAY); + bch2_disk_reservation_put(c, &disk_res); + } if (ret) { bch_err(c, "journal replay: error %d while replaying key", @@ -1560,13 +1517,12 @@ err: return ret; } -#if 0 /* * Allocate more journal space at runtime - not currently making use if it, but * the code works: */ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, - unsigned nr) + unsigned nr) { struct journal *j = &c->journal; struct journal_device *ja = &ca->journal; @@ -1614,8 +1570,8 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, while (ja->nr < nr) { /* must happen under journal lock, to avoid racing with gc: */ - u64 b = bch2_bucket_alloc(ca, RESERVE_NONE); - if (!b) { + long b = bch2_bucket_alloc(c, ca, RESERVE_NONE); + if (b < 0) { if 
(!closure_wait(&c->freelist_wait, &cl)) { spin_unlock(&j->lock); closure_sync(&cl); @@ -1651,7 +1607,7 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, } spin_unlock(&j->lock); - BUG_ON(bch2_validate_journal_layout(ca->disk_sb.sb, ca->mi)); + BUG_ON(bch2_sb_validate_journal(ca->disk_sb.sb, ca->mi)); bch2_write_super(c); @@ -1663,16 +1619,15 @@ err: kfree(new_buckets); bch2_disk_reservation_put(c, &disk_res); + if (!ret) + bch2_dev_allocator_add(c, ca); + return ret; } -#endif int bch2_dev_journal_alloc(struct bch_dev *ca) { - struct journal_device *ja = &ca->journal; - struct bch_sb_field_journal *journal_buckets; - unsigned i, nr; - u64 b, *p; + unsigned nr; if (dynamic_fault("bcachefs:add:journal_alloc")) return -ENOMEM; @@ -1686,45 +1641,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca) min(1 << 10, (1 << 20) / ca->mi.bucket_size)); - p = krealloc(ja->bucket_seq, nr * sizeof(u64), - GFP_KERNEL|__GFP_ZERO); - if (!p) - return -ENOMEM; - - ja->bucket_seq = p; - - p = krealloc(ja->buckets, nr * sizeof(u64), - GFP_KERNEL|__GFP_ZERO); - if (!p) - return -ENOMEM; - - ja->buckets = p; - - journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, - nr + sizeof(*journal_buckets) / sizeof(u64)); - if (!journal_buckets) - return -ENOMEM; - - for (i = 0, b = ca->mi.first_bucket; - i < nr && b < ca->mi.nbuckets; b++) { - if (!is_available_bucket(ca->buckets[b].mark)) - continue; - - bch2_mark_metadata_bucket(ca, &ca->buckets[b], - BUCKET_JOURNAL, true); - ja->buckets[i] = b; - journal_buckets->buckets[i] = cpu_to_le64(b); - i++; - } - - if (i < nr) - return -ENOSPC; - - BUG_ON(bch2_validate_journal_layout(ca->disk_sb.sb, ca->mi)); - - ja->nr = nr; - - return 0; + return bch2_set_nr_journal_buckets(ca->fs, ca, nr); } /* Journalling */ @@ -2274,9 +2191,6 @@ static void journal_write(struct closure *cl) jset = w->data; j->write_start_time = local_clock(); - - bch2_journal_add_prios(j, w); - mutex_lock(&c->btree_root_lock); for (i = 0; i < BTREE_ID_NR; i++) { struct btree_root *r = &c->btree_roots[i]; @@ -2324,7 +2238,8 @@ static void journal_write(struct closure *cl) closure_return_with_destructor(cl, journal_write_done); } - bch2_check_mark_super(c, &j->key, true); + bch2_check_mark_super(c, bkey_i_to_s_c_extent(&j->key), + BCH_DATA_JOURNAL); /* * XXX: we really should just disable the entire journal in nochanges @@ -2380,7 +2295,7 @@ no_io: closure_return_with_destructor(cl, journal_write_done); err: - bch2_fatal_error(c); + bch2_inconsistent_error(c); closure_return_with_destructor(cl, journal_write_done); } diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index d0dd0d3..88a9bd1 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -121,6 +121,28 @@ struct journal_replay { struct jset j; }; +static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, + struct jset_entry *entry, unsigned type) +{ + while (entry < vstruct_last(jset)) { + if (JOURNAL_ENTRY_TYPE(entry) == type) + return entry; + + entry = vstruct_next(entry); + } + + return NULL; +} + +#define for_each_jset_entry_type(entry, jset, type) \ + for (entry = (jset)->start; \ + (entry = __jset_entry_type_next(jset, entry, type)); \ + entry = vstruct_next(entry)) + +#define for_each_jset_key(k, _n, entry, jset) \ + for_each_jset_entry_type(entry, jset, JOURNAL_ENTRY_BTREE_KEYS) \ + vstruct_for_each_safe(entry, k, _n) + #define JOURNAL_PIN (32 * 1024) static inline bool journal_pin_active(struct journal_entry_pin *pin) diff --git a/libbcachefs/journal_types.h 
b/libbcachefs/journal_types.h index 4b01b14..3314fc0 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -20,13 +20,6 @@ struct journal_buf { unsigned size; unsigned disk_sectors; - - /* - * ugh, prio_buckets are stupid - need to convert them to new - * transaction machinery when it arrives - */ - unsigned nr_prio_buckets; - /* bloom filter: */ unsigned long has_inode[1024 / sizeof(unsigned long)]; }; @@ -189,14 +182,6 @@ struct journal { /* protects advancing ja->last_idx: */ struct mutex reclaim_lock; - - /* - * ugh: need to get prio_buckets converted over to the eventual new - * transaction machinery - */ - __le64 prio_buckets[BCH_SB_MEMBERS_MAX]; - unsigned nr_prio_buckets; - unsigned write_delay_ms; unsigned reclaim_delay_ms; diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index 8c9e3c2..ba0cc0e 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -59,16 +59,18 @@ int bch2_move_data_off_device(struct bch_dev *ca) { struct moving_context ctxt; struct bch_fs *c = ca->fs; - struct bch_sb_field_members *mi; unsigned pass = 0; u64 seen_key_count; int ret = 0; BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW); - if (!ca->mi.has_data) + if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_USER))) return 0; + mutex_lock(&c->replicas_gc_lock); + bch2_replicas_gc_start(c, 1 << BCH_DATA_USER); + bch2_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE); ctxt.avoid = ca; @@ -124,7 +126,11 @@ int bch2_move_data_off_device(struct bch_dev *ca) BUG_ON(ret); seen_key_count++; + continue; next: + if (bkey_extent_is_data(k.k)) + bch2_check_mark_super(c, bkey_s_c_to_extent(k), + BCH_DATA_USER); bch2_btree_iter_advance_pos(&iter); bch2_btree_iter_cond_resched(&iter); @@ -133,23 +139,20 @@ next: bch2_move_ctxt_exit(&ctxt); if (ret) - return ret; + goto err; } while (seen_key_count && pass++ < MAX_DATA_OFF_ITER); if (seen_key_count) { pr_err("Unable to migrate all data in %d iterations.", MAX_DATA_OFF_ITER); - return -1; + ret = -1; + goto err; } - mutex_lock(&c->sb_lock); - mi = bch2_sb_get_members(c->disk_sb); - SET_BCH_MEMBER_HAS_DATA(&mi->members[ca->dev_idx], false); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); - - return 0; +err: + bch2_replicas_gc_end(c, ret); + mutex_unlock(&c->replicas_gc_lock); + return ret; } /* @@ -245,21 +248,27 @@ static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca, int bch2_move_metadata_off_device(struct bch_dev *ca) { struct bch_fs *c = ca->fs; - struct bch_sb_field_members *mi; unsigned i; - int ret; + int ret = 0; BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW); - if (!ca->mi.has_metadata) + if (!(bch2_dev_has_data(c, ca) & + ((1 << BCH_DATA_JOURNAL)| + (1 << BCH_DATA_BTREE)))) return 0; + mutex_lock(&c->replicas_gc_lock); + bch2_replicas_gc_start(c, + (1 << BCH_DATA_JOURNAL)| + (1 << BCH_DATA_BTREE)); + /* 1st, Move the btree nodes off the device */ for (i = 0; i < BTREE_ID_NR; i++) { ret = bch2_move_btree_off(c, ca, i); if (ret) - return ret; + goto err; } /* There are no prios/gens to move -- they are already in the device. 
*/ @@ -268,16 +277,12 @@ int bch2_move_metadata_off_device(struct bch_dev *ca) ret = bch2_journal_move(ca); if (ret) - return ret; - - mutex_lock(&c->sb_lock); - mi = bch2_sb_get_members(c->disk_sb); - SET_BCH_MEMBER_HAS_METADATA(&mi->members[ca->dev_idx], false); - - bch2_write_super(c); - mutex_unlock(&c->sb_lock); + goto err; - return 0; +err: + bch2_replicas_gc_end(c, ret); + mutex_unlock(&c->replicas_gc_lock); + return ret; } /* @@ -326,12 +331,16 @@ static int bch2_flag_key_bad(struct btree_iter *iter, */ int bch2_flag_data_bad(struct bch_dev *ca) { - int ret = 0; + struct bch_fs *c = ca->fs; struct bkey_s_c k; struct bkey_s_c_extent e; struct btree_iter iter; + int ret = 0; - bch2_btree_iter_init(&iter, ca->fs, BTREE_ID_EXTENTS, + mutex_lock(&c->replicas_gc_lock); + bch2_replicas_gc_start(c, 1 << BCH_DATA_USER); + + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH); while ((k = bch2_btree_iter_peek(&iter)).k && @@ -377,10 +386,16 @@ int bch2_flag_data_bad(struct bch_dev *ca) */ continue; advance: + if (bkey_extent_is_data(k.k)) + bch2_check_mark_super(c, bkey_s_c_to_extent(k), + BCH_DATA_USER); bch2_btree_iter_advance_pos(&iter); } bch2_btree_iter_unlock(&iter); + bch2_replicas_gc_end(c, ret); + mutex_unlock(&c->replicas_gc_lock); + return ret; } diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 6fa707d..53eb15a 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -59,6 +59,8 @@ enum opt_type { s8, OPT_UINT(1, BCH_REPLICAS_MAX)) \ BCH_OPT(data_replicas_required, 0444, BCH_SB_DATA_REPLICAS_REQ,\ s8, OPT_UINT(1, BCH_REPLICAS_MAX)) \ + BCH_OPT(degraded, 0444, NO_SB_OPT, \ + s8, OPT_BOOL()) \ BCH_OPT(metadata_checksum, 0644, BCH_SB_META_CSUM_TYPE, \ s8, OPT_STR(bch2_csum_types)) \ BCH_OPT(data_checksum, 0644, BCH_SB_DATA_CSUM_TYPE, \ diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h index b237b75..ab28b07 100644 --- a/libbcachefs/str_hash.h +++ b/libbcachefs/str_hash.h @@ -267,9 +267,6 @@ static inline int bch2_hash_needs_whiteout(const struct bch_hash_desc desc, } } -#define BCH_HASH_SET_MUST_CREATE (1 << 4) -#define BCH_HASH_SET_MUST_REPLACE (1 << 5) - static inline int bch2_hash_set(const struct bch_hash_desc desc, const struct bch_hash_info *info, struct bch_fs *c, u64 inode, diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 130b130..1eae0fc 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -11,6 +11,9 @@ #include #include +static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); +static const char *bch2_sb_validate_replicas(struct bch_sb *); + static inline void __bch2_sb_layout_size_assert(void) { BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); @@ -228,8 +231,8 @@ static int u64_cmp(const void *_l, const void *_r) return l < r ? -1 : l > r ? 
1 : 0; } -const char *bch2_validate_journal_layout(struct bch_sb *sb, - struct bch_member_cpu mi) +const char *bch2_sb_validate_journal(struct bch_sb *sb, + struct bch_member_cpu mi) { struct bch_sb_field_journal *journal; const char *err; @@ -291,7 +294,7 @@ static const char *bch2_sb_validate_members(struct bch_sb *sb) return "Invalid superblock: bad member info"; for (i = 0; i < sb->nr_devices; i++) { - if (bch2_is_zero(mi->members[i].uuid.b, sizeof(uuid_le))) + if (!bch2_dev_exists(sb, mi, i)) continue; if (le16_to_cpu(mi->members[i].bucket_size) < @@ -302,7 +305,7 @@ static const char *bch2_sb_validate_members(struct bch_sb *sb) return NULL; } -const char *bch2_validate_cache_super(struct bcache_superblock *disk_sb) +const char *bch2_sb_validate(struct bcache_superblock *disk_sb) { struct bch_sb *sb = disk_sb->sb; struct bch_sb_field *f; @@ -347,11 +350,6 @@ const char *bch2_validate_cache_super(struct bcache_superblock *disk_sb) BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) return "Invalid number of metadata replicas"; - if (!BCH_SB_META_REPLICAS_HAVE(sb) || - BCH_SB_META_REPLICAS_HAVE(sb) > - BCH_SB_META_REPLICAS_WANT(sb)) - return "Invalid number of metadata replicas"; - if (!BCH_SB_DATA_REPLICAS_WANT(sb) || BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) return "Invalid number of data replicas"; @@ -360,11 +358,6 @@ const char *bch2_validate_cache_super(struct bcache_superblock *disk_sb) BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) return "Invalid number of metadata replicas"; - if (!BCH_SB_DATA_REPLICAS_HAVE(sb) || - BCH_SB_DATA_REPLICAS_HAVE(sb) > - BCH_SB_DATA_REPLICAS_WANT(sb)) - return "Invalid number of data replicas"; - if (!BCH_SB_BTREE_NODE_SIZE(sb)) return "Btree node size not set"; @@ -419,7 +412,11 @@ const char *bch2_validate_cache_super(struct bcache_superblock *disk_sb) mi.bucket_size * mi.nbuckets) return "Invalid superblock: device too small"; - err = bch2_validate_journal_layout(sb, mi); + err = bch2_sb_validate_journal(sb, mi); + if (err) + return err; + + err = bch2_sb_validate_replicas(sb); if (err) return err; @@ -464,8 +461,6 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.btree_node_size = BCH_SB_BTREE_NODE_SIZE(src); c->sb.nr_devices = src->nr_devices; c->sb.clean = BCH_SB_CLEAN(src); - c->sb.meta_replicas_have= BCH_SB_META_REPLICAS_HAVE(src); - c->sb.data_replicas_have= BCH_SB_DATA_REPLICAS_HAVE(src); c->sb.str_hash_type = BCH_SB_STR_HASH_TYPE(src); c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); c->sb.time_base_lo = le64_to_cpu(src->time_base_lo); @@ -517,6 +512,7 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) unsigned journal_u64s = journal_buckets ? 
le32_to_cpu(journal_buckets->field.u64s) : 0; + int ret; lockdep_assert_held(&c->sb_lock); @@ -524,8 +520,12 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) return -ENOMEM; __copy_super(c->disk_sb, src); - bch2_sb_update(c); + ret = bch2_sb_replicas_to_cpu_replicas(c); + if (ret) + return ret; + + bch2_sb_update(c); return 0; } @@ -743,6 +743,7 @@ void bch2_write_super(struct bch_fs *c) struct closure *cl = &c->sb_write; struct bch_dev *ca; unsigned i, super_idx = 0; + const char *err; bool wrote; lockdep_assert_held(&c->sb_lock); @@ -754,7 +755,16 @@ void bch2_write_super(struct bch_fs *c) for_each_online_member(ca, c, i) bch2_sb_from_fs(c, ca); - if (c->opts.nochanges) + for_each_online_member(ca, c, i) { + err = bch2_sb_validate(&ca->disk_sb); + if (err) { + bch2_fs_inconsistent(c, "sb invalid before write: %s", err); + goto out; + } + } + + if (c->opts.nochanges || + test_bit(BCH_FS_ERROR, &c->flags)) goto out; do { @@ -771,40 +781,482 @@ out: bch2_sb_update(c); } -void bch2_check_mark_super_slowpath(struct bch_fs *c, const struct bkey_i *k, - bool meta) +/* replica information: */ + +static inline struct bch_replicas_entry * +replicas_entry_next(struct bch_replicas_entry *i) +{ + return (void *) i + offsetof(struct bch_replicas_entry, devs) + i->nr; +} + +#define for_each_replicas_entry(_r, _i) \ + for (_i = (_r)->entries; \ + (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ + (_i) = replicas_entry_next(_i)) + +static void bch2_sb_replicas_nr_entries(struct bch_sb_field_replicas *r, + unsigned *nr, + unsigned *bytes, + unsigned *max_dev) +{ + struct bch_replicas_entry *i; + unsigned j; + + *nr = 0; + *bytes = sizeof(*r); + *max_dev = 0; + + if (!r) + return; + + for_each_replicas_entry(r, i) { + for (j = 0; j < i->nr; j++) + *max_dev = max_t(unsigned, *max_dev, i->devs[j]); + (*nr)++; + } + + *bytes = (void *) i - (void *) r; +} + +static struct bch_replicas_cpu * +__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r) +{ + struct bch_replicas_cpu *cpu_r; + unsigned i, nr, bytes, max_dev, entry_size; + + bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev); + + entry_size = offsetof(struct bch_replicas_cpu_entry, devs) + + DIV_ROUND_UP(max_dev + 1, 8); + + cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) + + nr * entry_size, GFP_NOIO); + if (!cpu_r) + return NULL; + + cpu_r->nr = nr; + cpu_r->entry_size = entry_size; + + if (nr) { + struct bch_replicas_cpu_entry *dst = + cpu_replicas_entry(cpu_r, 0); + struct bch_replicas_entry *src = sb_r->entries; + + while (dst < cpu_replicas_entry(cpu_r, nr)) { + dst->data_type = src->data_type; + for (i = 0; i < src->nr; i++) + replicas_set_dev(dst, src->devs[i]); + + src = replicas_entry_next(src); + dst = (void *) dst + entry_size; + } + } + + eytzinger0_sort(cpu_r->entries, + cpu_r->nr, + cpu_r->entry_size, + memcmp, NULL); + return cpu_r; +} + +static int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) +{ + struct bch_sb_field_replicas *sb_r; + struct bch_replicas_cpu *cpu_r, *old_r; + + lockdep_assert_held(&c->sb_lock); + + sb_r = bch2_sb_get_replicas(c->disk_sb); + cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r); + if (!cpu_r) + return -ENOMEM; + + old_r = c->replicas; + rcu_assign_pointer(c->replicas, cpu_r); + if (old_r) + kfree_rcu(old_r, rcu); + + return 0; +} + +/* + * for when gc of replica information is in progress: + */ +static int bch2_update_gc_replicas(struct bch_fs *c, + struct bch_replicas_cpu *gc_r, + struct bkey_s_c_extent e, + enum bch_data_types data_type) { - struct 
bch_member *mi; - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); const struct bch_extent_ptr *ptr; - unsigned nr_replicas = 0; + struct bch_replicas_cpu_entry *new_e; + struct bch_replicas_cpu *new; + unsigned i, nr, entry_size, max_dev = 0; + + extent_for_each_ptr(e, ptr) + if (!ptr->cached) + max_dev = max_t(unsigned, max_dev, ptr->dev); + + entry_size = offsetof(struct bch_replicas_cpu_entry, devs) + + DIV_ROUND_UP(max_dev + 1, 8); + entry_size = max(entry_size, gc_r->entry_size); + nr = gc_r->nr + 1; + + new = kzalloc(sizeof(struct bch_replicas_cpu) + + nr * entry_size, GFP_NOIO); + if (!new) + return -ENOMEM; + + new->nr = nr; + new->entry_size = entry_size; + + for (i = 0; i < gc_r->nr; i++) + memcpy(cpu_replicas_entry(new, i), + cpu_replicas_entry(gc_r, i), + gc_r->entry_size); + + new_e = cpu_replicas_entry(new, nr - 1); + new_e->data_type = data_type; + + extent_for_each_ptr(e, ptr) + if (!ptr->cached) + replicas_set_dev(new_e, ptr->dev); + + eytzinger0_sort(new->entries, + new->nr, + new->entry_size, + memcmp, NULL); + + rcu_assign_pointer(c->replicas_gc, new); + kfree_rcu(gc_r, rcu); + return 0; +} + +int bch2_check_mark_super_slowpath(struct bch_fs *c, struct bkey_s_c_extent e, + enum bch_data_types data_type) +{ + struct bch_replicas_cpu *gc_r; + const struct bch_extent_ptr *ptr; + struct bch_sb_field_replicas *sb_r; + struct bch_replicas_entry *new_entry; + unsigned new_entry_bytes, new_u64s, nr, bytes, max_dev; + int ret = 0; mutex_lock(&c->sb_lock); + gc_r = rcu_dereference_protected(c->replicas_gc, + lockdep_is_held(&c->sb_lock)); + if (gc_r && + !replicas_has_extent(gc_r, e, data_type)) { + ret = bch2_update_gc_replicas(c, gc_r, e, data_type); + if (ret) + goto err; + } + /* recheck, might have raced */ - if (bch2_check_super_marked(c, k, meta)) { + if (bch2_sb_has_replicas(c, e, data_type)) { mutex_unlock(&c->sb_lock); - return; + return 0; } - mi = bch2_sb_get_members(c->disk_sb)->members; + new_entry_bytes = sizeof(struct bch_replicas_entry) + + bch2_extent_nr_dirty_ptrs(e.s_c); + + sb_r = bch2_sb_get_replicas(c->disk_sb); + + bch2_sb_replicas_nr_entries(sb_r, &nr, &bytes, &max_dev); + + new_u64s = DIV_ROUND_UP(bytes + new_entry_bytes, sizeof(u64)); + + sb_r = bch2_fs_sb_resize_replicas(c, + DIV_ROUND_UP(sizeof(*sb_r) + bytes + new_entry_bytes, + sizeof(u64))); + if (!sb_r) { + ret = -ENOSPC; + goto err; + } + + new_entry = (void *) sb_r + bytes; + new_entry->data_type = data_type; + new_entry->nr = 0; extent_for_each_ptr(e, ptr) - if (!ptr->cached) { - (meta - ? 
SET_BCH_MEMBER_HAS_METADATA - : SET_BCH_MEMBER_HAS_DATA)(mi + ptr->dev, true); - nr_replicas++; + if (!ptr->cached) + new_entry->devs[new_entry->nr++] = ptr->dev; + + ret = bch2_sb_replicas_to_cpu_replicas(c); + if (ret) { + memset(new_entry, 0, + vstruct_end(&sb_r->field) - (void *) new_entry); + goto err; + } + + bch2_write_super(c); +err: + mutex_unlock(&c->sb_lock); + return ret; +} + +struct replicas_status __bch2_replicas_status(struct bch_fs *c, + struct bch_dev *dev_to_offline) +{ + struct bch_replicas_cpu_entry *e; + struct bch_replicas_cpu *r; + unsigned i, dev, dev_slots, nr_online, nr_offline; + struct replicas_status ret; + + memset(&ret, 0, sizeof(ret)); + + for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) + ret.replicas[i].nr_online = UINT_MAX; + + rcu_read_lock(); + r = rcu_dereference(c->replicas); + dev_slots = min_t(unsigned, replicas_dev_slots(r), c->sb.nr_devices); + + for (i = 0; i < r->nr; i++) { + e = cpu_replicas_entry(r, i); + + BUG_ON(e->data_type >= ARRAY_SIZE(ret.replicas)); + + nr_online = nr_offline = 0; + + for (dev = 0; dev < dev_slots; dev++) { + if (!replicas_test_dev(e, dev)) + continue; + + if (bch2_dev_is_online(c->devs[dev]) && + c->devs[dev] != dev_to_offline) + nr_online++; + else + nr_offline++; } - nr_replicas = min_t(unsigned, nr_replicas, - (meta - ? BCH_SB_META_REPLICAS_HAVE - : BCH_SB_DATA_REPLICAS_HAVE)(c->disk_sb)); - (meta - ? SET_BCH_SB_META_REPLICAS_HAVE - : SET_BCH_SB_DATA_REPLICAS_HAVE)(c->disk_sb, nr_replicas); + ret.replicas[e->data_type].nr_online = + min(ret.replicas[e->data_type].nr_online, + nr_online); + + ret.replicas[e->data_type].nr_offline = + max(ret.replicas[e->data_type].nr_offline, + nr_offline); + } + + rcu_read_unlock(); + + return ret; +} + +struct replicas_status bch2_replicas_status(struct bch_fs *c) +{ + return __bch2_replicas_status(c, NULL); +} + +unsigned bch2_replicas_online(struct bch_fs *c, bool meta) +{ + struct replicas_status s = bch2_replicas_status(c); + + return meta + ? 
min(s.replicas[BCH_DATA_JOURNAL].nr_online, + s.replicas[BCH_DATA_BTREE].nr_online) + : s.replicas[BCH_DATA_USER].nr_online; +} + +unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) +{ + struct bch_replicas_cpu_entry *e; + struct bch_replicas_cpu *r; + unsigned i, ret = 0; + + rcu_read_lock(); + r = rcu_dereference(c->replicas); + + if (ca->dev_idx >= replicas_dev_slots(r)) + goto out; + + for (i = 0; i < r->nr; i++) { + e = cpu_replicas_entry(r, i); + + if (replicas_test_dev(e, ca->dev_idx)) { + ret |= 1 << e->data_type; + break; + } + } +out: + rcu_read_unlock(); + + return ret; +} + +static const char *bch2_sb_validate_replicas(struct bch_sb *sb) +{ + struct bch_sb_field_members *mi; + struct bch_sb_field_replicas *sb_r; + struct bch_replicas_cpu *cpu_r = NULL; + struct bch_replicas_entry *e; + const char *err; + unsigned i; + + mi = bch2_sb_get_members(sb); + sb_r = bch2_sb_get_replicas(sb); + if (!sb_r) + return NULL; + + for_each_replicas_entry(sb_r, e) { + err = "invalid replicas entry: invalid data type"; + if (e->data_type >= BCH_DATA_NR) + goto err; + + err = "invalid replicas entry: too many devices"; + if (e->nr >= BCH_REPLICAS_MAX) + goto err; + + err = "invalid replicas entry: invalid device"; + for (i = 0; i < e->nr; i++) + if (!bch2_dev_exists(sb, mi, e->devs[i])) + goto err; + } + + err = "cannot allocate memory"; + cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_r); + if (!cpu_r) + goto err; + + sort_cmp_size(cpu_r->entries, + cpu_r->nr, + cpu_r->entry_size, + memcmp, NULL); + + for (i = 0; i + 1 < cpu_r->nr; i++) { + struct bch_replicas_cpu_entry *l = + cpu_replicas_entry(cpu_r, i); + struct bch_replicas_cpu_entry *r = + cpu_replicas_entry(cpu_r, i + 1); + + BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0); + + err = "duplicate replicas entry"; + if (!memcmp(l, r, cpu_r->entry_size)) + goto err; + } + + err = NULL; +err: + kfree(cpu_r); + return err; +} + +int bch2_replicas_gc_end(struct bch_fs *c, int err) +{ + struct bch_sb_field_replicas *sb_r; + struct bch_replicas_cpu *r, *old_r; + struct bch_replicas_entry *dst_e; + size_t i, j, bytes, dev_slots; + int ret = 0; + + lockdep_assert_held(&c->replicas_gc_lock); + + mutex_lock(&c->sb_lock); + + r = rcu_dereference_protected(c->replicas_gc, + lockdep_is_held(&c->sb_lock)); + + if (err) { + rcu_assign_pointer(c->replicas_gc, NULL); + kfree_rcu(r, rcu); + goto err; + } + + dev_slots = replicas_dev_slots(r); + + bytes = sizeof(struct bch_sb_field_replicas); + + for (i = 0; i < r->nr; i++) { + struct bch_replicas_cpu_entry *e = + cpu_replicas_entry(r, i); + + bytes += sizeof(struct bch_replicas_entry); + for (j = 0; j < r->entry_size - 1; j++) + bytes += hweight8(e->devs[j]); + } + + sb_r = bch2_fs_sb_resize_replicas(c, + DIV_ROUND_UP(sizeof(*sb_r) + bytes, sizeof(u64))); + if (!sb_r) { + ret = -ENOSPC; + goto err; + } + + memset(&sb_r->entries, 0, + vstruct_end(&sb_r->field) - + (void *) &sb_r->entries); + + dst_e = sb_r->entries; + for (i = 0; i < r->nr; i++) { + struct bch_replicas_cpu_entry *src_e = + cpu_replicas_entry(r, i); + + dst_e->data_type = src_e->data_type; + + for (j = 0; j < dev_slots; j++) + if (replicas_test_dev(src_e, j)) + dst_e->devs[dst_e->nr++] = j; + + dst_e = replicas_entry_next(dst_e); + } + + old_r = rcu_dereference_protected(c->replicas, + lockdep_is_held(&c->sb_lock)); + rcu_assign_pointer(c->replicas, r); + rcu_assign_pointer(c->replicas_gc, NULL); + kfree_rcu(old_r, rcu); bch2_write_super(c); +err: mutex_unlock(&c->sb_lock); + return ret; +} + +int bch2_replicas_gc_start(struct bch_fs 
*c, unsigned typemask) +{ + struct bch_replicas_cpu *r, *src; + unsigned i; + + lockdep_assert_held(&c->replicas_gc_lock); + + mutex_lock(&c->sb_lock); + BUG_ON(c->replicas_gc); + + src = rcu_dereference_protected(c->replicas, + lockdep_is_held(&c->sb_lock)); + + r = kzalloc(sizeof(struct bch_replicas_cpu) + + src->nr * src->entry_size, GFP_NOIO); + if (!r) { + mutex_unlock(&c->sb_lock); + return -ENOMEM; + } + + r->entry_size = src->entry_size; + r->nr = 0; + + for (i = 0; i < src->nr; i++) { + struct bch_replicas_cpu_entry *dst_e = + cpu_replicas_entry(r, r->nr); + struct bch_replicas_cpu_entry *src_e = + cpu_replicas_entry(src, i); + + if (!(src_e->data_type & typemask)) { + memcpy(dst_e, src_e, r->entry_size); + r->nr++; + } + } + + eytzinger0_sort(r->entries, + r->nr, + r->entry_size, + memcmp, NULL); + + rcu_assign_pointer(c->replicas_gc, r); + mutex_unlock(&c->sb_lock); + + return 0; } diff --git a/libbcachefs/super-io.h b/libbcachefs/super-io.h index 8f0d82d..879fdda 100644 --- a/libbcachefs/super-io.h +++ b/libbcachefs/super-io.h @@ -2,6 +2,7 @@ #define _BCACHE_SUPER_IO_H #include "extents.h" +#include "eytzinger.h" #include "super_types.h" #include @@ -40,6 +41,15 @@ bch2_fs_sb_resize_##_name(struct bch_fs *c, unsigned u64s) \ BCH_SB_FIELD_TYPE(journal); BCH_SB_FIELD_TYPE(members); BCH_SB_FIELD_TYPE(crypt); +BCH_SB_FIELD_TYPE(replicas); + +static inline bool bch2_dev_exists(struct bch_sb *sb, + struct bch_sb_field_members *mi, + unsigned dev) +{ + return dev < sb->nr_devices && + !bch2_is_zero(mi->members[dev].uuid.b, sizeof(uuid_le)); +} static inline bool bch2_sb_test_feature(struct bch_sb *sb, enum bch_sb_features f) @@ -91,8 +101,6 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) .bucket_size = le16_to_cpu(mi->bucket_size), .state = BCH_MEMBER_STATE(mi), .tier = BCH_MEMBER_TIER(mi), - .has_metadata = BCH_MEMBER_HAS_METADATA(mi), - .has_data = BCH_MEMBER_HAS_DATA(mi), .replacement = BCH_MEMBER_REPLACEMENT(mi), .discard = BCH_MEMBER_DISCARD(mi), .valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)), @@ -105,55 +113,116 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); void bch2_free_super(struct bcache_superblock *); int bch2_super_realloc(struct bcache_superblock *, unsigned); -const char *bch2_validate_journal_layout(struct bch_sb *, +const char *bch2_sb_validate_journal(struct bch_sb *, struct bch_member_cpu); -const char *bch2_validate_cache_super(struct bcache_superblock *); +const char *bch2_sb_validate(struct bcache_superblock *); const char *bch2_read_super(struct bcache_superblock *, struct bch_opts, const char *); void bch2_write_super(struct bch_fs *); -void bch2_check_mark_super_slowpath(struct bch_fs *, - const struct bkey_i *, bool); +static inline bool replicas_test_dev(struct bch_replicas_cpu_entry *e, + unsigned dev) +{ + return (e->devs[dev >> 3] & (1 << (dev & 7))) != 0; +} -static inline bool bch2_check_super_marked(struct bch_fs *c, - const struct bkey_i *k, bool meta) +static inline void replicas_set_dev(struct bch_replicas_cpu_entry *e, + unsigned dev) { - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); - const struct bch_extent_ptr *ptr; - unsigned nr_replicas = 0; - bool ret = true; + e->devs[dev >> 3] |= 1 << (dev & 7); +} - extent_for_each_ptr(e, ptr) { - struct bch_dev *ca = c->devs[ptr->dev]; +static inline unsigned replicas_dev_slots(struct bch_replicas_cpu *r) +{ + return (r->entry_size - + offsetof(struct bch_replicas_cpu_entry, devs)) * 8; +} - if (ptr->cached) - continue; +static inline struct 
bch_replicas_cpu_entry * +cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) +{ + return (void *) r->entries + r->entry_size * i; +} - if (!(meta - ? ca->mi.has_metadata - : ca->mi.has_data)) { - ret = false; - break; +int bch2_check_mark_super_slowpath(struct bch_fs *, struct bkey_s_c_extent, + enum bch_data_types); + +static inline bool replicas_has_extent(struct bch_replicas_cpu *r, + struct bkey_s_c_extent e, + enum bch_data_types data_type) +{ + const struct bch_extent_ptr *ptr; + struct bch_replicas_cpu_entry search = { + .data_type = data_type, + }; + unsigned max_dev = 0; + + BUG_ON(!data_type || + data_type == BCH_DATA_SB || + data_type >= BCH_DATA_NR); + + extent_for_each_ptr(e, ptr) + if (!ptr->cached) { + max_dev = max_t(unsigned, max_dev, ptr->dev); + replicas_set_dev(&search, ptr->dev); } - nr_replicas++; - } + return max_dev < replicas_dev_slots(r) && + eytzinger0_find(r->entries, r->nr, + r->entry_size, + memcmp, &search) < r->nr; +} + +static inline bool bch2_sb_has_replicas(struct bch_fs *c, + struct bkey_s_c_extent e, + enum bch_data_types data_type) +{ + bool ret; - if (nr_replicas < - (meta ? c->sb.meta_replicas_have : c->sb.data_replicas_have)) - ret = false; + rcu_read_lock(); + ret = replicas_has_extent(rcu_dereference(c->replicas), + e, data_type); + rcu_read_unlock(); return ret; } -static inline void bch2_check_mark_super(struct bch_fs *c, - const struct bkey_i *k, bool meta) +static inline int bch2_check_mark_super(struct bch_fs *c, + struct bkey_s_c_extent e, + enum bch_data_types data_type) { - if (bch2_check_super_marked(c, k, meta)) - return; + struct bch_replicas_cpu *gc_r; + bool marked; - bch2_check_mark_super_slowpath(c, k, meta); + rcu_read_lock(); + marked = replicas_has_extent(rcu_dereference(c->replicas), + e, data_type) && + (!(gc_r = rcu_dereference(c->replicas_gc)) || + replicas_has_extent(gc_r, e, data_type)); + rcu_read_unlock(); + + if (marked) + return 0; + + return bch2_check_mark_super_slowpath(c, e, data_type); } +struct replicas_status { + struct { + unsigned nr_online; + unsigned nr_offline; + } replicas[BCH_DATA_NR]; +}; + +struct replicas_status __bch2_replicas_status(struct bch_fs *, + struct bch_dev *); +struct replicas_status bch2_replicas_status(struct bch_fs *); + +unsigned bch2_replicas_online(struct bch_fs *, bool); +unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); + +int bch2_replicas_gc_end(struct bch_fs *, int); +int bch2_replicas_gc_start(struct bch_fs *, unsigned); + #endif /* _BCACHE_SUPER_IO_H */ diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 2a3947e..692eb41 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -224,6 +224,9 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch2_dev_allocator_stop(ca); bch2_fs_journal_stop(&c->journal); + + for_each_member_device(ca, c, i) + bch2_dev_allocator_remove(c, ca); } static void bch2_writes_disabled(struct percpu_ref *writes) @@ -330,6 +333,10 @@ const char *bch2_fs_read_write(struct bch_fs *c) c->state != BCH_FS_RO) goto out; + for_each_rw_member(ca, c, i) + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); + err = "error starting allocator thread"; for_each_rw_member(ca, c, i) if (bch2_dev_allocator_start(ca)) { @@ -484,6 +491,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mutex_init(&c->state_lock); mutex_init(&c->sb_lock); + mutex_init(&c->replicas_gc_lock); mutex_init(&c->btree_cache_lock); mutex_init(&c->bucket_lock); mutex_init(&c->btree_root_lock); @@ -603,7 +611,7 @@ static struct 
bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mi = bch2_sb_get_members(c->disk_sb); for (i = 0; i < c->sb.nr_devices; i++) - if (!bch2_is_zero(mi->members[i].uuid.b, sizeof(uuid_le)) && + if (bch2_dev_exists(c->disk_sb, mi, i) && bch2_dev_alloc(c, i)) goto err; @@ -681,12 +689,16 @@ static const char *__bch2_fs_start(struct bch_fs *c) const char *err = "cannot allocate memory"; struct bch_sb_field_members *mi; struct bch_dev *ca; - unsigned i, id; - time64_t now; LIST_HEAD(journal); struct jset *j; + struct closure cl; + u64 journal_seq = 0; + time64_t now; + unsigned i; int ret = -EINVAL; + closure_init_stack(&cl); + BUG_ON(c->state != BCH_FS_STARTING); mutex_lock(&c->sb_lock); @@ -694,6 +706,10 @@ static const char *__bch2_fs_start(struct bch_fs *c) bch2_sb_from_fs(c, ca); mutex_unlock(&c->sb_lock); + for_each_rw_member(ca, c, i) + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); + if (BCH_SB_INITIALIZED(c->disk_sb)) { ret = bch2_journal_read(c, &journal); if (ret) @@ -704,44 +720,45 @@ static const char *__bch2_fs_start(struct bch_fs *c) c->prio_clock[READ].hand = le16_to_cpu(j->read_clock); c->prio_clock[WRITE].hand = le16_to_cpu(j->write_clock); - err = "error reading priorities"; - for_each_readable_member(ca, c, i) { - ret = bch2_prio_read(ca); - if (ret) { - percpu_ref_put(&ca->io_ref); - goto err; - } - } - - for (id = 0; id < BTREE_ID_NR; id++) { + for (i = 0; i < BTREE_ID_NR; i++) { unsigned level; struct bkey_i *k; - err = "bad btree root"; - k = bch2_journal_find_btree_root(c, j, id, &level); - if (!k && id == BTREE_ID_EXTENTS) + err = "missing btree root"; + k = bch2_journal_find_btree_root(c, j, i, &level); + if (!k && i < BTREE_ID_ALLOC) goto err; - if (!k) { - pr_debug("missing btree root: %d", id); + + if (!k) continue; - } err = "error reading btree root"; - if (bch2_btree_root_read(c, id, k, level)) + if (bch2_btree_root_read(c, i, k, level)) goto err; } - bch_verbose(c, "starting mark and sweep:"); + err = "error reading allocation information"; + ret = bch2_alloc_read(c, &journal); + if (ret) + goto err; + bch_verbose(c, "starting mark and sweep:"); err = "error in recovery"; ret = bch2_initial_gc(c, &journal); if (ret) goto err; + bch_verbose(c, "mark and sweep done"); if (c->opts.noreplay) goto recovery_done; - bch_verbose(c, "mark and sweep done"); + err = "cannot allocate new btree root"; + for (i = 0; i < BTREE_ID_NR; i++) + if (!c->btree_roots[i].b && + bch2_btree_root_alloc(c, i, &cl)) + goto err; + + closure_sync(&cl); /* * bch2_journal_start() can't happen sooner, or btree_gc_finish() @@ -758,12 +775,10 @@ static const char *__bch2_fs_start(struct bch_fs *c) } bch_verbose(c, "starting journal replay:"); - err = "journal replay failed"; ret = bch2_journal_replay(c, &journal); if (ret) goto err; - bch_verbose(c, "journal replay done"); if (c->opts.norecovery) @@ -774,23 +789,21 @@ static const char *__bch2_fs_start(struct bch_fs *c) ret = bch2_fsck(c, !c->opts.nofsck); if (ret) goto err; + bch_verbose(c, "fsck done"); for_each_rw_member(ca, c, i) - if (ca->need_prio_write) { - ret = bch2_prio_write(ca); + if (ca->need_alloc_write) { + ret = bch2_alloc_write(c, ca, &journal_seq); if (ret) { percpu_ref_put(&ca->io_ref); goto err; } } - bch_verbose(c, "fsck done"); + bch2_journal_flush_seq(&c->journal, journal_seq); } else { struct bch_inode_unpacked inode; struct bkey_inode_buf packed_inode; - struct closure cl; - - closure_init_stack(&cl); bch_notice(c, "initializing new filesystem"); @@ -805,6 +818,11 @@ static const char 
*__bch2_fs_start(struct bch_fs *c) goto err; } + err = "cannot allocate new btree root"; + for (i = 0; i < BTREE_ID_NR; i++) + if (bch2_btree_root_alloc(c, i, &cl)) + goto err; + /* * journal_res_get() will crash if called before this has * set up the journal.pin FIFO and journal.cur pointer: @@ -819,13 +837,6 @@ static const char *__bch2_fs_start(struct bch_fs *c) goto err; } - err = "cannot allocate new btree root"; - for (id = 0; id < BTREE_ID_NR; id++) - if (bch2_btree_root_alloc(c, id, &cl)) { - closure_sync(&cl); - goto err; - } - /* Wait for new btree roots to be written: */ closure_sync(&cl); @@ -877,6 +888,8 @@ out: bch2_journal_entries_free(&journal); return err; err: + closure_sync(&cl); + switch (ret) { case BCH_FSCK_ERRORS_NOT_FIXED: bch_err(c, "filesystem contains errors: please report this to the developers"); @@ -940,10 +953,7 @@ static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) if (uuid_le_cmp(fs->uuid, sb->uuid)) return "device not a member of filesystem"; - if (sb->dev_idx >= newest->nr_devices) - return "device has invalid dev_idx"; - - if (bch2_is_zero(mi->members[sb->dev_idx].uuid.b, sizeof(uuid_le))) + if (!bch2_dev_exists(newest, mi, sb->dev_idx)) return "device has been removed"; if (fs->block_size != sb->block_size) @@ -981,9 +991,6 @@ static void bch2_dev_free(struct bch_dev *ca) free_percpu(ca->sectors_written); bioset_exit(&ca->replica_set); free_percpu(ca->usage_percpu); - kvpfree(ca->disk_buckets, bucket_bytes(ca)); - kfree(ca->prio_buckets); - kfree(ca->bio_prio); kvpfree(ca->buckets, ca->mi.nbuckets * sizeof(struct bucket)); kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8)); free_heap(&ca->copygc_heap); @@ -1011,7 +1018,7 @@ static void __bch2_dev_offline(struct bch_dev *ca) lockdep_assert_held(&c->state_lock); - __bch2_dev_read_only(ca->fs, ca); + __bch2_dev_read_only(c, ca); reinit_completion(&ca->offline_complete); percpu_ref_kill(&ca->io_ref); @@ -1061,7 +1068,7 @@ static int bch2_dev_sysfs_online(struct bch_dev *ca) return 0; if (!ca->kobj.state_in_sysfs) { - ret = kobject_add(&ca->kobj, &ca->fs->kobj, + ret = kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx); if (ret) return ret; @@ -1087,7 +1094,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) struct bch_member *member; size_t reserve_none, movinggc_reserve, free_inc_reserve, total_reserve; size_t heap_size; - unsigned i; + unsigned i, btree_node_reserve_buckets; struct bch_dev *ca; if (bch2_fs_init_fault("dev_alloc")) @@ -1107,8 +1114,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ca->dev_idx = dev_idx; spin_lock_init(&ca->freelist_lock); - spin_lock_init(&ca->prio_buckets_lock); - mutex_init(&ca->prio_write_lock); bch2_dev_moving_gc_init(ca); INIT_WORK(&ca->io_error_work, bch2_nonfatal_io_error_work); @@ -1134,12 +1139,16 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) free_inc_reserve = movinggc_reserve / 2; heap_size = movinggc_reserve * 8; + btree_node_reserve_buckets = + DIV_ROUND_UP(BTREE_NODE_RESERVE, + ca->mi.bucket_size / c->sb.btree_node_size); + if (percpu_ref_init(&ca->ref, bch2_dev_ref_release, 0, GFP_KERNEL) || percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_release, PERCPU_REF_INIT_DEAD, GFP_KERNEL) || - !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) || - !init_fifo(&ca->free[RESERVE_BTREE], BTREE_NODE_RESERVE, GFP_KERNEL) || + !init_fifo(&ca->free[RESERVE_BTREE], btree_node_reserve_buckets, + GFP_KERNEL) || !init_fifo(&ca->free[RESERVE_MOVINGGC], movinggc_reserve, GFP_KERNEL) || 
 	    !init_fifo(&ca->free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
@@ -1152,18 +1161,12 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
 	    !(ca->buckets = kvpmalloc(ca->mi.nbuckets *
				      sizeof(struct bucket),
				      GFP_KERNEL|__GFP_ZERO)) ||
-	    !(ca->prio_buckets = kzalloc(sizeof(u64) * prio_buckets(ca) *
-					 2, GFP_KERNEL)) ||
-	    !(ca->disk_buckets = kvpmalloc(bucket_bytes(ca), GFP_KERNEL)) ||
 	    !(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)) ||
-	    !(ca->bio_prio = bio_kmalloc(GFP_NOIO, bucket_pages(ca))) ||
 	    bioset_init(&ca->replica_set, 4,
			offsetof(struct bch_write_bio, bio)) ||
 	    !(ca->sectors_written = alloc_percpu(*ca->sectors_written)))
 		goto err;
 
-	ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
-
 	total_reserve = ca->free_inc.size;
 	for (i = 0; i < RESERVE_NR; i++)
 		total_reserve += ca->free[i].size;
@@ -1232,53 +1235,48 @@ static int __bch2_dev_online(struct bch_fs *c, struct bcache_superblock *sb)
 
 	lg_local_lock(&c->usage_lock);
 	if (!gc_will_visit(c, gc_phase(GC_PHASE_SB_METADATA)))
-		bch2_mark_dev_metadata(ca->fs, ca);
+		bch2_mark_dev_metadata(c, ca);
 	lg_local_unlock(&c->usage_lock);
 
+	if (ca->mi.state == BCH_MEMBER_STATE_RW) {
+		struct bch_sb_field_journal *journal_buckets =
+			bch2_sb_get_journal(ca->disk_sb.sb);
+		bool has_journal =
+			bch2_nr_journal_buckets(journal_buckets) >=
+			BCH_JOURNAL_BUCKETS_MIN;
+
+		bch2_dev_group_add(&c->tiers[ca->mi.tier].devs, ca);
+		bch2_dev_group_add(&c->all_devs, ca);
+
+		if (has_journal)
+			bch2_dev_group_add(&c->journal.devs, ca);
+	}
+
 	percpu_ref_reinit(&ca->io_ref);
 	return 0;
 }
 
 /* Device management: */
 
-bool bch2_fs_may_start(struct bch_fs *c, int flags)
+static bool have_enough_devs(struct bch_fs *c,
+			     struct replicas_status s,
+			     unsigned flags)
 {
-	struct bch_sb_field_members *mi;
-	unsigned meta_missing = 0;
-	unsigned data_missing = 0;
-	bool degraded = false;
-	unsigned i;
-
-	mutex_lock(&c->sb_lock);
-	mi = bch2_sb_get_members(c->disk_sb);
-
-	for (i = 0; i < c->disk_sb->nr_devices; i++)
-		if (!c->devs[i] &&
-		    !bch2_is_zero(mi->members[i].uuid.b, sizeof(uuid_le))) {
-			degraded = true;
-			if (BCH_MEMBER_HAS_METADATA(&mi->members[i]))
-				meta_missing++;
-			if (BCH_MEMBER_HAS_DATA(&mi->members[i]))
-				data_missing++;
-		}
-	mutex_unlock(&c->sb_lock);
-
-	if (degraded &&
-	    !(flags & BCH_FORCE_IF_DEGRADED))
-		return false;
-
-	if (meta_missing &&
+	if ((s.replicas[BCH_DATA_JOURNAL].nr_offline ||
+	     s.replicas[BCH_DATA_BTREE].nr_offline) &&
 	    !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
 		return false;
 
-	if (meta_missing >= BCH_SB_META_REPLICAS_HAVE(c->disk_sb) &&
+	if ((!s.replicas[BCH_DATA_JOURNAL].nr_online ||
+	     !s.replicas[BCH_DATA_BTREE].nr_online) &&
 	    !(flags & BCH_FORCE_IF_METADATA_LOST))
 		return false;
 
-	if (data_missing && !(flags & BCH_FORCE_IF_DATA_DEGRADED))
+	if (s.replicas[BCH_DATA_USER].nr_offline &&
+	    !(flags & BCH_FORCE_IF_DATA_DEGRADED))
 		return false;
 
-	if (data_missing >= BCH_SB_DATA_REPLICAS_HAVE(c->disk_sb) &&
+	if (!s.replicas[BCH_DATA_USER].nr_online &&
 	    !(flags & BCH_FORCE_IF_DATA_LOST))
 		return false;
 
@@ -1297,40 +1295,80 @@ bool bch2_fs_may_start(struct bch_fs *c, int flags)
 bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
			    enum bch_member_state new_state, int flags)
 {
-	lockdep_assert_held(&c->state_lock);
-
-	if (new_state == BCH_MEMBER_STATE_RW)
-		return true;
+	struct replicas_status s;
+	struct bch_dev *ca2;
+	int i, nr_rw = 0, required;
 
-	if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
-		return true;
+	lockdep_assert_held(&c->state_lock);
 
-	/*
-	 * If the device is already offline - whatever is going on with it can't
-	 * possible make the FS need to go RO:
-	 */
-	if (!bch2_dev_is_online(ca))
+	switch (new_state) {
+	case BCH_MEMBER_STATE_RW:
 		return true;
+	case BCH_MEMBER_STATE_RO:
+		if (ca->mi.state != BCH_MEMBER_STATE_RW)
+			return true;
+
+		/* do we have enough devices to write to? */
+		for_each_member_device(ca2, c, i)
+			nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW;
+
+		required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
+			       ? c->opts.metadata_replicas
+			       : c->opts.metadata_replicas_required,
+			       !(flags & BCH_FORCE_IF_DATA_DEGRADED)
+			       ? c->opts.data_replicas
+			       : c->opts.data_replicas_required);
+
+		return nr_rw - 1 <= required;
+	case BCH_MEMBER_STATE_FAILED:
+	case BCH_MEMBER_STATE_SPARE:
+		if (ca->mi.state != BCH_MEMBER_STATE_RW &&
+		    ca->mi.state != BCH_MEMBER_STATE_RO)
+			return true;
+
+		/* do we have enough devices to read from? */
+		s = __bch2_replicas_status(c, ca);
+
+		pr_info("replicas: j %u %u b %u %u d %u %u",
+			s.replicas[BCH_DATA_JOURNAL].nr_online,
+			s.replicas[BCH_DATA_JOURNAL].nr_offline,
+
+			s.replicas[BCH_DATA_BTREE].nr_online,
+			s.replicas[BCH_DATA_BTREE].nr_offline,
+
+			s.replicas[BCH_DATA_USER].nr_online,
+			s.replicas[BCH_DATA_USER].nr_offline);
+
+		return have_enough_devs(c, s, flags);
+	default:
+		BUG();
+	}
+}
 
-	if (ca->mi.has_data &&
-	    !(flags & BCH_FORCE_IF_DATA_DEGRADED))
-		return false;
-
-	if (ca->mi.has_data &&
-	    c->sb.data_replicas_have <= 1 &&
-	    !(flags & BCH_FORCE_IF_DATA_LOST))
-		return false;
+static bool bch2_fs_may_start(struct bch_fs *c, int flags)
+{
+	struct replicas_status s;
+	struct bch_sb_field_members *mi;
+	unsigned i;
 
-	if (ca->mi.has_metadata &&
-	    !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
-		return false;
+	if (!c->opts.degraded) {
+		mutex_lock(&c->sb_lock);
+		mi = bch2_sb_get_members(c->disk_sb);
+
+		for (i = 0; i < c->disk_sb->nr_devices; i++)
+			if (bch2_dev_exists(c->disk_sb, mi, i) &&
+			    !bch2_dev_is_online(c->devs[i]) &&
+			    (c->devs[i]->mi.state == BCH_MEMBER_STATE_RW ||
+			     c->devs[i]->mi.state == BCH_MEMBER_STATE_RO)) {
+				mutex_unlock(&c->sb_lock);
+				return false;
			}
+		mutex_unlock(&c->sb_lock);
+	}
 
-	if (ca->mi.has_metadata &&
-	    c->sb.meta_replicas_have <= 1 &&
-	    !(flags & BCH_FORCE_IF_METADATA_LOST))
-		return false;
+	s = bch2_replicas_status(c);
 
-	return true;
+	return have_enough_devs(c, s, flags);
 }
 
 static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
@@ -1343,8 +1381,7 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
	 * complete.
	 */
 	bch2_dev_allocator_stop(ca);
-
-	bch2_dev_group_remove(&c->journal.devs, ca);
+	bch2_dev_allocator_remove(c, ca);
 }
 
 static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
@@ -1353,6 +1390,9 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
 
 	BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW);
 
+	bch2_dev_allocator_add(c, ca);
+	bch2_recalc_capacity(c);
+
 	if (bch2_dev_allocator_start(ca))
 		return "error starting allocator thread";
 
@@ -1411,7 +1451,7 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
 int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
 {
 	struct bch_sb_field_members *mi;
-	unsigned dev_idx = ca->dev_idx;
+	unsigned dev_idx = ca->dev_idx, data;
 	int ret = -EINVAL;
 
 	mutex_lock(&c->state_lock);
@@ -1439,19 +1479,12 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
 		goto err;
 	}
 
-	if (ca->mi.has_data || ca->mi.has_metadata) {
-		bch_err(ca, "Remove failed, still has data");
+	data = bch2_dev_has_data(c, ca);
+	if (data) {
+		bch_err(ca, "Remove failed, still has data (%x)", data);
 		goto err;
 	}
 
-	/*
-	 * Ok, really doing the remove:
-	 * Drop device's prio pointer before removing it from superblock:
-	 */
-	spin_lock(&c->journal.lock);
-	c->journal.prio_buckets[dev_idx] = 0;
-	spin_unlock(&c->journal.lock);
-
 	bch2_journal_meta(&c->journal);
 
 	__bch2_dev_offline(ca);
@@ -1476,6 +1509,7 @@ err:
 	return ret;
 }
 
+/* Add new device to running filesystem: */
 int bch2_dev_add(struct bch_fs *c, const char *path)
 {
 	struct bcache_superblock sb;
@@ -1490,7 +1524,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 	if (err)
 		return -EINVAL;
 
-	err = bch2_validate_cache_super(&sb);
+	err = bch2_sb_validate(&sb);
 	if (err)
 		return -EINVAL;
 
@@ -1514,9 +1548,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 	mi = bch2_sb_get_members(c->disk_sb);
 
 	for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++)
-		if (dev_idx >= c->sb.nr_devices ||
-		    bch2_is_zero(mi->members[dev_idx].uuid.b,
-				 sizeof(uuid_le)))
+		if (!bch2_dev_exists(c->disk_sb, mi, dev_idx))
			goto have_slot;
 no_slot:
 	err = "no slots available in superblock";
@@ -1587,13 +1619,13 @@ err:
 	return ret ?: -EINVAL;
 }
 
+/* Hot add existing device to running filesystem: */
 int bch2_dev_online(struct bch_fs *c, const char *path)
 {
 	struct bcache_superblock sb = { 0 };
 	struct bch_dev *ca;
 	unsigned dev_idx;
 	const char *err;
-	int ret;
 
 	mutex_lock(&c->state_lock);
 
@@ -1616,12 +1648,6 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
 	mutex_unlock(&c->sb_lock);
 
 	ca = c->devs[dev_idx];
-	ret = bch2_prio_read(ca);
-	if (ret) {
-		err = "error reading priorities";
-		goto err;
-	}
-
 	if (ca->mi.state == BCH_MEMBER_STATE_RW) {
 		err = __bch2_dev_read_write(c, ca);
 		if (err)
@@ -1656,6 +1682,7 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags)
 
 int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
 {
+	unsigned data;
 	int ret;
 
 	mutex_lock(&c->state_lock);
@@ -1680,8 +1707,9 @@ int bch2_dev_evacuate(struct bch_fs *c, struct bch_dev *ca)
 		return ret;
 	}
 
-	if (ca->mi.has_data || ca->mi.has_metadata) {
-		bch_err(ca, "Migrate error: data still present");
+	data = bch2_dev_has_data(c, ca);
+	if (data) {
+		bch_err(ca, "Migrate error: data still present (%x)", data);
 		return -EINVAL;
 	}
 
@@ -1714,11 +1742,7 @@ const char *bch2_fs_open(char * const *devices, unsigned nr_devices,
 		if (err)
 			goto err;
 
-		err = "attempting to register backing device";
-		if (__SB_IS_BDEV(le64_to_cpu(sb[i].sb->version)))
-			goto err;
-
-		err = bch2_validate_cache_super(&sb[i]);
+		err = bch2_sb_validate(&sb[i]);
 		if (err)
 			goto err;
 	}
@@ -1790,7 +1814,7 @@ static const char *__bch2_fs_open_incremental(struct bcache_superblock *sb,
 	struct bch_fs *c;
 	bool allocated_fs = false;
 
-	err = bch2_validate_cache_super(sb);
+	err = bch2_sb_validate(sb);
 	if (err)
 		return err;
 
@@ -1855,11 +1879,7 @@ const char *bch2_fs_open_incremental(const char *path)
 	if (err)
 		return err;
 
-	if (!__SB_IS_BDEV(le64_to_cpu(sb.sb->version)))
-		err = __bch2_fs_open_incremental(&sb, opts);
-	else
-		err = "not a bcachefs superblock";
-
+	err = __bch2_fs_open_incremental(&sb, opts);
 	bch2_free_super(&sb);
 
 	return err;
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index edfa85b..1986fdd 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -337,8 +337,8 @@ SHOW(bch2_fs)
 	sysfs_pd_controller_show(tiering, &c->tiers[1].pd); /* XXX */
 
-	sysfs_printf(meta_replicas_have, "%u", c->sb.meta_replicas_have);
-	sysfs_printf(data_replicas_have, "%u", c->sb.data_replicas_have);
+	sysfs_printf(meta_replicas_have, "%u", bch2_replicas_online(c, true));
+	sysfs_printf(data_replicas_have, "%u", bch2_replicas_online(c, false));
 
 	/* Debugging: */
 
@@ -693,7 +693,6 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
 	return scnprintf(buf, PAGE_SIZE,
		"free_inc: %zu/%zu\n"
-		"free[RESERVE_PRIO]: %zu/%zu\n"
		"free[RESERVE_BTREE]: %zu/%zu\n"
		"free[RESERVE_MOVINGGC]: %zu/%zu\n"
		"free[RESERVE_NONE]: %zu/%zu\n"
@@ -705,7 +704,6 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
		"open buckets: %u/%u (reserved %u)\n"
		"open_buckets_wait: %s\n",
		fifo_used(&ca->free_inc), ca->free_inc.size,
-		fifo_used(&ca->free[RESERVE_PRIO]), ca->free[RESERVE_PRIO].size,
		fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size,
		fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
		fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size,
@@ -759,8 +757,11 @@ SHOW(bch2_dev)
 	sysfs_print(alloc_buckets, stats.buckets_alloc);
 	sysfs_print(available_buckets, dev_buckets_available(ca));
 	sysfs_print(free_buckets, dev_buckets_free(ca));
-	sysfs_print(has_data, ca->mi.has_data);
-	sysfs_print(has_metadata, ca->mi.has_metadata);
+	sysfs_print(has_data, bch2_dev_has_data(c, ca) &
+		    (1 << BCH_DATA_USER));
+	sysfs_print(has_metadata, bch2_dev_has_data(c, ca) &
+		    ((1 << BCH_DATA_JOURNAL)|
+		     (1 << BCH_DATA_BTREE)));
 
 	sysfs_pd_controller_show(copy_gc, &ca->moving_gc_pd);
 
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 5400dec..906e7a6 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -533,3 +533,47 @@ void eytzinger0_sort(void *base, size_t n, size_t size,
 		}
 	}
 }
+
+void sort_cmp_size(void *base, size_t num, size_t size,
+	  int (*cmp_func)(const void *, const void *, size_t),
+	  void (*swap_func)(void *, void *, size_t size))
+{
+	/* pre-scale counters for performance */
+	int i = (num/2 - 1) * size, n = num * size, c, r;
+
+	if (!swap_func) {
+		if (size == 4 && alignment_ok(base, 4))
+			swap_func = u32_swap;
+		else if (size == 8 && alignment_ok(base, 8))
+			swap_func = u64_swap;
+		else
+			swap_func = generic_swap;
+	}
+
+	/* heapify */
+	for ( ; i >= 0; i -= size) {
+		for (r = i; r * 2 + size < n; r = c) {
+			c = r * 2 + size;
+			if (c < n - size &&
+			    cmp_func(base + c, base + c + size, size) < 0)
+				c += size;
+			if (cmp_func(base + r, base + c, size) >= 0)
+				break;
+			swap_func(base + r, base + c, size);
+		}
+	}
+
+	/* sort */
+	for (i = n - size; i > 0; i -= size) {
+		swap_func(base, base + i, size);
+		for (r = 0; r * 2 + size < i; r = c) {
+			c = r * 2 + size;
+			if (c < i - size &&
+			    cmp_func(base + c, base + c + size, size) < 0)
+				c += size;
+			if (cmp_func(base + r, base + c, size) >= 0)
+				break;
+			swap_func(base + r, base + c, size);
+		}
+	}
+}
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index 927aa3a..68d9a86 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -763,4 +763,8 @@ static inline struct bio_vec next_contig_bvec(struct bio *bio,
 
 size_t bch_scnmemcpy(char *, size_t, const char *, size_t);
 
+void sort_cmp_size(void *base, size_t num, size_t size,
+	  int (*cmp_func)(const void *, const void *, size_t),
+	  void (*swap_func)(void *, void *, size_t));
+
 #endif /* _BCACHE_UTIL_H */
-- 
2.39.5
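
Reviewer note (not part of the patch): the snippet below is a minimal userspace sketch of how the new sort_cmp_size() helper can be exercised outside the tree. Only the prototype is taken from the libbcachefs/util.h hunk above; struct gen_entry, gen_entry_cmp() and main() are invented purely for illustration, and the program assumes util.c (or a copy of the function) is linked in.

#include <stdio.h>
#include <stddef.h>

/* Prototype as added to libbcachefs/util.h by this patch: */
void sort_cmp_size(void *base, size_t num, size_t size,
		   int (*cmp_func)(const void *, const void *, size_t),
		   void (*swap_func)(void *, void *, size_t));

/* Hypothetical record type, for illustration only: */
struct gen_entry {
	unsigned long long	bucket;
	unsigned char		gen;
};

/* Unlike qsort()'s comparator, this one also receives the element size. */
static int gen_entry_cmp(const void *_l, const void *_r, size_t size)
{
	const struct gen_entry *l = _l, *r = _r;

	(void) size;	/* fixed-size records; size is unused here */
	return (l->bucket > r->bucket) - (l->bucket < r->bucket);
}

int main(void)
{
	struct gen_entry e[] = {
		{ .bucket = 42, .gen = 3 },
		{ .bucket =  7, .gen = 1 },
		{ .bucket = 19, .gen = 9 },
	};
	size_t i;

	/*
	 * Passing NULL for swap_func lets sort_cmp_size() pick a swap
	 * routine based on the element size, as the kernel's sort() does.
	 */
	sort_cmp_size(e, sizeof(e) / sizeof(e[0]), sizeof(e[0]),
		      gen_entry_cmp, NULL);

	for (i = 0; i < sizeof(e) / sizeof(e[0]); i++)
		printf("bucket %llu gen %u\n", e[i].bucket, (unsigned) e[i].gen);
	return 0;
}

The extra size argument to the comparator is what distinguishes this from the stock heapsort; presumably it exists so callers sorting records whose meaningful width is only known at runtime (such as the new superblock replicas entries touched elsewhere in this patch) can share a single comparator.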