From bca8b084ad754afc54e628d9db7721b90d9480b7 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Fri, 23 Nov 2018 03:04:34 -0500
Subject: [PATCH] Update bcachefs sources to da7fefde29 bcachefs: shim for
 userspace raid library

---
 .bcachefs_revision                  |    2 +-
 include/linux/blkdev.h              |    2 +
 libbcachefs/alloc_background.c      |  118 ++-
 libbcachefs/alloc_background.h      |    2 +-
 libbcachefs/alloc_foreground.c      |  353 ++++++--
 libbcachefs/alloc_foreground.h      |   31 +-
 libbcachefs/alloc_types.h           |   11 +-
 libbcachefs/bcachefs.h              |   48 +-
 libbcachefs/bcachefs_format.h       |   69 +-
 libbcachefs/bkey.h                  |    2 +
 libbcachefs/bkey_methods.c          |    2 +
 libbcachefs/btree_gc.c              |  442 ++++++---
 libbcachefs/btree_gc.h              |   19 +-
 libbcachefs/btree_iter.c            |    5 +-
 libbcachefs/btree_update_interior.c |   39 +-
 libbcachefs/btree_update_leaf.c     |   43 +-
 libbcachefs/buckets.c               |  462 ++++++----
 libbcachefs/buckets.h               |   30 +-
 libbcachefs/buckets_types.h         |   11 +-
 libbcachefs/compress.c              |   12 +-
 libbcachefs/disk_groups.h           |   13 +
 libbcachefs/ec.c                    | 1283 +++++++++++++++++++++++++++
 libbcachefs/ec.h                    |  108 +++
 libbcachefs/ec_types.h              |   30 +
 libbcachefs/extents.c               |  196 ++--
 libbcachefs/extents.h               |   21 +-
 libbcachefs/extents_types.h         |    4 +
 libbcachefs/fs-io.c                 |  145 +--
 libbcachefs/io.c                    |  137 ++-
 libbcachefs/journal.c               |   98 +-
 libbcachefs/journal.h               |   32 +-
 libbcachefs/journal_io.c            |  349 +++-----
 libbcachefs/journal_reclaim.c       |    3 +-
 libbcachefs/journal_types.h         |    1 -
 libbcachefs/opts.c                  |   26 +
 libbcachefs/opts.h                  |    7 +-
 libbcachefs/recovery.c              |    6 +
 libbcachefs/replicas.c              |   34 +-
 libbcachefs/super-io.c              |    1 +
 libbcachefs/super.c                 |   27 +-
 libbcachefs/sysfs.c                 |   49 +-
 41 files changed, 3275 insertions(+), 998 deletions(-)
 create mode 100644 libbcachefs/ec.c
 create mode 100644 libbcachefs/ec.h
 create mode 100644 libbcachefs/ec_types.h

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 48cf256..abb9e48 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-a9f14c773fb122a4b283fc7b79d9f98703a18890
+da7fefde294e3c56359ee498a62a77182a4733cd
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1d5581d..e4982f9 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -6,6 +6,8 @@
 #include
 #include
 
+#define BIO_MAX_PAGES 256
+
 typedef unsigned fmode_t;
 
 struct bio;
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 41ea73a..8992916 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -9,6 +9,7 @@
 #include "buckets.h"
 #include "clock.h"
 #include "debug.h"
+#include "ec.h"
 #include "error.h"
 #include "journal_io.h"
 
@@ -82,7 +83,8 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
 	case BCH_ALLOC: {
 		struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
 
-		if (bch_alloc_val_u64s(a.v) != bkey_val_u64s(a.k))
+		/* allow for unknown fields */
+		if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v))
 			return "incorrect value size";
 		break;
 	}
@@ -235,6 +237,7 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
 	__BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key;
 	struct bucket *g;
 	struct bkey_i_alloc *a;
+	int ret;
 	u8 *d;
 
 	percpu_down_read_preempt_disable(&c->usage_lock);
@@ -258,32 +261,50 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
 
 	bch2_btree_iter_set_pos(iter, a->k.p);
 
-	return bch2_btree_insert_at(c, NULL, journal_seq,
-				    BTREE_INSERT_NOFAIL|
-				    BTREE_INSERT_USE_RESERVE|
-				    BTREE_INSERT_USE_ALLOC_RESERVE|
-				    flags,
-				    BTREE_INSERT_ENTRY(iter, &a->k_i));
+	ret = bch2_btree_insert_at(c, NULL, journal_seq,
+
BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_USE_ALLOC_RESERVE| + flags, + BTREE_INSERT_ENTRY(iter, &a->k_i)); + + if (!ret && ca->buckets_written) + set_bit(b, ca->buckets_written); + + return ret; } -int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos) +int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) { struct bch_dev *ca; struct btree_iter iter; int ret; - if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode]) + if (k->k.p.inode >= c->sb.nr_devices || + !c->devs[k->k.p.inode]) return 0; - ca = bch_dev_bkey_exists(c, pos.inode); + ca = bch_dev_bkey_exists(c, k->k.p.inode); - if (pos.offset >= ca->mi.nbuckets) + if (k->k.p.offset >= ca->mi.nbuckets) return 0; - bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, k->k.p, + BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(&iter); + if (ret) + goto err; + + /* check buckets_written with btree node locked: */ - ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, NULL, 0); + ret = test_bit(k->k.p.offset, ca->buckets_written) + ? 0 + : bch2_btree_insert_at(c, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_REPLAY, + BTREE_INSERT_ENTRY(&iter, k)); +err: bch2_btree_iter_unlock(&iter); return ret; } @@ -909,12 +930,6 @@ static int bch2_allocator_thread(void *arg) pr_debug("free_inc now empty"); do { - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) { - up_read(&c->gc_lock); - bch_err(ca, "gc failure"); - goto stop; - } - /* * Find some buckets that we can invalidate, either * they're completely unused, or only contain clean data @@ -1112,6 +1127,24 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) } mutex_unlock(&c->btree_reserve_cache_lock); + while (1) { + struct open_bucket *ob; + + spin_lock(&c->freelist_lock); + if (!ca->open_buckets_partial_nr) { + spin_unlock(&c->freelist_lock); + break; + } + ob = c->open_buckets + + ca->open_buckets_partial[--ca->open_buckets_partial_nr]; + ob->on_partial_list = false; + spin_unlock(&c->freelist_lock); + + bch2_open_bucket_put(c, ob); + } + + bch2_ec_stop_dev(c, ca); + /* * Wake up threads that were blocked on allocation, so they can notice * the device can no longer be removed and the capacity has changed: @@ -1254,9 +1287,6 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) bool invalidating_data = false; int ret = 0; - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) - return -1; - if (test_alloc_startup(c)) { invalidating_data = true; goto not_enough; @@ -1264,51 +1294,47 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) /* Scan for buckets that are already invalidated: */ for_each_rw_member(ca, c, dev_iter) { - struct btree_iter iter; + struct bucket_array *buckets; struct bucket_mark m; - struct bkey_s_c k; - for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), 0, k) { - if (k.k->type != BCH_ALLOC) - continue; + down_read(&ca->bucket_lock); + percpu_down_read_preempt_disable(&c->usage_lock); - bu = k.k->p.offset; - m = READ_ONCE(bucket(ca, bu)->mark); + buckets = bucket_array(ca); + + for (bu = buckets->first_bucket; + bu < buckets->nbuckets; bu++) { + m = READ_ONCE(buckets->b[bu].mark); - if (!is_available_bucket(m) || m.cached_sectors) + if (!m.gen_valid || + !is_available_bucket(m) || + m.cached_sectors) continue; - percpu_down_read_preempt_disable(&c->usage_lock); bch2_mark_alloc_bucket(c, ca, bu, true, - gc_pos_alloc(c, NULL), - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); - 
percpu_up_read_preempt_enable(&c->usage_lock); + gc_pos_alloc(c, NULL), 0); fifo_push(&ca->free_inc, bu); - if (fifo_full(&ca->free_inc)) + discard_invalidated_buckets(c, ca); + + if (fifo_full(&ca->free[RESERVE_BTREE])) break; } - bch2_btree_iter_unlock(&iter); + percpu_up_read_preempt_enable(&c->usage_lock); + up_read(&ca->bucket_lock); } /* did we find enough buckets? */ for_each_rw_member(ca, c, dev_iter) - if (fifo_used(&ca->free_inc) < ca->free[RESERVE_BTREE].size) { + if (!fifo_full(&ca->free[RESERVE_BTREE])) { percpu_ref_put(&ca->io_ref); goto not_enough; } return 0; not_enough: - pr_debug("did not find enough empty buckets; issuing discards"); - - /* clear out free_inc, we'll be using it again below: */ - for_each_rw_member(ca, c, dev_iter) - discard_invalidated_buckets(c, ca); - - pr_debug("scanning for reclaimable buckets"); + pr_debug("not enough empty buckets; scanning for reclaimable buckets"); for_each_rw_member(ca, c, dev_iter) { find_reclaimable_buckets(c, ca); diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h index 2de9357..6911fa6 100644 --- a/libbcachefs/alloc_background.h +++ b/libbcachefs/alloc_background.h @@ -16,7 +16,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); } int bch2_alloc_read(struct bch_fs *, struct list_head *); -int bch2_alloc_replay_key(struct bch_fs *, struct bpos); +int bch2_alloc_replay_key(struct bch_fs *, struct bkey_i *); static inline void bch2_wake_allocator(struct bch_dev *ca) { diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index 0685996..91ab336 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -61,6 +61,7 @@ #include "clock.h" #include "debug.h" #include "disk_groups.h" +#include "ec.h" #include "io.h" #include @@ -94,6 +95,11 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) { struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + if (ob->ec) { + bch2_ec_bucket_written(c, ob); + return; + } + percpu_down_read_preempt_disable(&c->usage_lock); spin_lock(&ob->lock); @@ -113,6 +119,19 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) closure_wake_up(&c->open_buckets_wait); } +void bch2_open_bucket_write_error(struct bch_fs *c, + struct open_buckets *obs, + unsigned dev) +{ + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, obs, ob, i) + if (ob->ptr.dev == dev && + ob->ec) + bch2_ec_bucket_cancel(c, ob); +} + static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) { struct open_bucket *ob; @@ -128,15 +147,17 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) } static void open_bucket_free_unused(struct bch_fs *c, - struct write_point *wp, - struct open_bucket *ob) + struct open_bucket *ob, + bool may_realloc) { struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); BUG_ON(ca->open_buckets_partial_nr >= ARRAY_SIZE(ca->open_buckets_partial)); - if (wp->type == BCH_DATA_USER) { + if (ca->open_buckets_partial_nr < + ARRAY_SIZE(ca->open_buckets_partial) && + may_realloc) { spin_lock(&c->freelist_lock); ob->on_partial_list = true; ca->open_buckets_partial[ca->open_buckets_partial_nr++] = @@ -284,18 +305,18 @@ out: return ob; } -static int __dev_alloc_cmp(struct write_point *wp, - unsigned l, unsigned r) +static int __dev_stripe_cmp(struct dev_stripe_state *stripe, + unsigned l, unsigned r) { - return ((wp->next_alloc[l] > wp->next_alloc[r]) - - (wp->next_alloc[l] < wp->next_alloc[r])); + return ((stripe->next_alloc[l] > 
stripe->next_alloc[r]) - + (stripe->next_alloc[l] < stripe->next_alloc[r])); } -#define dev_alloc_cmp(l, r) __dev_alloc_cmp(wp, l, r) +#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) -struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c, - struct write_point *wp, - struct bch_devs_mask *devs) +struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, + struct dev_stripe_state *stripe, + struct bch_devs_mask *devs) { struct dev_alloc_list ret = { .nr = 0 }; struct bch_dev *ca; @@ -304,14 +325,14 @@ struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c, for_each_member_device_rcu(ca, c, i, devs) ret.devs[ret.nr++] = i; - bubble_sort(ret.devs, ret.nr, dev_alloc_cmp); + bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); return ret; } -void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca, - struct write_point *wp) +void bch2_dev_stripe_increment(struct bch_fs *c, struct bch_dev *ca, + struct dev_stripe_state *stripe) { - u64 *v = wp->next_alloc + ca->dev_idx; + u64 *v = stripe->next_alloc + ca->dev_idx; u64 free_space = dev_buckets_free(c, ca); u64 free_space_inv = free_space ? div64_u64(1ULL << 48, free_space) @@ -323,26 +344,30 @@ void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca, else *v = U64_MAX; - for (v = wp->next_alloc; - v < wp->next_alloc + ARRAY_SIZE(wp->next_alloc); v++) + for (v = stripe->next_alloc; + v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) *v = *v < scale ? 0 : *v - scale; } +#define BUCKET_MAY_ALLOC_PARTIAL (1 << 0) +#define BUCKET_ALLOC_USE_DURABILITY (1 << 1) + static int bch2_bucket_alloc_set(struct bch_fs *c, struct open_buckets *ptrs, - struct write_point *wp, + struct dev_stripe_state *stripe, struct bch_devs_mask *devs_may_alloc, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, enum alloc_reserve reserve, + unsigned flags, struct closure *cl) { struct dev_alloc_list devs_sorted = - bch2_wp_alloc_list(c, wp, devs_may_alloc); + bch2_dev_alloc_list(c, stripe, devs_may_alloc); struct bch_dev *ca; bool alloc_failure = false; - unsigned i; + unsigned i, durability; BUG_ON(*nr_effective >= nr_replicas); @@ -353,13 +378,11 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, if (!ca) continue; - if (!ca->mi.durability && - (*have_cache || - wp->type != BCH_DATA_USER)) + if (!ca->mi.durability && *have_cache) continue; ob = bch2_bucket_alloc(c, ca, reserve, - wp->type == BCH_DATA_USER, cl); + flags & BUCKET_MAY_ALLOC_PARTIAL, cl); if (IS_ERR(ob)) { enum bucket_alloc_ret ret = -PTR_ERR(ob); @@ -374,13 +397,16 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, continue; } + durability = (flags & BUCKET_ALLOC_USE_DURABILITY) + ? ca->mi.durability : 1; + __clear_bit(ca->dev_idx, devs_may_alloc->d); - *nr_effective += ca->mi.durability; - *have_cache |= !ca->mi.durability; + *nr_effective += durability; + *have_cache |= !durability; ob_push(c, ptrs, ob); - bch2_wp_rescale(c, ca, wp); + bch2_dev_stripe_increment(c, ca, stripe); if (*nr_effective >= nr_replicas) return 0; @@ -389,15 +415,150 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, return alloc_failure ? 
-ENOSPC : -EROFS; } +/* Allocate from stripes: */ + +/* + * XXX: use a higher watermark for allocating open buckets here: + */ +static int ec_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) +{ + struct bch_devs_mask devs; + struct open_bucket *ob; + unsigned i, nr_have = 0, nr_data = + min_t(unsigned, h->nr_active_devs, + EC_STRIPE_MAX) - h->redundancy; + bool have_cache = true; + int ret = 0; + + BUG_ON(h->blocks.nr > nr_data); + BUG_ON(h->parity.nr > h->redundancy); + + devs = h->devs; + + open_bucket_for_each(c, &h->parity, ob, i) + __clear_bit(ob->ptr.dev, devs.d); + open_bucket_for_each(c, &h->blocks, ob, i) + __clear_bit(ob->ptr.dev, devs.d); + + percpu_down_read_preempt_disable(&c->usage_lock); + rcu_read_lock(); + + if (h->parity.nr < h->redundancy) { + nr_have = h->parity.nr; + + ret = bch2_bucket_alloc_set(c, &h->parity, + &h->parity_stripe, + &devs, + h->redundancy, + &nr_have, + &have_cache, + RESERVE_NONE, + 0, + NULL); + if (ret) + goto err; + } + + if (h->blocks.nr < nr_data) { + nr_have = h->blocks.nr; + + ret = bch2_bucket_alloc_set(c, &h->blocks, + &h->block_stripe, + &devs, + nr_data, + &nr_have, + &have_cache, + RESERVE_NONE, + 0, + NULL); + if (ret) + goto err; + } + + rcu_read_unlock(); + percpu_up_read_preempt_enable(&c->usage_lock); + + return bch2_ec_stripe_new_alloc(c, h); +err: + rcu_read_unlock(); + percpu_up_read_preempt_enable(&c->usage_lock); + return -1; +} + +/* + * if we can't allocate a new stripe because there are already too many + * partially filled stripes, force allocating from an existing stripe even when + * it's to a device we don't want: + */ + +static void bucket_alloc_from_stripe(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + u16 target, + unsigned erasure_code, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache) +{ + struct dev_alloc_list devs_sorted; + struct ec_stripe_head *h; + struct open_bucket *ob; + struct bch_dev *ca; + unsigned i, ec_idx; + + if (!erasure_code) + return; + + if (nr_replicas < 2) + return; + + if (ec_open_bucket(c, ptrs)) + return; + + h = bch2_ec_stripe_head_get(c, target, erasure_code, nr_replicas - 1); + if (!h) + return; + + if (!h->s && ec_stripe_alloc(c, h)) + goto out_put_head; + + rcu_read_lock(); + devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); + rcu_read_unlock(); + + for (i = 0; i < devs_sorted.nr; i++) + open_bucket_for_each(c, &h->s->blocks, ob, ec_idx) + if (ob->ptr.dev == devs_sorted.devs[i] && + !test_and_set_bit(ec_idx, h->s->blocks_allocated)) + goto got_bucket; + goto out_put_head; +got_bucket: + ca = bch_dev_bkey_exists(c, ob->ptr.dev); + + ob->ec_idx = ec_idx; + ob->ec = h->s; + + __clear_bit(ob->ptr.dev, devs_may_alloc->d); + *nr_effective += ca->mi.durability; + *have_cache |= !ca->mi.durability; + + ob_push(c, ptrs, ob); + atomic_inc(&h->s->pin); +out_put_head: + bch2_ec_stripe_head_put(h); +} + /* Sector allocator */ -static int get_buckets_from_writepoint(struct bch_fs *c, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache) +static void get_buckets_from_writepoint(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + bool need_ec) { struct open_buckets ptrs_skip = { .nr = 0 }; struct open_bucket *ob; @@ -409,7 +570,8 @@ static int 
get_buckets_from_writepoint(struct bch_fs *c, if (*nr_effective < nr_replicas && test_bit(ob->ptr.dev, devs_may_alloc->d) && (ca->mi.durability || - (wp->type == BCH_DATA_USER && !*have_cache))) { + (wp->type == BCH_DATA_USER && !*have_cache)) && + (ob->ec || !need_ec)) { __clear_bit(ob->ptr.dev, devs_may_alloc->d); *nr_effective += ca->mi.durability; *have_cache |= !ca->mi.durability; @@ -420,8 +582,6 @@ static int get_buckets_from_writepoint(struct bch_fs *c, } } wp->ptrs = ptrs_skip; - - return *nr_effective < nr_replicas ? -ENOSPC : 0; } static int open_bucket_add_buckets(struct bch_fs *c, @@ -429,22 +589,25 @@ static int open_bucket_add_buckets(struct bch_fs *c, struct write_point *wp, struct bch_devs_list *devs_have, u16 target, + unsigned erasure_code, unsigned nr_replicas, unsigned *nr_effective, bool *have_cache, enum alloc_reserve reserve, - struct closure *cl) + struct closure *_cl) { struct bch_devs_mask devs; - const struct bch_devs_mask *t; struct open_bucket *ob; - unsigned i; + struct closure *cl = NULL; + unsigned i, flags = BUCKET_ALLOC_USE_DURABILITY; int ret; - percpu_down_read_preempt_disable(&c->usage_lock); - rcu_read_lock(); + if (wp->type == BCH_DATA_USER) + flags |= BUCKET_MAY_ALLOC_PARTIAL; - devs = c->rw_devs[wp->type]; + rcu_read_lock(); + devs = target_rw_devs(c, wp->type, target); + rcu_read_unlock(); /* Don't allocate from devices we already have pointers to: */ for (i = 0; i < devs_have->nr; i++) @@ -453,50 +616,83 @@ static int open_bucket_add_buckets(struct bch_fs *c, open_bucket_for_each(c, ptrs, ob, i) __clear_bit(ob->ptr.dev, devs.d); - t = bch2_target_to_mask(c, target); - if (t) - bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); + if (erasure_code) { + get_buckets_from_writepoint(c, ptrs, wp, &devs, + nr_replicas, nr_effective, + have_cache, true); + if (*nr_effective >= nr_replicas) + return 0; - ret = get_buckets_from_writepoint(c, ptrs, wp, &devs, - nr_replicas, nr_effective, have_cache); - if (!ret) - goto out; + bucket_alloc_from_stripe(c, ptrs, wp, &devs, + target, erasure_code, + nr_replicas, nr_effective, + have_cache); + if (*nr_effective >= nr_replicas) + return 0; + } + + get_buckets_from_writepoint(c, ptrs, wp, &devs, + nr_replicas, nr_effective, + have_cache, false); + if (*nr_effective >= nr_replicas) + return 0; + + percpu_down_read_preempt_disable(&c->usage_lock); + rcu_read_lock(); +retry_blocking: /* * Try nonblocking first, so that if one device is full we'll try from * other devices: */ - ret = bch2_bucket_alloc_set(c, ptrs, wp, &devs, + ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs, nr_replicas, nr_effective, have_cache, - reserve, NULL); - if (!ret || ret == -EROFS || !cl) - goto out; + reserve, flags, cl); + if (ret && ret != -EROFS && !cl && _cl) { + cl = _cl; + goto retry_blocking; + } - ret = bch2_bucket_alloc_set(c, ptrs, wp, &devs, - nr_replicas, nr_effective, have_cache, - reserve, cl); -out: rcu_read_unlock(); percpu_up_read_preempt_enable(&c->usage_lock); return ret; } -void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, - struct write_point *wp) +void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, + struct open_buckets *obs, + enum bch_data_type data_type) { struct open_buckets ptrs = { .nr = 0 }; - struct open_bucket *ob; - unsigned i; + struct open_bucket *ob, *ob2; + unsigned i, j; - mutex_lock(&wp->lock); - open_bucket_for_each(c, &wp->ptrs, ob, i) - if (!ca || ob->ptr.dev == ca->dev_idx) - open_bucket_free_unused(c, wp, ob); + open_bucket_for_each(c, obs, ob, i) { + 
bool drop = !ca || ob->ptr.dev == ca->dev_idx; + + if (!drop && ob->ec) { + mutex_lock(&ob->ec->lock); + open_bucket_for_each(c, &ob->ec->blocks, ob2, j) + drop |= ob2->ptr.dev == ca->dev_idx; + open_bucket_for_each(c, &ob->ec->parity, ob2, j) + drop |= ob2->ptr.dev == ca->dev_idx; + mutex_unlock(&ob->ec->lock); + } + + if (drop) + bch2_open_bucket_put(c, ob); else ob_push(c, &ptrs, ob); + } - wp->ptrs = ptrs; + *obs = ptrs; +} + +void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, + struct write_point *wp) +{ + mutex_lock(&wp->lock); + bch2_open_buckets_stop_dev(c, ca, &wp->ptrs, wp->type); mutex_unlock(&wp->lock); } @@ -629,6 +825,7 @@ out: */ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, unsigned target, + unsigned erasure_code, struct write_point_specifier write_point, struct bch_devs_list *devs_have, unsigned nr_replicas, @@ -648,26 +845,37 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, BUG_ON(!nr_replicas || !nr_replicas_required); retry: write_points_nr = c->write_points_nr; + wp = writepoint_find(c, write_point.v); + /* metadata may not allocate on cache devices: */ + if (wp->type != BCH_DATA_USER) + have_cache = true; + if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { - ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, target, + ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, + target, erasure_code, nr_replicas, &nr_effective, &have_cache, reserve, cl); } else { - ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, target, + ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, + target, erasure_code, nr_replicas, &nr_effective, &have_cache, reserve, NULL); if (!ret) goto alloc_done; - ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, 0, + ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, + 0, erasure_code, nr_replicas, &nr_effective, &have_cache, reserve, cl); } alloc_done: BUG_ON(!ret && nr_effective < nr_replicas); + if (erasure_code && !ec_open_bucket(c, &ptrs)) + pr_debug("failed to get ec bucket: ret %u", ret); + if (ret == -EROFS && nr_effective >= nr_replicas_required) ret = 0; @@ -677,7 +885,7 @@ alloc_done: /* Free buckets we didn't use: */ open_bucket_for_each(c, &wp->ptrs, ob, i) - open_bucket_free_unused(c, wp, ob); + open_bucket_free_unused(c, ob, wp->type == BCH_DATA_USER); wp->ptrs = ptrs; @@ -696,7 +904,8 @@ err: if (ptrs.nr < ARRAY_SIZE(ptrs.v)) ob_push(c, &ptrs, ob); else - open_bucket_free_unused(c, wp, ob); + open_bucket_free_unused(c, ob, + wp->type == BCH_DATA_USER); wp->ptrs = ptrs; mutex_unlock(&wp->lock); diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h index 729afc9..a332e9d 100644 --- a/libbcachefs/alloc_foreground.h +++ b/libbcachefs/alloc_foreground.h @@ -16,11 +16,11 @@ struct dev_alloc_list { u8 devs[BCH_SB_MEMBERS_MAX]; }; -struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *, - struct write_point *, - struct bch_devs_mask *); -void bch2_wp_rescale(struct bch_fs *, struct bch_dev *, - struct write_point *); +struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, + struct dev_stripe_state *, + struct bch_devs_mask *); +void bch2_dev_stripe_increment(struct bch_fs *, struct bch_dev *, + struct dev_stripe_state *); long bch2_bucket_alloc_new_fs(struct bch_dev *); @@ -42,6 +42,22 @@ static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \ (_i)++) +static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, + struct open_buckets *obs) +{ + struct open_bucket *ob; 
+ unsigned i; + + open_bucket_for_each(c, obs, ob, i) + if (ob->ec) + return ob; + + return NULL; +} + +void bch2_open_bucket_write_error(struct bch_fs *, + struct open_buckets *, unsigned); + void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) @@ -75,7 +91,7 @@ static inline void bch2_open_bucket_get(struct bch_fs *c, } struct write_point *bch2_alloc_sectors_start(struct bch_fs *, - unsigned, + unsigned, unsigned, struct write_point_specifier, struct bch_devs_list *, unsigned, unsigned, @@ -87,6 +103,9 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, struct bkey_i_extent *, unsigned); void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); +void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, + struct open_buckets *, enum bch_data_type); + void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *, struct write_point *); diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h index 110663f..6f17f09 100644 --- a/libbcachefs/alloc_types.h +++ b/libbcachefs/alloc_types.h @@ -7,6 +7,8 @@ #include "clock_types.h" #include "fifo.h" +struct ec_bucket_buf; + /* There's two of these clocks, one for reads and one for writes: */ struct bucket_clock { /* @@ -55,8 +57,10 @@ struct open_bucket { u8 freelist; bool valid; bool on_partial_list; + u8 ec_idx; unsigned sectors_free; struct bch_extent_ptr ptr; + struct ec_stripe_new *ec; }; #define OPEN_BUCKET_LIST_MAX 15 @@ -66,18 +70,23 @@ struct open_buckets { u8 v[OPEN_BUCKET_LIST_MAX]; }; +struct dev_stripe_state { + u64 next_alloc[BCH_SB_MEMBERS_MAX]; +}; + struct write_point { struct hlist_node node; struct mutex lock; u64 last_used; unsigned long write_point; enum bch_data_type type; + bool is_ec; /* calculated based on how many pointers we're actually going to use: */ unsigned sectors_free; struct open_buckets ptrs; - u64 next_alloc[BCH_SB_MEMBERS_MAX]; + struct dev_stripe_state stripe; }; struct write_point_specifier { diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index e23f45e..05891a0 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -201,7 +201,7 @@ #include -#define bch2_fs_init_fault(name) \ +#define bch2_fs_init_fault(name) \ dynamic_fault("bcachefs:bch_fs_init:" name) #define bch2_meta_read_fault(name) \ dynamic_fault("bcachefs:meta:read:" name) @@ -270,7 +270,10 @@ do { \ BCH_DEBUG_PARAM(test_alloc_startup, \ "Force allocator startup to use the slowpath where it" \ "can't find enough free buckets without invalidating" \ - "cached data") + "cached data") \ + BCH_DEBUG_PARAM(force_reconstruct_read, \ + "Force reads to use the reconstruct path, when reading" \ + "from erasure coded extents") #define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() @@ -308,6 +311,7 @@ enum bch_time_stats { #include "btree_types.h" #include "buckets_types.h" #include "clock_types.h" +#include "ec_types.h" #include "journal_types.h" #include "keylist_types.h" #include "quota_types.h" @@ -330,13 +334,16 @@ enum gc_phase { GC_PHASE_START, GC_PHASE_SB, -#define DEF_BTREE_ID(kwd, val, name) GC_PHASE_BTREE_##kwd, - DEFINE_BCH_BTREE_IDS() -#undef DEF_BTREE_ID + GC_PHASE_BTREE_EC, + GC_PHASE_BTREE_EXTENTS, + GC_PHASE_BTREE_INODES, + GC_PHASE_BTREE_DIRENTS, + GC_PHASE_BTREE_XATTRS, + GC_PHASE_BTREE_ALLOC, + GC_PHASE_BTREE_QUOTAS, GC_PHASE_PENDING_DELETE, GC_PHASE_ALLOC, - GC_PHASE_DONE }; struct gc_pos { @@ -381,14 +388,14 @@ struct bch_dev { * 
gc_lock, for device resize - holding any is sufficient for access: * Or rcu_read_lock(), but only for ptr_stale(): */ - struct bucket_array __rcu *buckets; + struct bucket_array __rcu *buckets[2]; unsigned long *buckets_dirty; + unsigned long *buckets_written; /* most out of date gen in the btree */ u8 *oldest_gens; struct rw_semaphore bucket_lock; - struct bch_dev_usage __percpu *usage_percpu; - struct bch_dev_usage usage_cached; + struct bch_dev_usage __percpu *usage[2]; /* Allocator: */ struct task_struct __rcu *alloc_thread; @@ -466,7 +473,6 @@ enum { /* errors: */ BCH_FS_ERROR, - BCH_FS_GC_FAILURE, /* misc: */ BCH_FS_BDEV_MOUNTED, @@ -602,8 +608,8 @@ struct bch_fs { atomic64_t sectors_available; - struct bch_fs_usage __percpu *usage_percpu; - struct bch_fs_usage usage_cached; + struct bch_fs_usage __percpu *usage[2]; + struct percpu_rw_semaphore usage_lock; struct closure_waitlist freelist_wait; @@ -644,9 +650,6 @@ struct bch_fs { * * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.) * - * gc_cur_phase == GC_PHASE_DONE indicates that gc is finished/not - * currently running, and gc marks are currently valid - * * Protected by gc_pos_lock. Only written to by GC thread, so GC thread * can read without a lock. */ @@ -681,6 +684,21 @@ struct bch_fs { /* REBALANCE */ struct bch_fs_rebalance rebalance; + /* ERASURE CODING */ + struct list_head ec_new_stripe_list; + struct mutex ec_new_stripe_lock; + + GENRADIX(struct ec_stripe) ec_stripes; + struct mutex ec_stripes_lock; + + ec_stripes_heap ec_stripes_heap; + spinlock_t ec_stripes_heap_lock; + + struct bio_set ec_bioset; + + struct work_struct ec_stripe_delete_work; + struct llist_head ec_stripe_delete_list; + /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; struct bio_set dio_write_bioset; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 56fef9e..c462ab2 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -233,6 +233,9 @@ struct bkey_packed { } __attribute__((packed, aligned(8))); #define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) +#define BKEY_U64s_MAX U8_MAX +#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s) + #define KEY_PACKED_BITS_START 24 #define KEY_FORMAT_LOCAL_BTREE 0 @@ -460,8 +463,9 @@ enum bch_compression_type { x(ptr, 0) \ x(crc32, 1) \ x(crc64, 2) \ - x(crc128, 3) -#define BCH_EXTENT_ENTRY_MAX 4 + x(crc128, 3) \ + x(stripe_ptr, 4) +#define BCH_EXTENT_ENTRY_MAX 5 enum bch_extent_entry_type { #define x(f, n) BCH_EXTENT_ENTRY_##f = n, @@ -552,7 +556,7 @@ struct bch_extent_ptr { #if defined(__LITTLE_ENDIAN_BITFIELD) __u64 type:1, cached:1, - erasure_coded:1, + unused:1, reservation:1, offset:44, /* 8 petabytes */ dev:8, @@ -562,23 +566,35 @@ struct bch_extent_ptr { dev:8, offset:44, reservation:1, - erasure_coded:1, + unused:1, cached:1, type:1; #endif } __attribute__((packed, aligned(8))); -struct bch_extent_reservation { +struct bch_extent_stripe_ptr { #if defined(__LITTLE_ENDIAN_BITFIELD) __u64 type:5, - unused:23, + block:8, + idx:51; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 idx:51, + block:8, + type:5; +#endif +}; + +struct bch_extent_reservation { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:6, + unused:22, replicas:4, generation:32; #elif defined (__BIG_ENDIAN_BITFIELD) __u64 generation:32, replicas:4, - unused:23, - type:5; + unused:22, + type:6; #endif }; @@ -701,7 +717,8 @@ BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION); BCH_INODE_FIELD(bi_data_replicas, 8) \ BCH_INODE_FIELD(bi_promote_target, 16) \ 
BCH_INODE_FIELD(bi_foreground_target, 16) \ - BCH_INODE_FIELD(bi_background_target, 16) + BCH_INODE_FIELD(bi_background_target, 16) \ + BCH_INODE_FIELD(bi_erasure_code, 16) #define BCH_INODE_FIELDS_INHERIT() \ BCH_INODE_FIELD(bi_data_checksum) \ @@ -711,7 +728,8 @@ BKEY_VAL_TYPE(inode_generation, BCH_INODE_GENERATION); BCH_INODE_FIELD(bi_data_replicas) \ BCH_INODE_FIELD(bi_promote_target) \ BCH_INODE_FIELD(bi_foreground_target) \ - BCH_INODE_FIELD(bi_background_target) + BCH_INODE_FIELD(bi_background_target) \ + BCH_INODE_FIELD(bi_erasure_code) enum { /* @@ -871,6 +889,27 @@ struct bch_quota { } __attribute__((packed, aligned(8))); BKEY_VAL_TYPE(quota, BCH_QUOTA); +/* Erasure coding */ + +enum { + BCH_STRIPE = 128, +}; + +struct bch_stripe { + struct bch_val v; + __le16 sectors; + __u8 algorithm; + __u8 nr_blocks; + __u8 nr_redundant; + + __u8 csum_granularity_bits; + __u8 csum_type; + __u8 pad; + + struct bch_extent_ptr ptrs[0]; +} __attribute__((packed, aligned(8))); +BKEY_VAL_TYPE(stripe, BCH_STRIPE); + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -1060,7 +1099,7 @@ struct bch_sb_field_quota { struct bch_disk_group { __u8 label[BCH_SB_LABEL_SIZE]; __le64 flags[2]; -}; +} __attribute__((packed, aligned(8))); LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) @@ -1069,7 +1108,7 @@ LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) struct bch_sb_field_disk_groups { struct bch_sb_field field; struct bch_disk_group entries[0]; -}; +} __attribute__((packed, aligned(8))); /* * On clean shutdown, store btree roots and current journal sequence number in @@ -1235,12 +1274,15 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, struct bch_sb, flags[2], 0, 4); LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); +LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); + /* Features: */ enum bch_sb_features { BCH_FEATURE_LZ4 = 0, BCH_FEATURE_GZIP = 1, BCH_FEATURE_ZSTD = 2, BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */ + BCH_FEATURE_EC = 4, BCH_FEATURE_NR, }; @@ -1407,7 +1449,8 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); DEF_BTREE_ID(DIRENTS, 2, "dirents") \ DEF_BTREE_ID(XATTRS, 3, "xattrs") \ DEF_BTREE_ID(ALLOC, 4, "alloc") \ - DEF_BTREE_ID(QUOTAS, 5, "quotas") + DEF_BTREE_ID(QUOTAS, 5, "quotas") \ + DEF_BTREE_ID(EC, 6, "erasure_coding") #define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val, diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h index bd1d21b..28bf646 100644 --- a/libbcachefs/bkey.h +++ b/libbcachefs/bkey.h @@ -579,6 +579,8 @@ BKEY_VAL_ACCESSORS(alloc, BCH_ALLOC); BKEY_VAL_ACCESSORS(quota, BCH_QUOTA); +BKEY_VAL_ACCESSORS(stripe, BCH_STRIPE); + /* byte order helpers */ #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index 43bcbb0..97d72d2 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -4,6 +4,7 @@ #include "btree_types.h" #include "alloc_background.h" #include "dirent.h" +#include "ec.h" #include "error.h" #include "extents.h" #include "inode.h" @@ -17,6 +18,7 @@ const struct bkey_ops bch2_bkey_ops[] = { [BKEY_TYPE_XATTRS] = bch2_bkey_xattr_ops, [BKEY_TYPE_ALLOC] = bch2_bkey_alloc_ops, [BKEY_TYPE_QUOTAS] = bch2_bkey_quota_ops, + [BKEY_TYPE_EC] = bch2_bkey_ec_ops, [BKEY_TYPE_BTREE] = bch2_bkey_btree_ops, }; diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 
6b67da9..9fe438d 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -14,6 +14,7 @@ #include "buckets.h" #include "clock.h" #include "debug.h" +#include "ec.h" #include "error.h" #include "extents.h" #include "journal.h" @@ -113,6 +114,7 @@ static bool bkey_type_needs_gc(enum bkey_type type) switch (type) { case BKEY_TYPE_BTREE: case BKEY_TYPE_EXTENTS: + case BKEY_TYPE_EC: return true; default: return false; @@ -153,6 +155,17 @@ static u8 ptr_gens_recalc_oldest(struct bch_fs *c, } } break; + case BKEY_TYPE_EC: + switch (k.k->type) { + case BCH_STRIPE: { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + + for (ptr = s.v->ptrs; + ptr < s.v->ptrs + s.v->nr_blocks; + ptr++) + ptr_gen_recalc_oldest(c, ptr, &max_stale); + } + } default: break; } @@ -214,6 +227,21 @@ static int ptr_gens_check(struct bch_fs *c, enum bkey_type type, } } break; + case BKEY_TYPE_EC: + switch (k.k->type) { + case BCH_STRIPE: { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + + for (ptr = s.v->ptrs; + ptr < s.v->ptrs + s.v->nr_blocks; + ptr++) { + ret = ptr_gen_check(c, type, ptr); + if (ret) + return ret; + } + } + } + break; default: break; } @@ -229,8 +257,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type, { struct gc_pos pos = { 0 }; unsigned flags = - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD| + BCH_BUCKET_MARK_GC| (initial ? BCH_BUCKET_MARK_NOATOMIC : 0); int ret = 0; @@ -359,15 +386,27 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, return 0; } +static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) +{ + return (int) btree_id_to_gc_phase(l) - + (int) btree_id_to_gc_phase(r); +} + static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal, bool initial) { + enum btree_id ids[BTREE_ID_NR]; unsigned i; + for (i = 0; i < BTREE_ID_NR; i++) + ids[i] = i; + bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); + for (i = 0; i < BTREE_ID_NR; i++) { - enum bkey_type type = bkey_type(0, i); + enum btree_id id = ids[i]; + enum bkey_type type = bkey_type(0, id); - int ret = bch2_gc_btree(c, i, initial); + int ret = bch2_gc_btree(c, id, initial); if (ret) return ret; @@ -441,9 +480,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, BCH_DATA_SB, flags); } - if (c) - spin_lock(&c->journal.lock); - for (i = 0; i < ca->journal.nr; i++) { b = ca->journal.buckets[i]; bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL, @@ -453,7 +489,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, if (c) { percpu_up_read_preempt_enable(&c->usage_lock); - spin_unlock(&c->journal.lock); } else { preempt_enable(); } @@ -468,9 +503,7 @@ static void bch2_mark_superblocks(struct bch_fs *c) gc_pos_set(c, gc_phase(GC_PHASE_SB)); for_each_online_member(ca, c, i) - bch2_mark_dev_superblock(c, ca, - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); + bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_GC); mutex_unlock(&c->sb_lock); } @@ -478,7 +511,6 @@ static void bch2_mark_superblocks(struct bch_fs *c) static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) { struct gc_pos pos = { 0 }; - struct bch_fs_usage stats = { 0 }; struct btree_update *as; struct pending_btree_node_free *d; @@ -490,13 +522,8 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) bch2_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&d->key), true, 0, - pos, &stats, 0, - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); - /* - * Don't apply stats - pending 
deletes aren't tracked in - * bch_alloc_stats: - */ + pos, NULL, 0, + BCH_BUCKET_MARK_GC); mutex_unlock(&c->btree_interior_update_lock); } @@ -517,8 +544,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) fifo_for_each_entry(i, &ca->free_inc, iter) bch2_mark_alloc_bucket(c, ca, i, true, gc_pos_alloc(c, NULL), - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); + BCH_BUCKET_MARK_GC); @@ -526,8 +552,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) fifo_for_each_entry(i, &ca->free[j], iter) bch2_mark_alloc_bucket(c, ca, i, true, gc_pos_alloc(c, NULL), - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); + BCH_BUCKET_MARK_GC); } spin_unlock(&c->freelist_lock); @@ -541,8 +566,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) ca = bch_dev_bkey_exists(c, ob->ptr.dev); bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true, gc_pos_alloc(c, ob), - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| - BCH_BUCKET_MARK_GC_LOCK_HELD); + BCH_BUCKET_MARK_GC); } spin_unlock(&ob->lock); } @@ -550,121 +574,310 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) percpu_up_read_preempt_enable(&c->usage_lock); } -static void bch2_gc_start(struct bch_fs *c) +static void bch2_gc_free(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + for_each_member_device(ca, c, i) { + kvpfree(rcu_dereference_protected(ca->buckets[1], 1), + sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket)); + ca->buckets[1] = NULL; + + free_percpu(ca->usage[1]); + ca->usage[1] = NULL; + } + + free_percpu(c->usage[1]); + c->usage[1] = NULL; +} + +static void bch2_gc_done_nocheck(struct bch_fs *c) { struct bch_dev *ca; - struct bucket_array *buckets; - struct bucket_mark new; unsigned i; - size_t b; int cpu; - percpu_down_write(&c->usage_lock); + for_each_member_device(ca, c, i) { + struct bucket_array *src = __bucket_array(ca, 1); - /* - * Indicates to buckets code that gc is now in progress - done under - * usage_lock to avoid racing with bch2_mark_key(): - */ - __gc_pos_set(c, gc_phase(GC_PHASE_START)); + memcpy(__bucket_array(ca, 0), src, + sizeof(struct bucket_array) + + sizeof(struct bucket) * src->nbuckets); + }; - /* Save a copy of the existing bucket stats while we recompute them: */ for_each_member_device(ca, c, i) { - ca->usage_cached = __bch2_dev_usage_read(ca); + struct bch_dev_usage *p; + for_each_possible_cpu(cpu) { - struct bch_dev_usage *p = - per_cpu_ptr(ca->usage_percpu, cpu); + p = per_cpu_ptr(ca->usage[0], cpu); memset(p, 0, sizeof(*p)); } + + preempt_disable(); + *this_cpu_ptr(ca->usage[0]) = __bch2_dev_usage_read(ca, 1); + preempt_enable(); + } + + { + struct bch_fs_usage src = __bch2_fs_usage_read(c, 1); + struct bch_fs_usage *p; + + for_each_possible_cpu(cpu) { + p = per_cpu_ptr(c->usage[0], cpu); + memset(p, 0, offsetof(typeof(*p), online_reserved)); + } + + preempt_disable(); + memcpy(this_cpu_ptr(c->usage[0]), + &src, + offsetof(typeof(*p), online_reserved)); + preempt_enable(); + } + +} + +static void bch2_gc_done(struct bch_fs *c, bool initial) +{ + struct bch_dev *ca; + unsigned i; + int cpu; + +#define copy_field(_f, _msg, ...) 
\ + if (dst._f != src._f) { \ + pr_info(_msg ": got %llu, should be %llu, fixing" \ + , ##__VA_ARGS__, dst._f, src._f); \ + dst._f = src._f; \ + } +#define copy_bucket_field(_f) \ + if (dst->b[b].mark._f != src->b[b].mark._f) { \ + pr_info("dev %u bucket %zu has wrong " #_f \ + ": got %u, should be %u, fixing", \ + i, b, dst->b[b].mark._f, src->b[b].mark._f); \ + dst->b[b]._mark._f = src->b[b].mark._f; \ + } +#define copy_dev_field(_f, _msg, ...) \ + copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) +#define copy_fs_field(_f, _msg, ...) \ + copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) + + percpu_down_write(&c->usage_lock); + + if (initial) { + bch2_gc_done_nocheck(c); + goto out; } - c->usage_cached = __bch2_fs_usage_read(c); - for_each_possible_cpu(cpu) { - struct bch_fs_usage *p = - per_cpu_ptr(c->usage_percpu, cpu); + for_each_member_device(ca, c, i) { + struct bucket_array *dst = __bucket_array(ca, 0); + struct bucket_array *src = __bucket_array(ca, 1); + size_t b; + + if (initial) { + memcpy(dst, src, + sizeof(struct bucket_array) + + sizeof(struct bucket) * dst->nbuckets); + } - memset(p->replicas, 0, sizeof(p->replicas)); - memset(p->buckets, 0, sizeof(p->buckets)); + for (b = 0; b < src->nbuckets; b++) { + copy_bucket_field(gen); + copy_bucket_field(data_type); + copy_bucket_field(owned_by_allocator); + copy_bucket_field(stripe); + copy_bucket_field(dirty_sectors); + copy_bucket_field(cached_sectors); + } + }; + + for_each_member_device(ca, c, i) { + struct bch_dev_usage dst = __bch2_dev_usage_read(ca, 0); + struct bch_dev_usage src = __bch2_dev_usage_read(ca, 1); + struct bch_dev_usage *p; + unsigned b; + + for (b = 0; b < BCH_DATA_NR; b++) + copy_dev_field(buckets[b], + "buckets[%s]", bch2_data_types[b]); + copy_dev_field(buckets_alloc, "buckets_alloc"); + copy_dev_field(buckets_ec, "buckets_ec"); + + for (b = 0; b < BCH_DATA_NR; b++) + copy_dev_field(sectors[b], + "sectors[%s]", bch2_data_types[b]); + copy_dev_field(sectors_fragmented, + "sectors_fragmented"); + + for_each_possible_cpu(cpu) { + p = per_cpu_ptr(ca->usage[0], cpu); + memset(p, 0, sizeof(*p)); + } + + preempt_disable(); + p = this_cpu_ptr(ca->usage[0]); + *p = dst; + preempt_enable(); } + { + struct bch_fs_usage dst = __bch2_fs_usage_read(c, 0); + struct bch_fs_usage src = __bch2_fs_usage_read(c, 1); + struct bch_fs_usage *p; + unsigned r, b; + + for (r = 0; r < BCH_REPLICAS_MAX; r++) { + for (b = 0; b < BCH_DATA_NR; b++) + copy_fs_field(replicas[r].data[b], + "replicas[%i].data[%s]", + r, bch2_data_types[b]); + copy_fs_field(replicas[r].ec_data, + "replicas[%i].ec_data", r); + copy_fs_field(replicas[r].persistent_reserved, + "replicas[%i].persistent_reserved", r); + } + + for (b = 0; b < BCH_DATA_NR; b++) + copy_fs_field(buckets[b], + "buckets[%s]", bch2_data_types[b]); + + for_each_possible_cpu(cpu) { + p = per_cpu_ptr(c->usage[0], cpu); + memset(p, 0, offsetof(typeof(*p), online_reserved)); + } + + preempt_disable(); + p = this_cpu_ptr(c->usage[0]); + memcpy(p, &dst, offsetof(typeof(*p), online_reserved)); + preempt_enable(); + } +out: percpu_up_write(&c->usage_lock); - /* Clear bucket marks: */ +#undef copy_field +#undef copy_fs_field +#undef copy_dev_field +#undef copy_bucket_field +} + +static int bch2_gc_start(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + BUG_ON(c->usage[1]); + + c->usage[1] = alloc_percpu(struct bch_fs_usage); + if (!c->usage[1]) + return -ENOMEM; + for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for (b = 
buckets->first_bucket; b < buckets->nbuckets; b++) { - bucket_cmpxchg(buckets->b + b, new, ({ - new.owned_by_allocator = 0; - new.data_type = 0; - new.cached_sectors = 0; - new.dirty_sectors = 0; - })); - ca->oldest_gens[b] = new.gen; + BUG_ON(ca->buckets[1]); + BUG_ON(ca->usage[1]); + + ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket), + GFP_KERNEL|__GFP_ZERO); + if (!ca->buckets[1]) { + percpu_ref_put(&ca->ref); + return -ENOMEM; + } + + ca->usage[1] = alloc_percpu(struct bch_dev_usage); + if (!ca->usage[1]) { + percpu_ref_put(&ca->ref); + return -ENOMEM; } - up_read(&ca->bucket_lock); } + + percpu_down_write(&c->usage_lock); + + for_each_member_device(ca, c, i) { + struct bucket_array *dst = __bucket_array(ca, 1); + struct bucket_array *src = __bucket_array(ca, 0); + size_t b; + + dst->first_bucket = src->first_bucket; + dst->nbuckets = src->nbuckets; + + for (b = 0; b < src->nbuckets; b++) + dst->b[b]._mark.gen = src->b[b].mark.gen; + }; + + percpu_up_write(&c->usage_lock); + + return 0; } /** - * bch_gc - recompute bucket marks and oldest_gen, rewrite btree nodes + * bch2_gc - walk _all_ references to buckets, and recompute them: + * + * Order matters here: + * - Concurrent GC relies on the fact that we have a total ordering for + * everything that GC walks - see gc_will_visit_node(), + * gc_will_visit_root() + * + * - also, references move around in the course of index updates and + * various other crap: everything needs to agree on the ordering + * references are allowed to move around in - e.g., we're allowed to + * start with a reference owned by an open_bucket (the allocator) and + * move it to the btree, but not the reverse. + * + * This is necessary to ensure that gc doesn't miss references that + * move around - if references move backwards in the ordering GC + * uses, GC could skip past them */ -void bch2_gc(struct bch_fs *c) +int bch2_gc(struct bch_fs *c, struct list_head *journal, bool initial) { struct bch_dev *ca; u64 start_time = local_clock(); - unsigned i; + unsigned i, iter = 0; int ret; - /* - * Walk _all_ references to buckets, and recompute them: - * - * Order matters here: - * - Concurrent GC relies on the fact that we have a total ordering for - * everything that GC walks - see gc_will_visit_node(), - * gc_will_visit_root() - * - * - also, references move around in the course of index updates and - * various other crap: everything needs to agree on the ordering - * references are allowed to move around in - e.g., we're allowed to - * start with a reference owned by an open_bucket (the allocator) and - * move it to the btree, but not the reverse. 
- * - * This is necessary to ensure that gc doesn't miss references that - * move around - if references move backwards in the ordering GC - * uses, GC could skip past them - */ trace_gc_start(c); - /* - * Do this before taking gc_lock - bch2_disk_reservation_get() blocks on - * gc_lock if sectors_available goes to 0: - */ - bch2_recalc_sectors_available(c); - down_write(&c->gc_lock); - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) +again: + ret = bch2_gc_start(c); + if (ret) goto out; - bch2_gc_start(c); - bch2_mark_superblocks(c); - ret = bch2_gc_btrees(c, NULL, false); - if (ret) { - bch_err(c, "btree gc failed: %d", ret); - set_bit(BCH_FS_GC_FAILURE, &c->flags); + ret = bch2_gc_btrees(c, journal, initial); + if (ret) goto out; - } bch2_mark_pending_btree_node_frees(c); bch2_mark_allocator_buckets(c); - /* Indicates that gc is no longer in progress: */ - gc_pos_set(c, gc_phase(GC_PHASE_DONE)); c->gc_count++; out: + if (!ret && test_bit(BCH_FS_FIXED_GENS, &c->flags)) { + /* + * XXX: make sure gens we fixed got saved + */ + if (iter++ <= 2) { + bch_info(c, "Fixed gens, restarting mark and sweep:"); + clear_bit(BCH_FS_FIXED_GENS, &c->flags); + goto again; + } + + bch_info(c, "Unable to fix bucket gens, looping"); + ret = -EINVAL; + } + + if (!ret) + bch2_gc_done(c, initial); + + /* Indicates that gc is no longer in progress: */ + __gc_pos_set(c, gc_phase(GC_PHASE_START)); + + bch2_gc_free(c); up_write(&c->gc_lock); + + if (!ret && initial) + set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); + trace_gc_end(c); bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); @@ -680,6 +893,7 @@ out: * allocator thread - issue wakeup in case they blocked on gc_lock: */ closure_wake_up(&c->freelist_wait); + return ret; } /* Btree coalescing */ @@ -995,9 +1209,6 @@ void bch2_coalesce(struct bch_fs *c) { enum btree_id id; - if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) - return; - down_read(&c->gc_lock); trace_gc_coalesce_start(c); @@ -1009,7 +1220,6 @@ void bch2_coalesce(struct bch_fs *c) if (ret) { if (ret != -ESHUTDOWN) bch_err(c, "btree coalescing failed: %d", ret); - set_bit(BCH_FS_GC_FAILURE, &c->flags); return; } } @@ -1024,6 +1234,7 @@ static int bch2_gc_thread(void *arg) struct io_clock *clock = &c->io_clock[WRITE]; unsigned long last = atomic_long_read(&clock->now); unsigned last_kick = atomic_read(&c->kick_gc); + int ret; set_freezable(); @@ -1057,7 +1268,9 @@ static int bch2_gc_thread(void *arg) last = atomic_long_read(&clock->now); last_kick = atomic_read(&c->kick_gc); - bch2_gc(c); + ret = bch2_gc(c, NULL, false); + if (ret) + bch_err(c, "btree gc failed: %i", ret); debug_check_no_locks_held(); } @@ -1098,30 +1311,7 @@ int bch2_gc_thread_start(struct bch_fs *c) int bch2_initial_gc(struct bch_fs *c, struct list_head *journal) { - unsigned iter = 0; - int ret = 0; - - down_write(&c->gc_lock); -again: - bch2_gc_start(c); - - bch2_mark_superblocks(c); - - ret = bch2_gc_btrees(c, journal, true); - if (ret) - goto err; - - if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) { - if (iter++ > 2) { - bch_info(c, "Unable to fix bucket gens, looping"); - ret = -EINVAL; - goto err; - } - - bch_info(c, "Fixed gens, restarting initial mark and sweep:"); - clear_bit(BCH_FS_FIXED_GENS, &c->flags); - goto again; - } + int ret = bch2_gc(c, journal, true); /* * Skip past versions that might have possibly been used (as nonces), @@ -1130,9 +1320,5 @@ again: if (c->sb.encryption_type) atomic64_add(1 << 16, &c->key_version); - gc_pos_set(c, gc_phase(GC_PHASE_DONE)); - set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); -err: - 
up_write(&c->gc_lock); return ret; } diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h index 101a6a8..d7809c2 100644 --- a/libbcachefs/btree_gc.h +++ b/libbcachefs/btree_gc.h @@ -6,7 +6,7 @@ enum bkey_type; void bch2_coalesce(struct bch_fs *); -void bch2_gc(struct bch_fs *); +int bch2_gc(struct bch_fs *, struct list_head *, bool); void bch2_gc_thread_stop(struct bch_fs *); int bch2_gc_thread_start(struct bch_fs *); int bch2_initial_gc(struct bch_fs *, struct list_head *); @@ -54,11 +54,22 @@ static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) return 0; } +static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) +{ + switch (id) { +#define DEF_BTREE_ID(n, v, s) case BTREE_ID_##n: return GC_PHASE_BTREE_##n; + DEFINE_BCH_BTREE_IDS() +#undef DEF_BTREE_ID + default: + BUG(); + } +} + static inline struct gc_pos gc_pos_btree(enum btree_id id, struct bpos pos, unsigned level) { return (struct gc_pos) { - .phase = GC_PHASE_BTREE_EXTENTS + id, + .phase = btree_id_to_gc_phase(id), .pos = pos, .level = level, }; @@ -93,14 +104,14 @@ static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *o }; } -static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos) +static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) { unsigned seq; bool ret; do { seq = read_seqcount_begin(&c->gc_pos_lock); - ret = gc_pos_cmp(c->gc_pos, pos) < 0; + ret = gc_pos_cmp(pos, c->gc_pos) <= 0; } while (read_seqcount_retry(&c->gc_pos_lock, seq)); return ret; diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 1eae181..ae1d4f8 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -817,7 +817,7 @@ static inline int btree_iter_lock_root(struct btree_iter *iter, */ iter->level = depth_want; iter->l[iter->level].b = NULL; - return 0; + return 1; } lock_type = __btree_lock_want(iter, iter->level); @@ -1044,6 +1044,9 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) ? btree_iter_down(iter) : btree_iter_lock_root(iter, depth_want); if (unlikely(ret)) { + if (ret == 1) + return 0; + iter->level = depth_want; iter->l[iter->level].b = BTREE_ITER_NOT_END; return ret; diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 4d34bdc..537b8da 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -159,7 +159,6 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b, { struct bch_fs *c = as->c; struct pending_btree_node_free *d; - unsigned replicas; /* * btree_update lock is only needed here to avoid racing with @@ -177,15 +176,6 @@ found: BUG_ON(d->index_update_done); d->index_update_done = true; - /* - * Btree nodes are accounted as freed in bch_alloc_stats when they're - * freed from the index: - */ - replicas = bch2_extent_nr_dirty_ptrs(k); - if (replicas) - stats->replicas[replicas - 1].data[BCH_DATA_BTREE] -= - c->opts.btree_node_size * replicas; - /* * We're dropping @k from the btree, but it's still live until the * index update is persistent so we need to keep a reference around for @@ -207,15 +197,16 @@ found: * bch2_mark_key() compares the current gc pos to the pos we're * moving this reference from, hence one comparison here: */ - if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) { - struct bch_fs_usage tmp = { 0 }; + if (gc_pos_cmp(c->gc_pos, b + ? 
gc_pos_btree_node(b) + : gc_pos_btree_root(as->btree_id)) >= 0 && + gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) { + struct gc_pos pos = { 0 }; bch2_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&d->key), - false, 0, b - ? gc_pos_btree_node(b) - : gc_pos_btree_root(as->btree_id), - &tmp, 0, 0); + false, 0, pos, + NULL, 0, BCH_BUCKET_MARK_GC); /* * Don't apply tmp - pending deletes aren't tracked in * bch_alloc_stats: @@ -286,19 +277,13 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, static void bch2_btree_node_free_ondisk(struct bch_fs *c, struct pending_btree_node_free *pending) { - struct bch_fs_usage stats = { 0 }; - BUG_ON(!pending->index_update_done); bch2_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&pending->key), false, 0, gc_phase(GC_PHASE_PENDING_DELETE), - &stats, 0, 0); - /* - * Don't apply stats - pending deletes aren't tracked in - * bch_alloc_stats: - */ + NULL, 0, 0); } static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, @@ -339,7 +324,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, mutex_unlock(&c->btree_reserve_cache_lock); retry: - wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, + wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0, writepoint_ptr(&c->btree_write_point), &devs_have, res->nr_replicas, @@ -637,12 +622,12 @@ static void btree_update_wait_on_journal(struct closure *cl) int ret; ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl); - if (ret < 0) - goto err; - if (!ret) { + if (ret == -EAGAIN) { continue_at(cl, btree_update_wait_on_journal, system_wq); return; } + if (ret < 0) + goto err; bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl); err: diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 288d7ca..e8d6e07 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -343,19 +343,40 @@ static inline int do_btree_insert_at(struct btree_insert *trans, trans_for_each_entry(trans, i) BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); - u64s = 0; - trans_for_each_entry(trans, i) - u64s += jset_u64s(i->k->k.u64s); - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY) - ? 
bch2_journal_res_get(&c->journal, - &trans->journal_res, - u64s, u64s) - : 0; - if (ret) - return ret; + if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { + u64s = 0; + trans_for_each_entry(trans, i) + u64s += jset_u64s(i->k->k.u64s); + + while ((ret = bch2_journal_res_get(&c->journal, + &trans->journal_res, u64s, + JOURNAL_RES_GET_NONBLOCK)) == -EAGAIN) { + struct btree_iter *iter = trans->entries[0].iter; + struct closure cl; + + bch2_btree_iter_unlock(iter); + + closure_init_stack(&cl); + + while ((ret = bch2_journal_open_seq_async(&c->journal, + trans->journal_res.seq, + &cl)) == -EAGAIN) + closure_sync(&cl); + + if (ret) + return ret; + + if (!bch2_btree_iter_relock(iter)) { + trans_restart(" (iter relock after journal res get blocked)"); + return -EINTR; + } + } + + if (ret) + return ret; + } multi_lock_write(c, trans); diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 86d57f3..6037763 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -68,6 +68,7 @@ #include "btree_gc.h" #include "btree_update.h" #include "buckets.h" +#include "ec.h" #include "error.h" #include "movinggc.h" @@ -83,8 +84,7 @@ static inline u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); static void bch2_fs_stats_verify(struct bch_fs *c) { - struct bch_fs_usage stats = - __bch2_fs_usage_read(c); + struct bch_fs_usage stats =_bch2_fs_usage_read(c); unsigned i, j; for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) { @@ -207,43 +207,24 @@ do { \ _acc; \ }) -#define bch2_usage_read_cached(_c, _cached, _uncached) \ -({ \ - typeof(_cached) _ret; \ - unsigned _seq; \ - \ - do { \ - _seq = read_seqcount_begin(&(_c)->gc_pos_lock); \ - _ret = (_c)->gc_pos.phase == GC_PHASE_DONE \ - ? bch2_usage_read_raw(_uncached) \ - : (_cached); \ - } while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq)); \ - \ - _ret; \ -}) - -struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca) +struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca, bool gc) { - return bch2_usage_read_raw(ca->usage_percpu); + return bch2_usage_read_raw(ca->usage[gc]); } struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca) { - return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu); + return bch2_usage_read_raw(ca->usage[0]); } -struct bch_fs_usage -__bch2_fs_usage_read(struct bch_fs *c) +struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *c, bool gc) { - return bch2_usage_read_raw(c->usage_percpu); + return bch2_usage_read_raw(c->usage[gc]); } -struct bch_fs_usage -bch2_fs_usage_read(struct bch_fs *c) +struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *c) { - return bch2_usage_read_cached(c, - c->usage_cached, - c->usage_percpu); + return bch2_usage_read_raw(c->usage[0]); } struct fs_usage_sum { @@ -269,6 +250,7 @@ static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats) for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) { sum.data += stats.replicas[i].data[BCH_DATA_BTREE]; sum.data += stats.replicas[i].data[BCH_DATA_USER]; + sum.data += stats.replicas[i].ec_data; sum.cached += stats.replicas[i].data[BCH_DATA_CACHED]; sum.reserved += stats.replicas[i].persistent_reserved; } @@ -324,13 +306,11 @@ static inline enum bch_data_type bucket_type(struct bucket_mark m) : m.data_type; } -static bool bucket_became_unavailable(struct bch_fs *c, - struct bucket_mark old, +static bool bucket_became_unavailable(struct bucket_mark old, struct bucket_mark new) { return is_available_bucket(old) && - !is_available_bucket(new) && - (!c || c->gc_pos.phase == 
GC_PHASE_DONE); + !is_available_bucket(new); } void bch2_fs_usage_apply(struct bch_fs *c, @@ -360,12 +340,14 @@ void bch2_fs_usage_apply(struct bch_fs *c, percpu_down_read_preempt_disable(&c->usage_lock); /* online_reserved not subject to gc: */ - this_cpu_ptr(c->usage_percpu)->online_reserved += + this_cpu_ptr(c->usage[0])->online_reserved += stats->online_reserved; stats->online_reserved = 0; - if (!gc_will_visit(c, gc_pos)) - bch2_usage_add(this_cpu_ptr(c->usage_percpu), stats); + bch2_usage_add(this_cpu_ptr(c->usage[0]), stats); + + if (gc_visited(c, gc_pos)) + bch2_usage_add(this_cpu_ptr(c->usage[1]), stats); bch2_fs_stats_verify(c); percpu_up_read_preempt_enable(&c->usage_lock); @@ -374,8 +356,9 @@ void bch2_fs_usage_apply(struct bch_fs *c, } static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, - struct bch_fs_usage *stats, - struct bucket_mark old, struct bucket_mark new) + struct bch_fs_usage *fs_usage, + struct bucket_mark old, struct bucket_mark new, + bool gc) { struct bch_dev_usage *dev_usage; @@ -387,16 +370,22 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, bch2_data_types[old.data_type], bch2_data_types[new.data_type]); - stats->buckets[bucket_type(old)] -= ca->mi.bucket_size; - stats->buckets[bucket_type(new)] += ca->mi.bucket_size; - - dev_usage = this_cpu_ptr(ca->usage_percpu); + dev_usage = this_cpu_ptr(ca->usage[gc]); - dev_usage->buckets[bucket_type(old)]--; - dev_usage->buckets[bucket_type(new)]++; + if (bucket_type(old) != bucket_type(new)) { + if (bucket_type(old)) { + fs_usage->buckets[bucket_type(old)] -= ca->mi.bucket_size; + dev_usage->buckets[bucket_type(old)]--; + } else { + fs_usage->buckets[bucket_type(new)] += ca->mi.bucket_size; + dev_usage->buckets[bucket_type(new)]++; + } + } dev_usage->buckets_alloc += (int) new.owned_by_allocator - (int) old.owned_by_allocator; + dev_usage->buckets_ec += + (int) new.stripe - (int) old.stripe; dev_usage->buckets_unavailable += is_unavailable_bucket(new) - is_unavailable_bucket(old); @@ -417,21 +406,18 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, ({ \ struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \ \ - bch2_dev_usage_update(c, ca, stats, _old, new); \ + bch2_dev_usage_update(c, ca, stats, _old, new, gc); \ _old; \ }) -void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct bucket_mark *old) +static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, struct bucket_mark *old, + bool gc) { - struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu); - struct bucket *g; + struct bch_fs_usage *stats = this_cpu_ptr(c->usage[gc]); + struct bucket *g = __bucket(ca, b, gc); struct bucket_mark new; - percpu_rwsem_assert_held(&c->usage_lock); - - g = bucket(ca, b); - *old = bucket_data_cmpxchg(c, ca, stats, g, new, ({ BUG_ON(!is_available_bucket(new)); @@ -442,38 +428,49 @@ void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, new.gen++; })); - /* - * This isn't actually correct yet, since fs usage is still - * uncompressed sectors: - */ stats->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors; +} + +void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, struct bucket_mark *old) +{ + percpu_rwsem_assert_held(&c->usage_lock); + + __bch2_invalidate_bucket(c, ca, b, old, false); if (!old->owned_by_allocator && old->cached_sectors) trace_invalidate(ca, bucket_to_sector(ca, b), old->cached_sectors); } -void bch2_mark_alloc_bucket(struct bch_fs *c, struct 
bch_dev *ca, - size_t b, bool owned_by_allocator, - struct gc_pos pos, unsigned flags) +static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, bool owned_by_allocator, + bool gc) { - struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu); - struct bucket *g; + struct bch_fs_usage *stats = this_cpu_ptr(c->usage[gc]); + struct bucket *g = __bucket(ca, b, gc); struct bucket_mark old, new; - percpu_rwsem_assert_held(&c->usage_lock); - g = bucket(ca, b); - - if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && - gc_will_visit(c, pos)) - return; - old = bucket_data_cmpxchg(c, ca, stats, g, new, ({ new.owned_by_allocator = owned_by_allocator; })); - BUG_ON(!owned_by_allocator && !old.owned_by_allocator && - c->gc_pos.phase == GC_PHASE_DONE); + BUG_ON(!gc && + !owned_by_allocator && !old.owned_by_allocator); +} + +void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, bool owned_by_allocator, + struct gc_pos pos, unsigned flags) +{ + percpu_rwsem_assert_held(&c->usage_lock); + + if (!(flags & BCH_BUCKET_MARK_GC)) + __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, false); + + if ((flags & BCH_BUCKET_MARK_GC) || + gc_visited(c, pos)) + __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, true); } #define checked_add(a, b) \ @@ -483,35 +480,47 @@ do { \ BUG_ON((a) != _res); \ } while (0) +static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, enum bch_data_type type, + unsigned sectors, bool gc) +{ + struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]); + struct bucket *g = __bucket(ca, b, gc); + struct bucket_mark old, new; + + BUG_ON(type != BCH_DATA_SB && + type != BCH_DATA_JOURNAL); + + old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ + new.data_type = type; + checked_add(new.dirty_sectors, sectors); + })); + + fs_usage->replicas[0].data[type] += sectors; +} + void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, enum bch_data_type type, unsigned sectors, struct gc_pos pos, unsigned flags) { - struct bch_fs_usage *stats; - struct bucket *g; - struct bucket_mark old, new; - BUG_ON(type != BCH_DATA_SB && type != BCH_DATA_JOURNAL); if (likely(c)) { percpu_rwsem_assert_held(&c->usage_lock); - if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && - gc_will_visit(c, pos)) - return; - - stats = this_cpu_ptr(c->usage_percpu); - - g = bucket(ca, b); - old = bucket_data_cmpxchg(c, ca, stats, g, new, ({ - new.data_type = type; - checked_add(new.dirty_sectors, sectors); - })); - - stats->replicas[0].data[type] += sectors; + if (!(flags & BCH_BUCKET_MARK_GC)) + __bch2_mark_metadata_bucket(c, ca, b, type, sectors, + false); + if ((flags & BCH_BUCKET_MARK_GC) || + gc_visited(c, pos)) + __bch2_mark_metadata_bucket(c, ca, b, type, sectors, + true); } else { + struct bucket *g; + struct bucket_mark old, new; + rcu_read_lock(); g = bucket(ca, b); @@ -522,9 +531,6 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, rcu_read_unlock(); } - - BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && - bucket_became_unavailable(c, old, new)); } static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors) @@ -569,23 +575,15 @@ static void bch2_mark_pointer(struct bch_fs *c, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type, struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags) + u64 journal_seq, unsigned flags, + bool gc) { struct bucket_mark old, new; struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); - struct bucket *g = 
PTR_BUCKET(ca, &p.ptr); + size_t b = PTR_BUCKET_NR(ca, &p.ptr); + struct bucket *g = __bucket(ca, b, gc); u64 v; - if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) { - if (journal_seq) - bucket_cmpxchg(g, new, ({ - new.journal_seq_valid = 1; - new.journal_seq = journal_seq; - })); - - return; - } - v = atomic64_read(&g->_mark.v); do { new.v.counter = old.v.counter = v; @@ -627,17 +625,59 @@ static void bch2_mark_pointer(struct bch_fs *c, old.v.counter, new.v.counter)) != old.v.counter); - bch2_dev_usage_update(c, ca, fs_usage, old, new); + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + + BUG_ON(!gc && bucket_became_unavailable(old, new)); +} + +static void bch2_mark_stripe_ptr(struct bch_fs *c, + struct bch_extent_stripe_ptr p, + s64 sectors, unsigned flags, + s64 *adjusted_disk_sectors, + unsigned *redundancy) +{ + struct ec_stripe *m; + unsigned old, new, nr_data; + int blocks_nonempty_delta; + s64 parity_sectors; + + m = genradix_ptr(&c->ec_stripes, p.idx); + if (WARN_ON(!m)) + return; + + if (WARN_ON(!m->alive)) + return; + + nr_data = m->nr_blocks - m->nr_redundant; + + parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data); + + if (sectors < 0) + parity_sectors = -parity_sectors; + + *adjusted_disk_sectors += parity_sectors; + + *redundancy = max_t(unsigned, *redundancy, m->nr_redundant + 1); + + new = atomic_add_return(sectors, &m->block_sectors[p.block]); + old = new - sectors; + + blocks_nonempty_delta = (int) !!new - (int) !!old; + if (!blocks_nonempty_delta) + return; + + atomic_add(blocks_nonempty_delta, &m->blocks_nonempty); + + BUG_ON(atomic_read(&m->blocks_nonempty) < 0); - BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && - bucket_became_unavailable(c, old, new)); + bch2_stripes_heap_update(c, m, p.idx); } static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, s64 sectors, enum bch_data_type data_type, - struct gc_pos pos, struct bch_fs_usage *stats, - u64 journal_seq, unsigned flags) + u64 journal_seq, unsigned flags, + bool gc) { BUG_ON(!sectors); @@ -649,28 +689,43 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, struct extent_ptr_decoded p; s64 cached_sectors = 0; s64 dirty_sectors = 0; + s64 ec_sectors = 0; unsigned replicas = 0; + unsigned ec_redundancy = 0; + unsigned i; extent_for_each_ptr_decode(e, p, entry) { s64 disk_sectors = ptr_disk_sectors(e, p, sectors); + s64 adjusted_disk_sectors = disk_sectors; bch2_mark_pointer(c, e, p, disk_sectors, data_type, - stats, journal_seq, flags); + stats, journal_seq, flags, gc); + if (!p.ptr.cached) + for (i = 0; i < p.ec_nr; i++) + bch2_mark_stripe_ptr(c, p.ec[i], + disk_sectors, flags, + &adjusted_disk_sectors, + &ec_redundancy); if (!p.ptr.cached) replicas++; if (p.ptr.cached) - cached_sectors += disk_sectors; + cached_sectors += adjusted_disk_sectors; + else if (!p.ec_nr) + dirty_sectors += adjusted_disk_sectors; else - dirty_sectors += disk_sectors; + ec_sectors += adjusted_disk_sectors; } replicas = clamp_t(unsigned, replicas, 1, ARRAY_SIZE(stats->replicas)); + ec_redundancy = clamp_t(unsigned, ec_redundancy, + 1, ARRAY_SIZE(stats->replicas)); stats->replicas[0].data[BCH_DATA_CACHED] += cached_sectors; stats->replicas[replicas - 1].data[data_type] += dirty_sectors; + stats->replicas[ec_redundancy - 1].ec_data += ec_sectors; break; } case BCH_RESERVATION: { @@ -686,64 +741,129 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, } } -void bch2_mark_key(struct bch_fs *c, - enum bkey_type type, struct bkey_s_c k, - bool inserting, s64 sectors, - 
struct gc_pos pos, - struct bch_fs_usage *stats, - u64 journal_seq, unsigned flags) +static void bucket_set_stripe(struct bch_fs *c, + const struct bch_stripe *v, + bool enabled, + struct bch_fs_usage *fs_usage, + u64 journal_seq, + bool gc) { - /* - * synchronization w.r.t. GC: - * - * Normally, bucket sector counts/marks are updated on the fly, as - * references are added/removed from the btree, the lists of buckets the - * allocator owns, other metadata buckets, etc. - * - * When GC is in progress and going to mark this reference, we do _not_ - * mark this reference here, to avoid double counting - GC will count it - * when it gets to it. - * - * To know whether we should mark a given reference (GC either isn't - * running, or has already marked references at this position) we - * construct a total order for everything GC walks. Then, we can simply - * compare the position of the reference we're marking - @pos - with - * GC's current position. If GC is going to mark this reference, GC's - * current position will be less than @pos; if GC's current position is - * greater than @pos GC has either already walked this position, or - * isn't running. - * - * To avoid racing with GC's position changing, we have to deal with - * - GC's position being set to GC_POS_MIN when GC starts: - * usage_lock guards against this - * - GC's position overtaking @pos: we guard against this with - * whatever lock protects the data structure the reference lives in - * (e.g. the btree node lock, or the relevant allocator lock). - */ + unsigned i; - percpu_down_read_preempt_disable(&c->usage_lock); - if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && - gc_will_visit(c, pos)) - flags |= BCH_BUCKET_MARK_GC_WILL_VISIT; + for (i = 0; i < v->nr_blocks; i++) { + const struct bch_extent_ptr *ptr = v->ptrs + i; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + size_t b = PTR_BUCKET_NR(ca, ptr); + struct bucket *g = __bucket(ca, b, gc); + struct bucket_mark new, old; + + BUG_ON(ptr_stale(ca, ptr)); + + old = bucket_cmpxchg(g, new, ({ + new.stripe = enabled; + if (journal_seq) { + new.journal_seq_valid = 1; + new.journal_seq = journal_seq; + } + })); + + BUG_ON(old.stripe == enabled); + + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + } +} + +static void bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, + bool inserting, + struct bch_fs_usage *fs_usage, + u64 journal_seq, unsigned flags, + bool gc) +{ + switch (k.k->type) { + case BCH_STRIPE: { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + size_t idx = s.k->p.offset; + struct ec_stripe *m = genradix_ptr(&c->ec_stripes, idx); + unsigned i; - if (!stats) - stats = this_cpu_ptr(c->usage_percpu); + BUG_ON(!m); + BUG_ON(m->alive == inserting); + BUG_ON(atomic_read(&m->blocks_nonempty)); + + for (i = 0; i < EC_STRIPE_MAX; i++) + BUG_ON(atomic_read(&m->block_sectors[i])); + + if (inserting) { + m->sectors = le16_to_cpu(s.v->sectors); + m->algorithm = s.v->algorithm; + m->nr_blocks = s.v->nr_blocks; + m->nr_redundant = s.v->nr_redundant; + } + + if (inserting) + bch2_stripes_heap_insert(c, m, idx); + else + bch2_stripes_heap_del(c, m, idx); + + bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc); + break; + } + } +} + +static void __bch2_mark_key(struct bch_fs *c, + enum bkey_type type, struct bkey_s_c k, + bool inserting, s64 sectors, + struct bch_fs_usage *stats, + u64 journal_seq, unsigned flags, + bool gc) +{ switch (type) { case BKEY_TYPE_BTREE: bch2_mark_extent(c, k, inserting ? 
c->opts.btree_node_size : -c->opts.btree_node_size, BCH_DATA_BTREE, - pos, stats, journal_seq, flags); + stats, journal_seq, flags, gc); break; case BKEY_TYPE_EXTENTS: bch2_mark_extent(c, k, sectors, BCH_DATA_USER, - pos, stats, journal_seq, flags); + stats, journal_seq, flags, gc); + break; + case BKEY_TYPE_EC: + bch2_mark_stripe(c, k, inserting, + stats, journal_seq, flags, gc); break; default: break; } +} + +void bch2_mark_key(struct bch_fs *c, + enum bkey_type type, struct bkey_s_c k, + bool inserting, s64 sectors, + struct gc_pos pos, + struct bch_fs_usage *stats, + u64 journal_seq, unsigned flags) +{ + percpu_down_read_preempt_disable(&c->usage_lock); + + if (!(flags & BCH_BUCKET_MARK_GC)) { + if (!stats) + stats = this_cpu_ptr(c->usage[0]); + + __bch2_mark_key(c, type, k, inserting, sectors, + stats, journal_seq, flags, false); + } + + if ((flags & BCH_BUCKET_MARK_GC) || + gc_visited(c, pos)) { + __bch2_mark_key(c, type, k, inserting, sectors, + this_cpu_ptr(c->usage[1]), + journal_seq, flags, true); + } + percpu_up_read_preempt_enable(&c->usage_lock); } @@ -819,28 +939,20 @@ void bch2_mark_update(struct btree_insert *trans, /* Disk reservations: */ -static u64 __recalc_sectors_available(struct bch_fs *c) +static u64 bch2_recalc_sectors_available(struct bch_fs *c) { int cpu; for_each_possible_cpu(cpu) - per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0; + per_cpu_ptr(c->usage[0], cpu)->available_cache = 0; return avail_factor(bch2_fs_sectors_free(c, bch2_fs_usage_read(c))); } -/* Used by gc when it's starting: */ -void bch2_recalc_sectors_available(struct bch_fs *c) -{ - percpu_down_write(&c->usage_lock); - atomic64_set(&c->sectors_available, __recalc_sectors_available(c)); - percpu_up_write(&c->usage_lock); -} - void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) { percpu_down_read_preempt_disable(&c->usage_lock); - this_cpu_sub(c->usage_percpu->online_reserved, + this_cpu_sub(c->usage[0]->online_reserved, res->sectors); bch2_fs_stats_verify(c); @@ -860,7 +972,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, int ret; percpu_down_read_preempt_disable(&c->usage_lock); - stats = this_cpu_ptr(c->usage_percpu); + stats = this_cpu_ptr(c->usage[0]); if (sectors <= stats->available_cache) goto out; @@ -908,7 +1020,7 @@ recalculate: } percpu_down_write(&c->usage_lock); - sectors_available = __recalc_sectors_available(c); + sectors_available = bch2_recalc_sectors_available(c); if (sectors <= sectors_available || (flags & BCH_DISK_RESERVATION_NOFAIL)) { @@ -949,6 +1061,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { struct bucket_array *buckets = NULL, *old_buckets = NULL; unsigned long *buckets_dirty = NULL; + unsigned long *buckets_written = NULL; u8 *oldest_gens = NULL; alloc_fifo free[RESERVE_NR]; alloc_fifo free_inc; @@ -962,7 +1075,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7); size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), btree_reserve); - bool resize = ca->buckets != NULL, + bool resize = ca->buckets[0] != NULL, start_copygc = ca->copygc_thread != NULL; int ret = -ENOMEM; unsigned i; @@ -980,6 +1093,9 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) !(buckets_dirty = kvpmalloc(BITS_TO_LONGS(nbuckets) * sizeof(unsigned long), GFP_KERNEL|__GFP_ZERO)) || + !(buckets_written = kvpmalloc(BITS_TO_LONGS(nbuckets) * + sizeof(unsigned long), + 
GFP_KERNEL|__GFP_ZERO)) || !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) || !init_fifo(&free[RESERVE_MOVINGGC], copygc_reserve, GFP_KERNEL) || @@ -1014,13 +1130,17 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) memcpy(buckets_dirty, ca->buckets_dirty, BITS_TO_LONGS(n) * sizeof(unsigned long)); + memcpy(buckets_written, + ca->buckets_written, + BITS_TO_LONGS(n) * sizeof(unsigned long)); } - rcu_assign_pointer(ca->buckets, buckets); + rcu_assign_pointer(ca->buckets[0], buckets); buckets = old_buckets; swap(ca->oldest_gens, oldest_gens); swap(ca->buckets_dirty, buckets_dirty); + swap(ca->buckets_written, buckets_written); if (resize) percpu_up_write(&c->usage_lock); @@ -1060,6 +1180,8 @@ err: free_fifo(&free[i]); kvpfree(buckets_dirty, BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); + kvpfree(buckets_written, + BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); kvpfree(oldest_gens, nbuckets * sizeof(u8)); if (buckets) @@ -1077,19 +1199,21 @@ void bch2_dev_buckets_free(struct bch_dev *ca) free_fifo(&ca->free_inc); for (i = 0; i < RESERVE_NR; i++) free_fifo(&ca->free[i]); + kvpfree(ca->buckets_written, + BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); kvpfree(ca->buckets_dirty, BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8)); - kvpfree(rcu_dereference_protected(ca->buckets, 1), + kvpfree(rcu_dereference_protected(ca->buckets[0], 1), sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket)); - free_percpu(ca->usage_percpu); + free_percpu(ca->usage[0]); } int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) { - if (!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage))) + if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage))) return -ENOMEM; return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);; diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index e84247d..76ebe2e 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -28,23 +28,34 @@ _old; \ }) -static inline struct bucket_array *bucket_array(struct bch_dev *ca) +static inline struct bucket_array *__bucket_array(struct bch_dev *ca, + bool gc) { - return rcu_dereference_check(ca->buckets, + return rcu_dereference_check(ca->buckets[gc], !ca->fs || percpu_rwsem_is_held(&ca->fs->usage_lock) || lockdep_is_held(&ca->fs->gc_lock) || lockdep_is_held(&ca->bucket_lock)); } -static inline struct bucket *bucket(struct bch_dev *ca, size_t b) +static inline struct bucket_array *bucket_array(struct bch_dev *ca) +{ + return __bucket_array(ca, false); +} + +static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc) { - struct bucket_array *buckets = bucket_array(ca); + struct bucket_array *buckets = __bucket_array(ca, gc); BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); return buckets->b + b; } +static inline struct bucket *bucket(struct bch_dev *ca, size_t b) +{ + return __bucket(ca, b, false); +} + static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca, size_t b, int rw) { @@ -128,7 +139,7 @@ static inline bool bucket_unused(struct bucket_mark mark) /* Device usage: */ -struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *); +struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *, bool); struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *); static inline u64 __dev_buckets_available(struct bch_dev *ca, @@ -167,7 +178,7 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca) /* 
Filesystem usage: */ -struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *); +struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *, bool); struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *); void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, struct disk_reservation *, struct gc_pos); @@ -184,6 +195,7 @@ static inline bool is_available_bucket(struct bucket_mark mark) { return (!mark.owned_by_allocator && !mark.dirty_sectors && + !mark.stripe && !mark.nouse); } @@ -205,17 +217,13 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, struct gc_pos, unsigned); #define BCH_BUCKET_MARK_NOATOMIC (1 << 0) -#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 1) -#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 2) -#define BCH_BUCKET_MARK_GC_LOCK_HELD (1 << 3) +#define BCH_BUCKET_MARK_GC (1 << 1) void bch2_mark_key(struct bch_fs *, enum bkey_type, struct bkey_s_c, bool, s64, struct gc_pos, struct bch_fs_usage *, u64, unsigned); void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *); -void bch2_recalc_sectors_available(struct bch_fs *); - void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); static inline void bch2_disk_reservation_put(struct bch_fs *c, diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 6f7d3a2..0b1bd95 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -18,7 +18,8 @@ struct bucket_mark { gen_valid:1, owned_by_allocator:1, nouse:1, - journal_seq_valid:1; + journal_seq_valid:1, + stripe:1; u16 dirty_sectors; u16 cached_sectors; @@ -52,6 +53,7 @@ struct bucket_array { struct bch_dev_usage { u64 buckets[BCH_DATA_NR]; u64 buckets_alloc; + u64 buckets_ec; u64 buckets_unavailable; /* _compressed_ sectors: */ @@ -61,15 +63,18 @@ struct bch_dev_usage { struct bch_fs_usage { /* all fields are in units of 512 byte sectors: */ - u64 online_reserved; - u64 available_cache; struct { u64 data[BCH_DATA_NR]; + u64 ec_data; u64 persistent_reserved; } replicas[BCH_REPLICAS_MAX]; u64 buckets[BCH_DATA_NR]; + + /* fields starting here aren't touched by gc: */ + u64 online_reserved; + u64 available_cache; }; /* diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c index 6379905..e74fc1f 100644 --- a/libbcachefs/compress.c +++ b/libbcachefs/compress.c @@ -601,11 +601,13 @@ have_compressed: goto out; } - ret = mempool_init_kmalloc_pool( - &c->decompress_workspace, - 1, decompress_workspace_size); - if (ret) - goto out; + if (!mempool_initialized(&c->decompress_workspace)) { + ret = mempool_init_kmalloc_pool( + &c->decompress_workspace, + 1, decompress_workspace_size); + if (ret) + goto out; + } out: pr_verbose_init(c->opts, "ret %i", ret); return ret; diff --git a/libbcachefs/disk_groups.h b/libbcachefs/disk_groups.h index d604728..b90b0ef 100644 --- a/libbcachefs/disk_groups.h +++ b/libbcachefs/disk_groups.h @@ -54,6 +54,19 @@ static inline struct target target_decode(unsigned target) } const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); + +static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, + enum bch_data_type data_type, + u16 target) +{ + struct bch_devs_mask devs = c->rw_devs[data_type]; + const struct bch_devs_mask *t = bch2_target_to_mask(c, target); + + if (t) + bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); + return devs; +} + bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); int bch2_disk_path_find(struct bch_sb_handle *, const char *); diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c new file mode 
100644 index 0000000..02c51ea --- /dev/null +++ b/libbcachefs/ec.c @@ -0,0 +1,1283 @@ + +/* erasure coding */ + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bset.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "buckets.h" +#include "disk_groups.h" +#include "ec.h" +#include "error.h" +#include "io.h" +#include "keylist.h" +#include "super-io.h" +#include "util.h" + +#include + +#ifdef __KERNEL__ + +#include +#include + +static void raid5_recov(unsigned disks, unsigned failed_idx, + size_t size, void **data) +{ + unsigned i = 2, nr; + + BUG_ON(failed_idx >= disks); + + swap(data[0], data[failed_idx]); + memcpy(data[0], data[1], size); + + while (i < disks) { + nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); + xor_blocks(nr, size, data[0], data + i); + i += nr; + } + + swap(data[0], data[failed_idx]); +} + +static void raid_gen(int nd, int np, size_t size, void **v) +{ + if (np >= 1) + raid5_recov(nd + np, nd, size, v); + if (np >= 2) + raid6_call.gen_syndrome(nd + np, size, v); + BUG_ON(np > 2); +} + +static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v) +{ + switch (nr) { + case 0: + break; + case 1: + if (ir[0] < nd + 1) + raid5_recov(nd + 1, ir[0], size, v); + else + raid6_call.gen_syndrome(nd + np, size, v); + break; + case 2: + if (ir[1] < nd) { + /* data+data failure. */ + raid6_2data_recov(nd + np, size, ir[0], ir[1], v); + } else if (ir[0] < nd) { + /* data + p/q failure */ + + if (ir[1] == nd) /* data + p failure */ + raid6_datap_recov(nd + np, size, ir[0], v); + else { /* data + q failure */ + raid5_recov(nd + 1, ir[0], size, v); + raid6_call.gen_syndrome(nd + np, size, v); + } + } else { + raid_gen(nd, np, size, v); + } + break; + default: + BUG(); + } +} + +#else + +#include + +#endif + +struct ec_bio { + struct bch_dev *ca; + struct ec_stripe_buf *buf; + size_t idx; + struct bio bio; +}; + +/* Stripes btree keys: */ + +static unsigned stripe_csums_per_device(const struct bch_stripe *s) +{ + return DIV_ROUND_UP(le16_to_cpu(s->sectors), + 1 << s->csum_granularity_bits); +} + +static unsigned stripe_val_u64s(const struct bch_stripe *s) +{ + unsigned bytes = sizeof(struct bch_stripe) + + sizeof(struct bch_extent_ptr) * s->nr_blocks + + bch_crc_bytes[s->csum_type] * s->nr_blocks * stripe_csums_per_device(s); + return DIV_ROUND_UP(bytes, sizeof(u64)); +} + +static void *stripe_csum(struct bch_stripe *s, unsigned dev, unsigned csum_idx) +{ + unsigned csum_bytes = bch_crc_bytes[s->csum_type]; + void *csums = s->ptrs + s->nr_blocks; + + BUG_ON(!csum_bytes); + + return csums + (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; +} + +const char *bch2_ec_key_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + if (k.k->p.inode) + return "invalid stripe key"; + + switch (k.k->type) { + case BCH_STRIPE: { + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + + if (bkey_val_bytes(k.k) < sizeof(*s)) + return "incorrect value size"; + + if (bkey_val_u64s(k.k) != stripe_val_u64s(s)) + return "incorrect value size"; + + return NULL; + } + default: + return "invalid type"; + } +} + +void bch2_ec_key_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + switch (k.k->type) { + case BCH_STRIPE: { + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + unsigned i; + + pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", + s->algorithm, + le16_to_cpu(s->sectors), + s->nr_blocks - s->nr_redundant, + s->nr_redundant, + s->csum_type, + 1U << s->csum_granularity_bits); + + for (i = 0; i < 
s->nr_blocks; i++) + pr_buf(out, " %u:%llu", s->ptrs[i].dev, + (u64) s->ptrs[i].offset); + } + } +} + +static int ptr_matches_stripe(struct bch_fs *c, + struct bch_stripe *v, + const struct bch_extent_ptr *ptr) +{ + unsigned i; + + for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) { + const struct bch_extent_ptr *ptr2 = v->ptrs + i; + + if (ptr->dev == ptr2->dev && + ptr->gen == ptr2->gen && + ptr->offset >= ptr2->offset && + ptr->offset < ptr2->offset + le16_to_cpu(v->sectors)) + return i; + } + + return -1; +} + +static int extent_matches_stripe(struct bch_fs *c, + struct bch_stripe *v, + struct bkey_s_c k) +{ + struct bkey_s_c_extent e; + const struct bch_extent_ptr *ptr; + int idx; + + if (!bkey_extent_is_data(k.k)) + return -1; + + e = bkey_s_c_to_extent(k); + + extent_for_each_ptr(e, ptr) { + idx = ptr_matches_stripe(c, v, ptr); + if (idx >= 0) + return idx; + } + + return -1; +} + +static void ec_stripe_key_init(struct bch_fs *c, + struct bkey_i_stripe *s, + struct open_buckets *blocks, + struct open_buckets *parity, + unsigned stripe_size) +{ + struct open_bucket *ob; + unsigned i, u64s; + + bkey_stripe_init(&s->k_i); + s->v.sectors = cpu_to_le16(stripe_size); + s->v.algorithm = 0; + s->v.nr_blocks = parity->nr + blocks->nr; + s->v.nr_redundant = parity->nr; + s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); + s->v.csum_type = BCH_CSUM_CRC32C; + s->v.pad = 0; + + open_bucket_for_each(c, blocks, ob, i) + s->v.ptrs[i] = ob->ptr; + + open_bucket_for_each(c, parity, ob, i) + s->v.ptrs[blocks->nr + i] = ob->ptr; + + while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { + BUG_ON(1 << s->v.csum_granularity_bits >= + le16_to_cpu(s->v.sectors) || + s->v.csum_granularity_bits == U8_MAX); + s->v.csum_granularity_bits++; + } + + set_bkey_val_u64s(&s->k, u64s); +} + +/* Checksumming: */ + +static void ec_generate_checksums(struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &buf->key.v; + unsigned csum_granularity = 1 << v->csum_granularity_bits; + unsigned csums_per_device = stripe_csums_per_device(v); + unsigned csum_bytes = bch_crc_bytes[v->csum_type]; + unsigned i, j; + + if (!csum_bytes) + return; + + BUG_ON(buf->offset); + BUG_ON(buf->size != le16_to_cpu(v->sectors)); + + for (i = 0; i < v->nr_blocks; i++) { + for (j = 0; j < csums_per_device; j++) { + unsigned offset = j << v->csum_granularity_bits; + unsigned len = min(csum_granularity, buf->size - offset); + + struct bch_csum csum = + bch2_checksum(NULL, v->csum_type, + null_nonce(), + buf->data[i] + (offset << 9), + len << 9); + + memcpy(stripe_csum(v, i, j), &csum, csum_bytes); + } + } +} + +static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &buf->key.v; + unsigned csum_granularity = 1 << v->csum_granularity_bits; + unsigned csum_bytes = bch_crc_bytes[v->csum_type]; + unsigned i; + + if (!csum_bytes) + return; + + for (i = 0; i < v->nr_blocks; i++) { + unsigned offset = buf->offset; + unsigned end = buf->offset + buf->size; + + if (!test_bit(i, buf->valid)) + continue; + + while (offset < end) { + unsigned j = offset >> v->csum_granularity_bits; + unsigned len = min(csum_granularity, end - offset); + struct bch_csum csum; + + BUG_ON(offset & (csum_granularity - 1)); + BUG_ON(offset + len != le16_to_cpu(v->sectors) && + ((offset + len) & (csum_granularity - 1))); + + csum = bch2_checksum(NULL, v->csum_type, + null_nonce(), + buf->data[i] + ((offset - buf->offset) << 9), + len << 9); + + if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) { + 
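/*
 * Note: a mismatch only flags the block: the clear_bit() below drops it
 * from buf->valid, so the reconstruct path (ec_do_recov()) treats it as a
 * failed block and recovers the requested data from the remaining good
 * blocks, provided no more than nr_redundant blocks have failed.
 */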
__bcache_io_error(c, + "checksum error while doing reconstruct read (%u:%u)", + i, j); + clear_bit(i, buf->valid); + break; + } + + offset += len; + } + } +} + +/* Erasure coding: */ + +static void ec_generate_ec(struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &buf->key.v; + unsigned nr_data = v->nr_blocks - v->nr_redundant; + unsigned bytes = le16_to_cpu(v->sectors) << 9; + + raid_gen(nr_data, v->nr_redundant, bytes, buf->data); +} + +static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr) +{ + return nr - bitmap_weight(buf->valid, nr); +} + +static unsigned ec_nr_failed(struct ec_stripe_buf *buf) +{ + return __ec_nr_failed(buf, buf->key.v.nr_blocks); +} + +static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &buf->key.v; + unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0; + unsigned nr_data = v->nr_blocks - v->nr_redundant; + unsigned bytes = buf->size << 9; + + if (ec_nr_failed(buf) > v->nr_redundant) { + __bcache_io_error(c, + "error doing reconstruct read: unable to read enough blocks"); + return -1; + } + + for (i = 0; i < nr_data; i++) + if (!test_bit(i, buf->valid)) + failed[nr_failed++] = i; + + raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data); + return 0; +} + +/* IO: */ + +static void ec_block_endio(struct bio *bio) +{ + struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); + struct bch_dev *ca = ec_bio->ca; + struct closure *cl = bio->bi_private; + + if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding")) + clear_bit(ec_bio->idx, ec_bio->buf->valid); + + bio_put(&ec_bio->bio); + percpu_ref_put(&ca->io_ref); + closure_put(cl); +} + +static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, + unsigned rw, unsigned idx, struct closure *cl) +{ + struct bch_stripe *v = &buf->key.v; + unsigned offset = 0, bytes = buf->size << 9; + struct bch_extent_ptr *ptr = &v->ptrs[idx]; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + if (!bch2_dev_get_ioref(ca, rw)) { + clear_bit(idx, buf->valid); + return; + } + + while (offset < bytes) { + unsigned nr_iovecs = min_t(size_t, BIO_MAX_PAGES, + DIV_ROUND_UP(bytes, PAGE_SIZE)); + unsigned b = min_t(size_t, bytes - offset, + nr_iovecs << PAGE_SHIFT); + struct ec_bio *ec_bio; + + ec_bio = container_of(bio_alloc_bioset(GFP_KERNEL, nr_iovecs, + &c->ec_bioset), + struct ec_bio, bio); + + ec_bio->ca = ca; + ec_bio->buf = buf; + ec_bio->idx = idx; + + bio_set_dev(&ec_bio->bio, ca->disk_sb.bdev); + bio_set_op_attrs(&ec_bio->bio, rw, 0); + + ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); + ec_bio->bio.bi_iter.bi_size = b; + ec_bio->bio.bi_end_io = ec_block_endio; + ec_bio->bio.bi_private = cl; + + bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset); + + closure_get(cl); + percpu_ref_get(&ca->io_ref); + + submit_bio(&ec_bio->bio); + + offset += b; + } + + percpu_ref_put(&ca->io_ref); +} + +/* recovery read path: */ +int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) +{ + struct btree_iter iter; + struct ec_stripe_buf *buf; + struct closure cl; + struct bkey_s_c k; + struct bch_stripe *v; + unsigned stripe_idx; + unsigned offset, end; + unsigned i, nr_data, csum_granularity; + int ret = 0, idx; + + closure_init_stack(&cl); + + BUG_ON(!rbio->pick.idx || + rbio->pick.idx - 1 >= rbio->pick.ec_nr); + + stripe_idx = rbio->pick.ec[rbio->pick.idx - 1].idx; + + buf = kzalloc(sizeof(*buf), GFP_NOIO); + if (!buf) + return -ENOMEM; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EC, + POS(0, stripe_idx), 
+ BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(&iter); + if (btree_iter_err(k) || k.k->type != BCH_STRIPE) { + __bcache_io_error(c, + "error doing reconstruct read: stripe not found"); + kfree(buf); + return bch2_btree_iter_unlock(&iter) ?: -EIO; + } + + bkey_reassemble(&buf->key.k_i, k); + bch2_btree_iter_unlock(&iter); + + v = &buf->key.v; + + nr_data = v->nr_blocks - v->nr_redundant; + + idx = ptr_matches_stripe(c, v, &rbio->pick.ptr); + BUG_ON(idx < 0); + + csum_granularity = 1U << v->csum_granularity_bits; + + offset = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset; + end = offset + bio_sectors(&rbio->bio); + + BUG_ON(end > le16_to_cpu(v->sectors)); + + buf->offset = round_down(offset, csum_granularity); + buf->size = min_t(unsigned, le16_to_cpu(v->sectors), + round_up(end, csum_granularity)) - buf->offset; + + for (i = 0; i < v->nr_blocks; i++) { + buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO); + if (!buf->data[i]) { + ret = -ENOMEM; + goto err; + } + } + + memset(buf->valid, 0xFF, sizeof(buf->valid)); + + for (i = 0; i < v->nr_blocks; i++) { + struct bch_extent_ptr *ptr = v->ptrs + i; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + if (ptr_stale(ca, ptr)) { + __bcache_io_error(c, + "error doing reconstruct read: stale pointer"); + clear_bit(i, buf->valid); + continue; + } + + ec_block_io(c, buf, REQ_OP_READ, i, &cl); + } + + closure_sync(&cl); + + if (ec_nr_failed(buf) > v->nr_redundant) { + __bcache_io_error(c, + "error doing reconstruct read: unable to read enough blocks"); + ret = -EIO; + goto err; + } + + ec_validate_checksums(c, buf); + + ret = ec_do_recov(c, buf); + if (ret) + goto err; + + memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, + buf->data[idx] + ((offset - buf->offset) << 9)); +err: + for (i = 0; i < v->nr_blocks; i++) + kfree(buf->data[i]); + kfree(buf); + return ret; +} + +/* ec_stripe bucket accounting: */ + +static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) +{ + ec_stripes_heap n, *h = &c->ec_stripes_heap; + + if (idx >= h->size) { + if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) + return -ENOMEM; + + spin_lock(&c->ec_stripes_heap_lock); + if (n.size > h->size) { + memcpy(n.data, h->data, h->used * sizeof(h->data[0])); + n.used = h->used; + swap(*h, n); + } + spin_unlock(&c->ec_stripes_heap_lock); + + free_heap(&n); + } + + if (!genradix_ptr_alloc(&c->ec_stripes, idx, gfp)) + return -ENOMEM; + + return 0; +} + +static int ec_stripe_mem_alloc(struct bch_fs *c, + struct btree_iter *iter) +{ + size_t idx = iter->pos.offset; + + if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT)) + return 0; + + bch2_btree_iter_unlock(iter); + + if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL)) + return -EINTR; + return -ENOMEM; +} + +static ssize_t stripe_idx_to_delete(struct bch_fs *c) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + + return h->data[0].blocks_nonempty == 0 ? 
h->data[0].idx : -1; +} + +static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, + struct ec_stripe_heap_entry l, + struct ec_stripe_heap_entry r) +{ + return ((l.blocks_nonempty > r.blocks_nonempty) - + (l.blocks_nonempty < r.blocks_nonempty)); +} + +static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, + size_t i) +{ + struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); + + genradix_ptr(&c->ec_stripes, h->data[i].idx)->heap_idx = i; +} + +static void heap_verify_backpointer(struct bch_fs *c, size_t idx) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + struct ec_stripe *m = genradix_ptr(&c->ec_stripes, idx); + + BUG_ON(!m->alive); + BUG_ON(m->heap_idx >= h->used); + BUG_ON(h->data[m->heap_idx].idx != idx); +} + +static inline unsigned stripe_entry_blocks(struct ec_stripe *m) +{ + return atomic_read(&m->pin) + ? UINT_MAX : atomic_read(&m->blocks_nonempty); +} + +void bch2_stripes_heap_update(struct bch_fs *c, + struct ec_stripe *m, size_t idx) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + bool queue_delete; + size_t i; + + spin_lock(&c->ec_stripes_heap_lock); + + if (!m->alive) { + spin_unlock(&c->ec_stripes_heap_lock); + return; + } + + heap_verify_backpointer(c, idx); + + h->data[m->heap_idx].blocks_nonempty = + stripe_entry_blocks(m); + + i = m->heap_idx; + heap_sift_up(h, i, ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + heap_sift_down(h, i, ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + + heap_verify_backpointer(c, idx); + + queue_delete = stripe_idx_to_delete(c) >= 0; + spin_unlock(&c->ec_stripes_heap_lock); + + if (queue_delete) + schedule_work(&c->ec_stripe_delete_work); +} + +void bch2_stripes_heap_del(struct bch_fs *c, + struct ec_stripe *m, size_t idx) +{ + spin_lock(&c->ec_stripes_heap_lock); + heap_verify_backpointer(c, idx); + + m->alive = false; + heap_del(&c->ec_stripes_heap, m->heap_idx, + ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + spin_unlock(&c->ec_stripes_heap_lock); +} + +void bch2_stripes_heap_insert(struct bch_fs *c, + struct ec_stripe *m, size_t idx) +{ + spin_lock(&c->ec_stripes_heap_lock); + + BUG_ON(heap_full(&c->ec_stripes_heap)); + + heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { + .idx = idx, + .blocks_nonempty = stripe_entry_blocks(m), + }), + ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + m->alive = true; + + heap_verify_backpointer(c, idx); + + spin_unlock(&c->ec_stripes_heap_lock); +} + +static void ec_stripe_delete(struct bch_fs *c, unsigned idx) +{ + struct btree_iter iter; + struct bch_stripe *v = NULL; + struct bkey_s_c k; + struct bkey_i delete; + u64 journal_seq = 0; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EC, + POS(0, idx), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + if (btree_iter_err(k) || k.k->type != BCH_STRIPE) + goto out; + + v = kmalloc(bkey_val_bytes(k.k), GFP_KERNEL); + BUG_ON(!v); + memcpy(v, bkey_s_c_to_stripe(k).v, bkey_val_bytes(k.k)); + + bkey_init(&delete.k); + delete.k.p = iter.pos; + + bch2_btree_insert_at(c, NULL, &journal_seq, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_NOUNLOCK, + BTREE_INSERT_ENTRY(&iter, &delete)); +out: + bch2_btree_iter_unlock(&iter); + kfree(v); +} + +static void ec_stripe_delete_work(struct work_struct *work) +{ + struct bch_fs *c = + container_of(work, struct bch_fs, ec_stripe_delete_work); + ssize_t idx; + + down_read(&c->gc_lock); + + while (1) { + spin_lock(&c->ec_stripes_heap_lock); + idx = stripe_idx_to_delete(c); + 
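/*
 * The stripes heap is ordered by blocks_nonempty (pinned stripes report
 * UINT_MAX via stripe_entry_blocks()), so the root is the stripe with the
 * fewest non-empty blocks: deletion only ever looks at the root, and only
 * reclaims it once blocks_nonempty reaches zero. idx < 0 means there is
 * currently nothing to reclaim.
 */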
spin_unlock(&c->ec_stripes_heap_lock); + + if (idx < 0) + break; + + ec_stripe_delete(c, idx); + } + + up_read(&c->gc_lock); +} + +static int ec_stripe_bkey_insert(struct bch_fs *c, + struct bkey_i_stripe *stripe) +{ + struct ec_stripe *m; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + /* XXX: start pos hint */ +retry: + for_each_btree_key(&iter, c, BTREE_ID_EC, POS_MIN, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) { + if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { + bch2_btree_iter_unlock(&iter); + return -ENOSPC; + } + + if (bkey_deleted(k.k)) + goto found_slot; + } + + return bch2_btree_iter_unlock(&iter) ?: -ENOSPC; +found_slot: + mutex_lock(&c->ec_stripes_lock); + ret = ec_stripe_mem_alloc(c, &iter); + mutex_unlock(&c->ec_stripes_lock); + + if (ret == -EINTR) + goto retry; + if (ret) + return ret; + + m = genradix_ptr(&c->ec_stripes, iter.pos.offset); + atomic_inc(&m->pin); + + stripe->k.p = iter.pos; + + ret = bch2_btree_insert_at(c, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE, + BTREE_INSERT_ENTRY(&iter, &stripe->k_i)); + bch2_btree_iter_unlock(&iter); + + if (ret) + atomic_dec(&m->pin); + + return ret; +} + +/* stripe creation: */ + +static void extent_stripe_ptr_add(struct bkey_s_extent e, + struct ec_stripe_buf *s, + struct bch_extent_ptr *ptr, + unsigned block) +{ + struct bch_extent_stripe_ptr *dst = (void *) ptr; + union bch_extent_entry *end = extent_entry_last(e); + + memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst); + e.k->u64s += sizeof(*dst) / sizeof(u64); + + *dst = (struct bch_extent_stripe_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, + .block = block, + .idx = s->key.k.p.offset, + }; +} + +static int ec_stripe_update_ptrs(struct bch_fs *c, + struct ec_stripe_buf *s, + struct bkey *pos) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_extent e; + struct bch_extent_ptr *ptr; + BKEY_PADDED(k) tmp; + int ret = 0, dev, idx; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, + bkey_start_pos(pos), + BTREE_ITER_INTENT); + + while ((k = bch2_btree_iter_peek(&iter)).k && + !btree_iter_err(k) && + bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { + idx = extent_matches_stripe(c, &s->key.v, k); + if (idx < 0) { + bch2_btree_iter_next(&iter); + continue; + } + + dev = s->key.v.ptrs[idx].dev; + + bkey_reassemble(&tmp.k, k); + e = bkey_i_to_s_extent(&tmp.k); + + extent_for_each_ptr(e, ptr) + if (ptr->dev != dev) + ptr->cached = true; + + ptr = (void *) bch2_extent_has_device(e.c, dev); + BUG_ON(!ptr); + + extent_stripe_ptr_add(e, s, ptr, idx); + + ret = bch2_btree_insert_at(c, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE, + BTREE_INSERT_ENTRY(&iter, &tmp.k)); + if (ret == -EINTR) + ret = 0; + if (ret) + break; + } + + return bch2_btree_iter_unlock(&iter) ?: ret; +} + +/* + * data buckets of new stripe all written: create the stripe + */ +static void ec_stripe_create(struct ec_stripe_new *s) +{ + struct ec_stripe *ec_stripe; + struct bch_fs *c = s->c; + struct open_bucket *ob; + struct bkey_i *k; + struct bch_stripe *v = &s->stripe.key.v; + unsigned i, nr_data = v->nr_blocks - v->nr_redundant; + struct closure cl; + int ret; + + BUG_ON(s->h->s == s); + + closure_init_stack(&cl); + + if (s->err) { + bch_err(c, "error creating stripe: error writing data buckets"); + goto err; + } + + if (!percpu_ref_tryget(&c->writes)) + goto err; + + BUG_ON(bitmap_weight(s->blocks_allocated, + s->blocks.nr) != s->blocks.nr); + + ec_generate_ec(&s->stripe); + + ec_generate_checksums(&s->stripe); + + /* write 
p/q: */ + for (i = nr_data; i < v->nr_blocks; i++) + ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl); + + closure_sync(&cl); + + for (i = nr_data; i < v->nr_blocks; i++) + if (!test_bit(i, s->stripe.valid)) { + bch_err(c, "error creating stripe: error writing redundancy buckets"); + goto err_put_writes; + } + + ret = ec_stripe_bkey_insert(c, &s->stripe.key); + if (ret) { + bch_err(c, "error creating stripe: error creating stripe key"); + goto err_put_writes; + } + + for_each_keylist_key(&s->keys, k) { + ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k); + if (ret) + break; + } + + ec_stripe = genradix_ptr(&c->ec_stripes, s->stripe.key.k.p.offset); + + atomic_dec(&ec_stripe->pin); + bch2_stripes_heap_update(c, ec_stripe, + s->stripe.key.k.p.offset); + +err_put_writes: + percpu_ref_put(&c->writes); +err: + open_bucket_for_each(c, &s->blocks, ob, i) { + ob->ec = NULL; + __bch2_open_bucket_put(c, ob); + } + + bch2_open_buckets_put(c, &s->parity); + + bch2_keylist_free(&s->keys, s->inline_keys); + + mutex_lock(&s->h->lock); + list_del(&s->list); + mutex_unlock(&s->h->lock); + + for (i = 0; i < s->stripe.key.v.nr_blocks; i++) + kvpfree(s->stripe.data[i], s->stripe.size << 9); + kfree(s); +} + +static struct ec_stripe_new *ec_stripe_set_pending(struct ec_stripe_head *h) +{ + struct ec_stripe_new *s = h->s; + + list_add(&s->list, &h->stripes); + h->s = NULL; + + return s; +} + +static void ec_stripe_new_put(struct ec_stripe_new *s) +{ + BUG_ON(atomic_read(&s->pin) <= 0); + if (atomic_dec_and_test(&s->pin)) + ec_stripe_create(s); +} + +/* have a full bucket - hand it off to be erasure coded: */ +void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob) +{ + struct ec_stripe_new *s = ob->ec; + + if (ob->sectors_free) + s->err = -1; + + ec_stripe_new_put(s); +} + +void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) +{ + struct ec_stripe_new *s = ob->ec; + + s->err = -EIO; +} + +void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) +{ + struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); + struct bch_dev *ca; + unsigned offset; + + if (!ob) + return NULL; + + ca = bch_dev_bkey_exists(c, ob->ptr.dev); + offset = ca->mi.bucket_size - ob->sectors_free; + + return ob->ec->stripe.data[ob->ec_idx] + (offset << 9); +} + +void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, + struct bpos pos, unsigned sectors) +{ + struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); + struct ec_stripe_new *ec; + + if (!ob) + return; + + ec = ob->ec; + mutex_lock(&ec->lock); + + if (bch2_keylist_realloc(&ec->keys, ec->inline_keys, + ARRAY_SIZE(ec->inline_keys), + BKEY_U64s)) { + BUG(); + } + + bkey_init(&ec->keys.top->k); + ec->keys.top->k.p = pos; + bch2_key_resize(&ec->keys.top->k, sectors); + bch2_keylist_push(&ec->keys); + + mutex_unlock(&ec->lock); +} + +static int unsigned_cmp(const void *_l, const void *_r) +{ + unsigned l = *((const unsigned *) _l); + unsigned r = *((const unsigned *) _r); + + return (l > r) - (l < r); +} + +/* pick most common bucket size: */ +static unsigned pick_blocksize(struct bch_fs *c, + struct bch_devs_mask *devs) +{ + struct bch_dev *ca; + unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX]; + struct { + unsigned nr, size; + } cur = { 0, 0 }, best = { 0, 0 }; + + for_each_member_device_rcu(ca, c, i, devs) + sizes[nr++] = ca->mi.bucket_size; + + sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); + + for (i = 0; i < nr; i++) { + if (sizes[i] != cur.size) { + if (cur.nr > best.nr) + best = cur; + + cur.nr = 0; + cur.size = 
sizes[i]; + } + + cur.nr++; + } + + if (cur.nr > best.nr) + best = cur; + + return best.size; +} + +int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h) +{ + struct ec_stripe_new *s; + unsigned i; + + BUG_ON(h->parity.nr != h->redundancy); + BUG_ON(!h->blocks.nr); + BUG_ON(h->parity.nr + h->blocks.nr > EC_STRIPE_MAX); + lockdep_assert_held(&h->lock); + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + mutex_init(&s->lock); + atomic_set(&s->pin, 1); + s->c = c; + s->h = h; + s->blocks = h->blocks; + s->parity = h->parity; + + memset(&h->blocks, 0, sizeof(h->blocks)); + memset(&h->parity, 0, sizeof(h->parity)); + + bch2_keylist_init(&s->keys, s->inline_keys); + + s->stripe.offset = 0; + s->stripe.size = h->blocksize; + memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid)); + + ec_stripe_key_init(c, &s->stripe.key, + &s->blocks, &s->parity, + h->blocksize); + + for (i = 0; i < s->stripe.key.v.nr_blocks; i++) { + s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL); + if (!s->stripe.data[i]) + goto err; + } + + h->s = s; + + return 0; +err: + for (i = 0; i < s->stripe.key.v.nr_blocks; i++) + kvpfree(s->stripe.data[i], s->stripe.size << 9); + kfree(s); + return -ENOMEM; +} + +static struct ec_stripe_head * +ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, + unsigned algo, unsigned redundancy) +{ + struct ec_stripe_head *h; + struct bch_dev *ca; + unsigned i; + + h = kzalloc(sizeof(*h), GFP_KERNEL); + if (!h) + return NULL; + + mutex_init(&h->lock); + mutex_lock(&h->lock); + INIT_LIST_HEAD(&h->stripes); + + h->target = target; + h->algo = algo; + h->redundancy = redundancy; + + rcu_read_lock(); + h->devs = target_rw_devs(c, BCH_DATA_USER, target); + + for_each_member_device_rcu(ca, c, i, &h->devs) + if (!ca->mi.durability) + __clear_bit(i, h->devs.d); + + h->blocksize = pick_blocksize(c, &h->devs); + + for_each_member_device_rcu(ca, c, i, &h->devs) + if (ca->mi.bucket_size == h->blocksize) + h->nr_active_devs++; + + rcu_read_unlock(); + list_add(&h->list, &c->ec_new_stripe_list); + return h; +} + +void bch2_ec_stripe_head_put(struct ec_stripe_head *h) +{ + struct ec_stripe_new *s = NULL; + + if (h->s && + bitmap_weight(h->s->blocks_allocated, + h->s->blocks.nr) == h->s->blocks.nr) + s = ec_stripe_set_pending(h); + + mutex_unlock(&h->lock); + + if (s) + ec_stripe_new_put(s); +} + +struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + unsigned target, + unsigned algo, + unsigned redundancy) +{ + struct ec_stripe_head *h; + + if (!redundancy) + return NULL; + + mutex_lock(&c->ec_new_stripe_lock); + list_for_each_entry(h, &c->ec_new_stripe_list, list) + if (h->target == target && + h->algo == algo && + h->redundancy == redundancy) { + mutex_lock(&h->lock); + goto found; + } + + h = ec_new_stripe_head_alloc(c, target, algo, redundancy); +found: + mutex_unlock(&c->ec_new_stripe_lock); + return h; +} + +void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) +{ + struct ec_stripe_head *h; + struct open_bucket *ob; + unsigned i; + + mutex_lock(&c->ec_new_stripe_lock); + list_for_each_entry(h, &c->ec_new_stripe_list, list) { + struct ec_stripe_new *s = NULL; + + mutex_lock(&h->lock); + bch2_open_buckets_stop_dev(c, ca, + &h->blocks, + BCH_DATA_USER); + bch2_open_buckets_stop_dev(c, ca, + &h->parity, + BCH_DATA_USER); + + if (!h->s) + goto unlock; + + open_bucket_for_each(c, &h->s->blocks, ob, i) + if (ob->ptr.dev == ca->dev_idx) + goto found; + open_bucket_for_each(c, &h->s->parity, ob, i) + if (ob->ptr.dev == ca->dev_idx) + 
goto found; + goto unlock; +found: + h->s->err = -1; + s = ec_stripe_set_pending(h); +unlock: + mutex_unlock(&h->lock); + + if (s) + ec_stripe_new_put(s); + } + mutex_unlock(&c->ec_new_stripe_lock); +} + +int bch2_fs_ec_start(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + size_t i, idx = 0; + int ret = 0; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EC, POS(0, U64_MAX), 0); + + k = bch2_btree_iter_prev(&iter); + if (!IS_ERR_OR_NULL(k.k)) + idx = k.k->p.offset + 1; + ret = bch2_btree_iter_unlock(&iter); + if (ret) + return ret; + + if (!init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx), + GFP_KERNEL)) + return -ENOMEM; +#if 0 + ret = genradix_prealloc(&c->ec_stripes, idx, GFP_KERNEL); +#else + for (i = 0; i < idx; i++) + if (!genradix_ptr_alloc(&c->ec_stripes, i, GFP_KERNEL)) + return -ENOMEM; +#endif + return 0; +} + +void bch2_fs_ec_exit(struct bch_fs *c) +{ + struct ec_stripe_head *h; + + while (1) { + mutex_lock(&c->ec_new_stripe_lock); + h = list_first_entry_or_null(&c->ec_new_stripe_list, + struct ec_stripe_head, list); + if (h) + list_del(&h->list); + mutex_unlock(&c->ec_new_stripe_lock); + if (!h) + break; + + BUG_ON(h->s); + BUG_ON(!list_empty(&h->stripes)); + kfree(h); + } + + free_heap(&c->ec_stripes_heap); + genradix_free(&c->ec_stripes); + bioset_exit(&c->ec_bioset); +} + +int bch2_fs_ec_init(struct bch_fs *c) +{ + INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); + + return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), + BIOSET_NEED_BVECS); +} diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h new file mode 100644 index 0000000..13b875a --- /dev/null +++ b/libbcachefs/ec.h @@ -0,0 +1,108 @@ +#ifndef _BCACHEFS_EC_H +#define _BCACHEFS_EC_H + +#include "ec_types.h" +#include "keylist_types.h" + +const char *bch2_ec_key_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_ec_key_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + +#define bch2_bkey_ec_ops (struct bkey_ops) { \ + .key_invalid = bch2_ec_key_invalid, \ + .val_to_text = bch2_ec_key_to_text, \ +} + +struct bch_read_bio; + +struct ec_stripe_buf { + /* might not be buffering the entire stripe: */ + unsigned offset; + unsigned size; + unsigned long valid[BITS_TO_LONGS(EC_STRIPE_MAX)]; + + void *data[EC_STRIPE_MAX]; + + union { + struct bkey_i_stripe key; + u64 pad[255]; + }; +}; + +struct ec_stripe_head; + +struct ec_stripe_new { + struct bch_fs *c; + struct ec_stripe_head *h; + struct mutex lock; + struct list_head list; + + /* counts in flight writes, stripe is created when pin == 0 */ + atomic_t pin; + + int err; + + unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)]; + + struct open_buckets blocks; + struct open_buckets parity; + + struct keylist keys; + u64 inline_keys[BKEY_U64s * 8]; + + struct ec_stripe_buf stripe; +}; + +struct ec_stripe_head { + struct list_head list; + struct mutex lock; + + struct list_head stripes; + + unsigned target; + unsigned algo; + unsigned redundancy; + + struct bch_devs_mask devs; + unsigned nr_active_devs; + + unsigned blocksize; + + struct dev_stripe_state block_stripe; + struct dev_stripe_state parity_stripe; + + struct open_buckets blocks; + struct open_buckets parity; + + struct ec_stripe_new *s; +}; + +int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); + +void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); +void bch2_ec_add_backpointer(struct bch_fs *, struct write_point *, + struct bpos, unsigned); + +void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *); 
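/*
 * Rough shape of the intended write-path usage of these hooks. This is an
 * illustrative sketch only, not code from this patch: wp, target,
 * redundancy, src, pos, sectors and the fallback label are placeholder
 * names, and the real callers (presumably wired up via the allocator and
 * write paths touched elsewhere in this patch, e.g. alloc_foreground.c and
 * io.c) differ in detail:
 *
 *	struct ec_stripe_head *h =
 *		bch2_ec_stripe_head_get(c, target, 0, redundancy);
 *
 *	if (h && !h->s && bch2_ec_stripe_new_alloc(c, h))
 *		goto fallback;	// no memory: write without erasure coding
 *
 *	// data going through the writepoint is also copied into the
 *	// in-memory stripe buffer, and each extent records a backpointer:
 *	void *buf = bch2_writepoint_ec_buf(c, wp);
 *	if (buf)
 *		memcpy(buf, src, sectors << 9);
 *	bch2_ec_add_backpointer(c, wp, pos, sectors);
 *
 *	bch2_ec_stripe_head_put(h);
 *
 * Each data bucket holds a reference on the new stripe (ec_stripe_new.pin);
 * bch2_ec_bucket_written() drops it when the bucket is full, and once the
 * last reference goes ec_stripe_create() computes and writes the parity
 * blocks and inserts the stripe key.
 */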
+void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); + +int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); + +void bch2_ec_stripe_head_put(struct ec_stripe_head *); +struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned, + unsigned, unsigned); + +void bch2_stripes_heap_update(struct bch_fs *, struct ec_stripe *, size_t); +void bch2_stripes_heap_del(struct bch_fs *, struct ec_stripe *, size_t); +void bch2_stripes_heap_insert(struct bch_fs *, struct ec_stripe *, size_t); + +void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); + +void bch2_ec_flush_new_stripes(struct bch_fs *); + +int bch2_fs_ec_start(struct bch_fs *); + +void bch2_fs_ec_exit(struct bch_fs *); +int bch2_fs_ec_init(struct bch_fs *); + +#endif /* _BCACHEFS_EC_H */ diff --git a/libbcachefs/ec_types.h b/libbcachefs/ec_types.h new file mode 100644 index 0000000..feb3601 --- /dev/null +++ b/libbcachefs/ec_types.h @@ -0,0 +1,30 @@ +#ifndef _BCACHEFS_EC_TYPES_H +#define _BCACHEFS_EC_TYPES_H + +#include + +#define EC_STRIPE_MAX 16 + +struct ec_stripe { + size_t heap_idx; + + u16 sectors; + u8 algorithm; + + u8 nr_blocks; + u8 nr_redundant; + + u8 alive; + atomic_t pin; + atomic_t blocks_nonempty; + atomic_t block_sectors[EC_STRIPE_MAX]; +}; + +struct ec_stripe_heap_entry { + size_t idx; + unsigned blocks_nonempty; +}; + +typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap; + +#endif /* _BCACHEFS_EC_TYPES_H */ diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index a3ec1cc..ebaf390 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -193,29 +193,41 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k) return nr_ptrs; } -unsigned bch2_extent_ptr_durability(struct bch_fs *c, - const struct bch_extent_ptr *ptr) +static unsigned bch2_extent_ptr_durability(struct bch_fs *c, + struct extent_ptr_decoded p) { + unsigned i, durability = 0; struct bch_dev *ca; - if (ptr->cached) + if (p.ptr.cached) return 0; - ca = bch_dev_bkey_exists(c, ptr->dev); + ca = bch_dev_bkey_exists(c, p.ptr.dev); - if (ca->mi.state == BCH_MEMBER_STATE_FAILED) - return 0; + if (ca->mi.state != BCH_MEMBER_STATE_FAILED) + durability = max_t(unsigned, durability, ca->mi.durability); + + for (i = 0; i < p.ec_nr; i++) { + struct ec_stripe *s = + genradix_ptr(&c->ec_stripes, p.idx); - return ca->mi.durability; + if (WARN_ON(!s)) + continue; + + durability = max_t(unsigned, durability, s->nr_redundant); + } + + return durability; } unsigned bch2_extent_durability(struct bch_fs *c, struct bkey_s_c_extent e) { - const struct bch_extent_ptr *ptr; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; unsigned durability = 0; - extent_for_each_ptr(e, ptr) - durability += bch2_extent_ptr_durability(c, ptr); + extent_for_each_ptr_decode(e, p, entry) + durability += bch2_extent_ptr_durability(c, p); return durability; } @@ -258,30 +270,46 @@ bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e, return false; } +static union bch_extent_entry *extent_entry_prev(struct bkey_s_extent e, + union bch_extent_entry *entry) +{ + union bch_extent_entry *i = e.v->start; + + if (i == entry) + return NULL; + + while (extent_entry_next(i) != entry) + i = extent_entry_next(i); + return i; +} + union bch_extent_entry *bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr) { - union bch_extent_entry *dst; - union bch_extent_entry *src; + union bch_extent_entry *dst, *src, *prev; + bool drop_crc = true; EBUG_ON(ptr < &e.v->start->ptr || ptr >= 
&extent_entry_last(e)->ptr); EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); - src = to_entry(ptr + 1); - + src = extent_entry_next(to_entry(ptr)); if (src != extent_entry_last(e) && - extent_entry_type(src) == BCH_EXTENT_ENTRY_ptr) { - dst = to_entry(ptr); - } else { - extent_for_each_entry(e, dst) { - if (dst == to_entry(ptr)) - break; + !extent_entry_is_crc(src)) + drop_crc = false; - if (extent_entry_next(dst) == to_entry(ptr) && - extent_entry_is_crc(dst)) - break; + dst = to_entry(ptr); + while ((prev = extent_entry_prev(e, dst))) { + if (extent_entry_is_ptr(prev)) + break; + + if (extent_entry_is_crc(prev)) { + if (drop_crc) + dst = prev; + break; } + + dst = prev; } memmove_u64s_down(dst, src, @@ -423,6 +451,8 @@ void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k) entry->crc128.csum.lo = (__force __le64) swab64((__force u64) entry->crc128.csum.lo); break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; } } break; @@ -470,6 +500,7 @@ static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c, const union bch_extent_entry *entry; struct bch_extent_crc_unpacked crc; const struct bch_extent_ptr *ptr; + const struct bch_extent_stripe_ptr *ec; struct bch_dev *ca; bool first = true; @@ -478,6 +509,18 @@ static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c, pr_buf(out, " "); switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + ptr = entry_to_ptr(entry); + ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + ? bch_dev_bkey_exists(c, ptr->dev) + : NULL; + + pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + ptr->cached ? " cached" : "", + ca && ptr_stale(ca, ptr) + ? " stale" : ""); + break; case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: case BCH_EXTENT_ENTRY_crc128: @@ -490,17 +533,11 @@ static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c, crc.csum_type, crc.compression_type); break; - case BCH_EXTENT_ENTRY_ptr: - ptr = entry_to_ptr(entry); - ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] - ? bch_dev_bkey_exists(c, ptr->dev) - : NULL; + case BCH_EXTENT_ENTRY_stripe_ptr: + ec = &entry->stripe_ptr; - pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, - (u64) ptr->offset, ptr->gen, - ptr->cached ? " cached" : "", - ca && ptr_stale(ca, ptr) - ? 
" stale" : ""); + pr_buf(out, "ec: idx %llu block %u", + (u64) ec->idx, ec->block); break; default: pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); @@ -536,6 +573,11 @@ void bch2_mark_io_failure(struct bch_io_failures *failed, f = &failed->devs[failed->nr++]; f->dev = p->ptr.dev; + f->idx = p->idx; + f->nr_failed = 1; + f->nr_retries = 0; + } else if (p->idx != f->idx) { + f->idx = p->idx; f->nr_failed = 1; f->nr_retries = 0; } else { @@ -550,15 +592,22 @@ static inline bool ptr_better(struct bch_fs *c, const struct extent_ptr_decoded p1, const struct extent_ptr_decoded p2) { - struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); - struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); + if (likely(!p1.idx && !p2.idx)) { + struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); + struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); + + u64 l1 = atomic64_read(&dev1->cur_latency[READ]); + u64 l2 = atomic64_read(&dev2->cur_latency[READ]); - u64 l1 = atomic64_read(&dev1->cur_latency[READ]); - u64 l2 = atomic64_read(&dev2->cur_latency[READ]); + /* Pick at random, biased in favor of the faster device: */ + + return bch2_rand_range(l1 + l2) > l1; + } - /* Pick at random, biased in favor of the faster device: */ + if (force_reconstruct_read(c)) + return p1.idx > p2.idx; - return bch2_rand_range(l1 + l2) > l1; + return p1.idx < p2.idx; } static int extent_pick_read_device(struct bch_fs *c, @@ -579,7 +628,20 @@ static int extent_pick_read_device(struct bch_fs *c, continue; f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; - if (f && f->nr_failed >= f->nr_retries) + if (f) + p.idx = f->nr_failed < f->nr_retries + ? f->idx + : f->idx + 1; + + if (!p.idx && + !bch2_dev_is_readable(ca)) + p.idx++; + + if (force_reconstruct_read(c) && + !p.idx && p.ec_nr) + p.idx++; + + if (p.idx >= p.ec_nr + 1) continue; if (ret && !ptr_better(c, p, *pick)) @@ -616,8 +678,8 @@ const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) return "invalid extent entry type"; - if (extent_entry_is_crc(entry)) - return "has crc field"; + if (!extent_entry_is_ptr(entry)) + return "has non ptr field"; } extent_for_each_ptr(e, ptr) { @@ -754,6 +816,8 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k) case BCH_EXTENT_ENTRY_crc128: entry->crc128.offset += e.k->size - len; break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; } if (extent_entry_is_crc(entry)) @@ -1512,7 +1576,18 @@ const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) return "invalid extent entry type"; - if (extent_entry_is_crc(entry)) { + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + ptr = entry_to_ptr(entry); + + reason = extent_ptr_invalid(c, e, &entry->ptr, + size_ondisk, false); + if (reason) + return reason; + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry)); if (crc.offset + e.k->size > @@ -1533,13 +1608,9 @@ const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) else if (nonce != crc.offset + crc.nonce) return "incorrect nonce"; } - } else { - ptr = entry_to_ptr(entry); - - reason = extent_ptr_invalid(c, e, &entry->ptr, - size_ondisk, false); - if (reason) - return reason; + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; } } @@ -1744,6 +1815,7 @@ void bch2_extent_ptr_decoded_append(struct bkey_i_extent 
*e, { struct bch_extent_crc_unpacked crc; union bch_extent_entry *pos; + unsigned i; extent_for_each_crc(extent_i_to_s(e), crc, pos) if (!bch2_crc_unpacked_cmp(crc, p->crc)) @@ -1754,6 +1826,11 @@ void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e, found: p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; __extent_entry_insert(e, pos, to_entry(&p->ptr)); + + for (i = 0; i < p->ec_nr; i++) { + p->ec[i].type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; + __extent_entry_insert(e, pos, to_entry(&p->ec[i])); + } } /* @@ -1808,26 +1885,27 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c, unsigned target, unsigned nr_desired_replicas) { - struct bch_extent_ptr *ptr; + union bch_extent_entry *entry; + struct extent_ptr_decoded p; int extra = bch2_extent_durability(c, e.c) - nr_desired_replicas; if (target && extra > 0) - extent_for_each_ptr(e, ptr) { - int n = bch2_extent_ptr_durability(c, ptr); + extent_for_each_ptr_decode(e, p, entry) { + int n = bch2_extent_ptr_durability(c, p); if (n && n <= extra && - !bch2_dev_in_target(c, ptr->dev, target)) { - ptr->cached = true; + !bch2_dev_in_target(c, p.ptr.dev, target)) { + entry->ptr.cached = true; extra -= n; } } if (extra > 0) - extent_for_each_ptr(e, ptr) { - int n = bch2_extent_ptr_durability(c, ptr); + extent_for_each_ptr_decode(e, p, entry) { + int n = bch2_extent_ptr_durability(c, p); if (n && n <= extra) { - ptr->cached = true; + entry->ptr.cached = true; extra -= n; } } @@ -1903,7 +1981,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b, if ((extent_entry_type(en_l) != extent_entry_type(en_r)) || - extent_entry_is_crc(en_l)) + !extent_entry_is_ptr(en_l)) return BCH_MERGE_NOMERGE; lp = &en_l->ptr; diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 5b786cb..307abd2 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -95,8 +95,6 @@ unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent); unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c); unsigned bch2_extent_is_compressed(struct bkey_s_c); -unsigned bch2_extent_ptr_durability(struct bch_fs *, - const struct bch_extent_ptr *); unsigned bch2_extent_durability(struct bch_fs *, struct bkey_s_c_extent); bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent, @@ -361,20 +359,13 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) /* Iterate over pointers, with crcs: */ -static inline struct extent_ptr_decoded -__extent_ptr_decoded_init(const struct bkey *k) -{ - return (struct extent_ptr_decoded) { - .crc = bch2_extent_crc_unpack(k, NULL), - }; -} - -#define EXTENT_ITERATE_EC (1 << 0) - #define __extent_ptr_next_decode(_e, _ptr, _entry) \ ({ \ __label__ out; \ \ + (_ptr).idx = 0; \ + (_ptr).ec_nr = 0; \ + \ extent_for_each_entry_from(_e, _entry, _entry) \ switch (extent_entry_type(_entry)) { \ case BCH_EXTENT_ENTRY_ptr: \ @@ -386,14 +377,16 @@ __extent_ptr_decoded_init(const struct bkey *k) (_ptr).crc = bch2_extent_crc_unpack((_e).k, \ entry_to_crc(_entry)); \ break; \ + case BCH_EXTENT_ENTRY_stripe_ptr: \ + (_ptr).ec[(_ptr).ec_nr++] = _entry->stripe_ptr; \ + break; \ } \ - \ out: \ _entry < extent_entry_last(_e); \ }) #define extent_for_each_ptr_decode(_e, _ptr, _entry) \ - for ((_ptr) = __extent_ptr_decoded_init((_e).k), \ + for ((_ptr).crc = bch2_extent_crc_unpack((_e).k, NULL), \ (_entry) = (_e).v->start; \ __extent_ptr_next_decode(_e, _ptr, _entry); \ (_entry) = extent_entry_next(_entry)) diff --git a/libbcachefs/extents_types.h b/libbcachefs/extents_types.h index 02c6256..efd72e2 100644 --- 
a/libbcachefs/extents_types.h +++ b/libbcachefs/extents_types.h @@ -19,14 +19,18 @@ struct bch_extent_crc_unpacked { }; struct extent_ptr_decoded { + unsigned idx; + unsigned ec_nr; struct bch_extent_crc_unpacked crc; struct bch_extent_ptr ptr; + struct bch_extent_stripe_ptr ec[4]; }; struct bch_io_failures { u8 nr; struct bch_dev_io_failures { u8 dev; + u8 idx; u8 nr_failed; u8 nr_retries; } devs[BCH_REPLICAS_MAX]; diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 986bb7d..34cfd5d 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -454,12 +454,12 @@ struct bch_page_state { union { struct { /* existing data: */ unsigned sectors:PAGE_SECTOR_SHIFT + 1; + + /* Uncompressed, fully allocated replicas: */ unsigned nr_replicas:4; - unsigned compressed:1; - /* Owns PAGE_SECTORS sized reservation: */ - unsigned reserved:1; - unsigned reservation_replicas:4; + /* Owns PAGE_SECTORS * replicas_reserved sized reservation: */ + unsigned replicas_reserved:4; /* Owns PAGE_SECTORS sized quota reservation: */ unsigned quota_reserved:1; @@ -506,7 +506,7 @@ static inline struct bch_page_state *page_state(struct page *page) static inline unsigned page_res_sectors(struct bch_page_state s) { - return s.reserved ? s.reservation_replicas * PAGE_SECTORS : 0; + return s.replicas_reserved * PAGE_SECTORS; } static void __bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *inode, @@ -524,8 +524,10 @@ static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *i { struct bch_page_state s; + EBUG_ON(!PageLocked(page)); + s = page_state_cmpxchg(page_state(page), s, { - s.reserved = 0; + s.replicas_reserved = 0; s.quota_reserved = 0; }); @@ -535,62 +537,46 @@ static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *i static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *inode, struct page *page, bool check_enospc) { - struct bch_page_state *s = page_state(page), new, old; + struct bch_page_state *s = page_state(page), new; /* XXX: this should not be open coded */ unsigned nr_replicas = inode->ei_inode.bi_data_replicas ? inode->ei_inode.bi_data_replicas - 1 : c->opts.data_replicas; - - struct disk_reservation disk_res = bch2_disk_reservation_init(c, - nr_replicas); + struct disk_reservation disk_res; struct quota_res quota_res = { 0 }; - int ret = 0; + int ret; - /* - * XXX: this could likely be quite a bit simpler, page reservations - * _should_ only be manipulated with page locked: - */ + EBUG_ON(!PageLocked(page)); - old = page_state_cmpxchg(s, new, { - if (new.reserved - ? (new.reservation_replicas < disk_res.nr_replicas) - : (new.sectors < PAGE_SECTORS || - new.nr_replicas < disk_res.nr_replicas || - new.compressed)) { - int sectors = (disk_res.nr_replicas * PAGE_SECTORS - - page_res_sectors(new) - - disk_res.sectors); - - if (sectors > 0) { - ret = bch2_disk_reservation_add(c, &disk_res, sectors, - !check_enospc - ? BCH_DISK_RESERVATION_NOFAIL : 0); - if (unlikely(ret)) - goto err; - } + if (s->replicas_reserved < nr_replicas) { + ret = bch2_disk_reservation_get(c, &disk_res, PAGE_SECTORS, + nr_replicas - s->replicas_reserved, + !check_enospc ? 
BCH_DISK_RESERVATION_NOFAIL : 0); + if (unlikely(ret)) + return ret; - new.reserved = 1; - new.reservation_replicas = disk_res.nr_replicas; - } + page_state_cmpxchg(s, new, ({ + BUG_ON(new.replicas_reserved + + disk_res.nr_replicas != nr_replicas); + new.replicas_reserved += disk_res.nr_replicas; + })); + } - if (!new.quota_reserved && - new.sectors + new.dirty_sectors < PAGE_SECTORS) { - ret = bch2_quota_reservation_add(c, inode, "a_res, - PAGE_SECTORS - quota_res.sectors, - check_enospc); - if (unlikely(ret)) - goto err; + if (!s->quota_reserved && + s->sectors + s->dirty_sectors < PAGE_SECTORS) { + ret = bch2_quota_reservation_add(c, inode, "a_res, + PAGE_SECTORS, + check_enospc); + if (unlikely(ret)) + return ret; + page_state_cmpxchg(s, new, ({ + BUG_ON(new.quota_reserved); new.quota_reserved = 1; - } - }); + })); + } - quota_res.sectors -= (new.quota_reserved - old.quota_reserved) * PAGE_SECTORS; - disk_res.sectors -= page_res_sectors(new) - page_res_sectors(old); -err: - bch2_quota_reservation_put(c, inode, "a_res); - bch2_disk_reservation_put(c, &disk_res); return ret; } @@ -600,6 +586,8 @@ static void bch2_clear_page_bits(struct page *page) struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_page_state s; + EBUG_ON(!PageLocked(page)); + if (!PagePrivate(page)) return; @@ -710,6 +698,9 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage, { int ret; + EBUG_ON(!PageLocked(page)); + EBUG_ON(!PageLocked(newpage)); + ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; @@ -856,10 +847,13 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) { struct bvec_iter iter; struct bio_vec bv; - bool compressed = bch2_extent_is_compressed(k); - unsigned nr_ptrs = bch2_extent_nr_dirty_ptrs(k); + unsigned nr_ptrs = !bch2_extent_is_compressed(k) + ? bch2_extent_nr_dirty_ptrs(k) + : 0; bio_for_each_segment(bv, bio, iter) { + /* brand new pages, don't need to be locked: */ + struct bch_page_state *s = page_state(bv.bv_page); /* sectors in @k from the start of this page: */ @@ -867,14 +861,11 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) unsigned page_sectors = min(bv.bv_len >> 9, k_sectors); - s->nr_replicas = !s->sectors - ? nr_ptrs - : min_t(unsigned, s->nr_replicas, nr_ptrs); + s->nr_replicas = page_sectors == PAGE_SECTORS + ? nr_ptrs : 0; BUG_ON(s->sectors + page_sectors > PAGE_SECTORS); s->sectors += page_sectors; - - s->compressed |= compressed; } } @@ -1214,7 +1205,7 @@ static int __bch2_writepage(struct page *page, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_writepage_state *w = data; struct bch_page_state new, old; - unsigned offset; + unsigned offset, nr_replicas_this_write; loff_t i_size = i_size_read(&inode->v); pgoff_t end_index = i_size >> PAGE_SHIFT; @@ -1240,19 +1231,31 @@ static int __bch2_writepage(struct page *page, */ zero_user_segment(page, offset, PAGE_SIZE); do_io: + EBUG_ON(!PageLocked(page)); + /* Before unlocking the page, transfer reservation to w->io: */ old = page_state_cmpxchg(page_state(page), new, { - EBUG_ON(!new.reserved && - (new.sectors != PAGE_SECTORS || - new.compressed)); + /* + * If we didn't get a reservation, we can only write out the + * number of (fully allocated) replicas that currently exist, + * and only if the entire page has been written: + */ + nr_replicas_this_write = + max_t(unsigned, + new.replicas_reserved, + (new.sectors == PAGE_SECTORS + ? 
new.nr_replicas : 0)); + + BUG_ON(!nr_replicas_this_write); - if (new.reserved) - new.nr_replicas = new.reservation_replicas; - new.reserved = 0; + new.nr_replicas = w->opts.compression + ? 0 + : nr_replicas_this_write; - new.compressed |= w->opts.compression != 0; + new.replicas_reserved = 0; new.sectors += new.dirty_sectors; + BUG_ON(new.sectors != PAGE_SECTORS); new.dirty_sectors = 0; }); @@ -1261,21 +1264,20 @@ do_io: unlock_page(page); if (w->io && - (w->io->op.op.res.nr_replicas != new.nr_replicas || + (w->io->op.op.res.nr_replicas != nr_replicas_this_write || !bio_can_add_page_contig(&w->io->op.op.wbio.bio, page))) bch2_writepage_do_io(w); if (!w->io) - bch2_writepage_io_alloc(c, w, inode, page, new.nr_replicas); + bch2_writepage_io_alloc(c, w, inode, page, + nr_replicas_this_write); w->io->new_sectors += new.sectors - old.sectors; BUG_ON(inode != w->io->op.inode); BUG_ON(bio_add_page_contig(&w->io->op.op.wbio.bio, page)); - if (old.reserved) - w->io->op.op.res.sectors += old.reservation_replicas * PAGE_SECTORS; - + w->io->op.op.res.sectors += old.replicas_reserved * PAGE_SECTORS; w->io->op.new_i_size = i_size; if (wbc->sync_mode == WB_SYNC_ALL) @@ -2547,10 +2549,9 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, &disk_res, "a_res, iter, &reservation.k_i, 0, true, true, NULL); - +btree_iter_err: bch2_quota_reservation_put(c, inode, "a_res); bch2_disk_reservation_put(c, &disk_res); -btree_iter_err: if (ret == -EINTR) ret = 0; if (ret) @@ -2612,6 +2613,8 @@ long bch2_fallocate_dispatch(struct file *file, int mode, static bool page_is_data(struct page *page) { + EBUG_ON(!PageLocked(page)); + /* XXX: should only have to check PageDirty */ return PagePrivate(page) && (page_state(page)->sectors || diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 34cab25..12d77ec 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -15,6 +15,7 @@ #include "clock.h" #include "debug.h" #include "disk_groups.h" +#include "ec.h" #include "error.h" #include "extents.h" #include "io.h" @@ -302,6 +303,7 @@ static void __bch2_write_index(struct bch_write_op *op) struct bkey_s_extent e; struct bch_extent_ptr *ptr; struct bkey_i *src, *dst = keys->keys, *n, *k; + unsigned dev; int ret; for (src = keys->keys; src != keys->top; src = n) { @@ -345,6 +347,10 @@ static void __bch2_write_index(struct bch_write_op *op) } } out: + /* If some a bucket wasn't written, we can't erasure code it: */ + for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) + bch2_open_bucket_write_error(c, &op->open_buckets, dev); + bch2_open_buckets_put(c, &op->open_buckets); return; err: @@ -421,7 +427,8 @@ static void init_append_extent(struct bch_write_op *op, static struct bio *bch2_write_bio_alloc(struct bch_fs *c, struct write_point *wp, struct bio *src, - bool *page_alloc_failed) + bool *page_alloc_failed, + void *buf) { struct bch_write_bio *wbio; struct bio *bio; @@ -431,11 +438,18 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write); wbio = wbio_init(bio); - wbio->bounce = true; wbio->put_bio = true; /* copy WRITE_SYNC flag */ wbio->bio.bi_opf = src->bi_opf; + if (buf) { + bio->bi_iter.bi_size = output_available; + bch2_bio_map(bio, buf); + return bio; + } + + wbio->bounce = true; + /* * We can't use mempool for more than c->sb.encoded_extent_max * worth of pages, but we'd like to allocate more if we can: @@ -600,14 +614,18 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) struct bio *src = &op->wbio.bio, *dst = 
src; struct bvec_iter saved_iter; struct bkey_i *key_to_write; + void *ec_buf; unsigned key_to_write_offset = op->insert_keys.top_p - op->insert_keys.keys_p; - unsigned total_output = 0; - bool bounce = false, page_alloc_failed = false; + unsigned total_output = 0, total_input = 0; + bool bounce = false; + bool page_alloc_failed = false; int ret, more = 0; BUG_ON(!bio_sectors(src)); + ec_buf = bch2_writepoint_ec_buf(c, wp); + switch (bch2_write_prep_encoded_data(op, wp)) { case PREP_ENCODED_OK: break; @@ -617,16 +635,26 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) case PREP_ENCODED_CHECKSUM_ERR: goto csum_err; case PREP_ENCODED_DO_WRITE: + if (ec_buf) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bio_copy_data(dst, src); + bounce = true; + } init_append_extent(op, wp, op->version, op->crc); goto do_write; } - if (op->compression_type || + if (ec_buf || + op->compression_type || (op->csum_type && !(op->flags & BCH_WRITE_PAGES_STABLE)) || (bch2_csum_type_is_encryption(op->csum_type) && !(op->flags & BCH_WRITE_PAGES_OWNED))) { - dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed); + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); bounce = true; } @@ -729,7 +757,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) if (dst != src) bio_advance(dst, dst_len); bio_advance(src, src_len); - total_output += dst_len; + total_output += dst_len; + total_input += src_len; } while (dst->bi_iter.bi_size && src->bi_iter.bi_size && wp->sectors_free && @@ -742,16 +771,20 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) dst->bi_iter = saved_iter; - if (!bounce && more) { - dst = bio_split(src, total_output >> 9, + if (dst == src && more) { + BUG_ON(total_output != total_input); + + dst = bio_split(src, total_input >> 9, GFP_NOIO, &c->bio_write); - wbio_init(dst)->put_bio = true; + wbio_init(dst)->put_bio = true; + /* copy WRITE_SYNC flag */ + dst->bi_opf = src->bi_opf; } dst->bi_iter.bi_size = total_output; /* Free unneeded pages after compressing: */ - if (bounce) + if (to_wbio(dst)->bounce) while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE)) mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page, &c->bio_bounce_pages); @@ -760,6 +793,10 @@ do_write: key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); + bch2_ec_add_backpointer(c, wp, + bkey_start_pos(&key_to_write->k), + total_input >> 9); + dst->bi_end_io = bch2_write_endio; dst->bi_private = &op->cl; bio_set_op_attrs(dst, REQ_OP_WRITE, 0); @@ -774,10 +811,10 @@ csum_err: "rewriting existing data (memory corruption?)"); ret = -EIO; err: - if (bounce) { + if (to_wbio(dst)->bounce) bch2_bio_free_pages_pool(c, dst); + if (to_wbio(dst)->put_bio) bio_put(dst); - } return ret; } @@ -789,6 +826,8 @@ static void __bch2_write(struct closure *cl) struct write_point *wp; int ret; again: + memset(&op->failed, 0, sizeof(op->failed)); + do { /* +1 for possible cache device: */ if (op->open_buckets.nr + op->nr_replicas + 1 > @@ -803,6 +842,7 @@ again: wp = bch2_alloc_sectors_start(c, op->target, + op->opts.erasure_code, op->write_point, &op->devs_have, op->nr_replicas, @@ -882,8 +922,6 @@ void bch2_write(struct closure *cl) op->start_time = local_clock(); - memset(&op->failed, 0, sizeof(op->failed)); - bch2_keylist_init(&op->insert_keys, op->inline_keys); wbio_init(&op->wbio.bio)->put_bio = false; @@ -1557,8 +1595,10 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio 
*orig, if (!pick_ret) goto hole; - if (pick_ret < 0) - goto no_device; + if (pick_ret < 0) { + __bcache_io_error(c, "no device to read from"); + goto err; + } if (pick_ret > 0) ca = bch_dev_bkey_exists(c, pick.ptr.dev); @@ -1683,31 +1723,46 @@ noclone: bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - if (!rbio->have_ioref) - goto no_device_postclone; - percpu_down_read_preempt_disable(&c->usage_lock); bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); percpu_up_read_preempt_enable(&c->usage_lock); - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER], - bio_sectors(&rbio->bio)); + if (likely(!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT)))) { + bio_inc_remaining(&orig->bio); + trace_read_split(&orig->bio); + } + + if (!rbio->pick.idx) { + if (!rbio->have_ioref) { + __bcache_io_error(c, "no device to read from"); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; + } - bio_set_dev(&rbio->bio, ca->disk_sb.bdev); + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER], + bio_sectors(&rbio->bio)); + bio_set_dev(&rbio->bio, ca->disk_sb.bdev); - if (likely(!(flags & BCH_READ_IN_RETRY))) { - if (!(flags & BCH_READ_LAST_FRAGMENT)) { - bio_inc_remaining(&orig->bio); - trace_read_split(&orig->bio); + if (likely(!(flags & BCH_READ_IN_RETRY))) + submit_bio(&rbio->bio); + else + submit_bio_wait(&rbio->bio); + } else { + /* Attempting reconstruct read: */ + if (bch2_ec_read_extent(c, rbio)) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; } - submit_bio(&rbio->bio); + if (likely(!(flags & BCH_READ_IN_RETRY))) + bio_endio(&rbio->bio); + } +out: + if (likely(!(flags & BCH_READ_IN_RETRY))) { return 0; } else { int ret; - submit_bio_wait(&rbio->bio); - rbio->context = RBIO_CONTEXT_UNBOUND; bch2_read_endio(&rbio->bio); @@ -1722,22 +1777,12 @@ noclone: return ret; } -no_device_postclone: - if (!rbio->split) - rbio->bio.bi_end_io = rbio->end_io; - bch2_rbio_free(rbio); -no_device: - __bcache_io_error(c, "no device to read from"); - - if (likely(!(flags & BCH_READ_IN_RETRY))) { - orig->bio.bi_status = BLK_STS_IOERR; - - if (flags & BCH_READ_LAST_FRAGMENT) - bch2_rbio_done(orig); - return 0; - } else { +err: + if (flags & BCH_READ_IN_RETRY) return READ_ERR; - } + + orig->bio.bi_status = BLK_STS_IOERR; + goto out_read_done; hole: /* @@ -1749,7 +1794,7 @@ hole: orig->hole = true; zero_fill_bio_iter(&orig->bio, iter); - +out_read_done: if (flags & BCH_READ_LAST_FRAGMENT) bch2_rbio_done(orig); return 0; diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 26c7ae7..ac1219f 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -134,6 +134,8 @@ static enum { c->opts.block_size; BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors); + bkey_extent_init(&buf->key); + /* * We have to set last_seq here, _before_ opening a new journal entry: * @@ -334,15 +336,14 @@ u64 bch2_inode_journal_seq(struct journal *j, u64 inode) } static int __journal_res_get(struct journal *j, struct journal_res *res, - unsigned u64s_min, unsigned u64s_max) + unsigned flags) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf; int ret; retry: - ret = journal_res_get_fast(j, res, u64s_min, u64s_max); - if (ret) - return ret; + if (journal_res_get_fast(j, res)) + return 0; spin_lock(&j->lock); /* @@ -350,10 +351,9 @@ retry: * that just did journal_entry_open() and call journal_entry_close() * unnecessarily */ - ret = journal_res_get_fast(j, res, u64s_min, u64s_max); - if (ret) { + if (journal_res_get_fast(j, res)) { 
spin_unlock(&j->lock); - return 1; + return 0; } /* @@ -376,7 +376,12 @@ retry: spin_unlock(&j->lock); return -EROFS; case JOURNAL_ENTRY_INUSE: - /* haven't finished writing out the previous one: */ + /* + * haven't finished writing out the previous entry, can't start + * another yet: + * signal to caller which sequence number we're trying to open: + */ + res->seq = journal_cur_seq(j) + 1; spin_unlock(&j->lock); trace_journal_entry_full(c); goto blocked; @@ -388,6 +393,8 @@ retry: /* We now have a new, closed journal buf - see if we can open it: */ ret = journal_entry_open(j); + if (!ret) + res->seq = journal_cur_seq(j); spin_unlock(&j->lock); if (ret < 0) @@ -407,7 +414,7 @@ retry: blocked: if (!j->res_get_blocked_start) j->res_get_blocked_start = local_clock() ?: 1; - return 0; + return -EAGAIN; } /* @@ -421,14 +428,14 @@ blocked: * btree node write locks. */ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, - unsigned u64s_min, unsigned u64s_max) + unsigned flags) { int ret; wait_event(j->wait, - (ret = __journal_res_get(j, res, u64s_min, - u64s_max))); - return ret < 0 ? ret : 0; + (ret = __journal_res_get(j, res, flags)) != -EAGAIN || + (flags & JOURNAL_RES_GET_NONBLOCK)); + return ret; } u64 bch2_journal_last_unwritten_seq(struct journal *j) @@ -452,28 +459,55 @@ u64 bch2_journal_last_unwritten_seq(struct journal *j) * btree root - every journal entry contains the roots of all the btrees, so it * doesn't need to bother with getting a journal reservation */ -int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *parent) +int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl) { - int ret; - + struct bch_fs *c = container_of(j, struct bch_fs, journal); + bool need_reclaim = false; +retry: spin_lock(&j->lock); - BUG_ON(seq > journal_cur_seq(j)); if (seq < journal_cur_seq(j) || journal_entry_is_open(j)) { spin_unlock(&j->lock); - return 1; + return 0; + } + + if (journal_cur_seq(j) < seq) { + switch (journal_buf_switch(j, false)) { + case JOURNAL_ENTRY_ERROR: + spin_unlock(&j->lock); + return -EROFS; + case JOURNAL_ENTRY_INUSE: + /* haven't finished writing out the previous one: */ + trace_journal_entry_full(c); + goto blocked; + case JOURNAL_ENTRY_CLOSED: + break; + case JOURNAL_UNLOCKED: + goto retry; + } + } + + BUG_ON(journal_cur_seq(j) < seq); + + if (!journal_entry_open(j)) { + need_reclaim = true; + goto blocked; } - ret = journal_entry_open(j); - if (!ret) - closure_wait(&j->async_wait, parent); spin_unlock(&j->lock); - if (!ret) - bch2_journal_reclaim_work(&j->reclaim_work.work); + return 0; +blocked: + if (!j->res_get_blocked_start) + j->res_get_blocked_start = local_clock() ?: 1; - return ret; + closure_wait(&j->async_wait, cl); + spin_unlock(&j->lock); + + if (need_reclaim) + bch2_journal_reclaim_work(&j->reclaim_work.work); + return -EAGAIN; } static int journal_seq_error(struct journal *j, u64 seq) @@ -593,11 +627,10 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq) void bch2_journal_meta_async(struct journal *j, struct closure *parent) { struct journal_res res; - unsigned u64s = jset_u64s(0); memset(&res, 0, sizeof(res)); - bch2_journal_res_get(j, &res, u64s, u64s); + bch2_journal_res_get(j, &res, jset_u64s(0), 0); bch2_journal_res_put(j, &res); bch2_journal_flush_seq_async(j, res.seq, parent); @@ -606,12 +639,11 @@ void bch2_journal_meta_async(struct journal *j, struct closure *parent) int bch2_journal_meta(struct journal *j) { struct journal_res res; - unsigned u64s = jset_u64s(0); int ret; 
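A hedged sketch, not from the patch, of the new reservation calling convention visible in this hunk and in the journal.h changes below: callers now pass the exact number of u64s they need plus a flags word, and JOURNAL_RES_GET_NONBLOCK turns the "current entry full" case into an immediate -EAGAIN instead of a wait. The helper name and the retry-with-blocking fallback are illustrative only.

static int journal_res_get_example(struct journal *j, unsigned key_u64s)
{
	struct journal_res res;
	int ret;

	memset(&res, 0, sizeof(res));

	/* Try to reserve space in the open entry without sleeping: */
	ret = bch2_journal_res_get(j, &res, jset_u64s(key_u64s),
				   JOURNAL_RES_GET_NONBLOCK);
	if (ret == -EAGAIN)
		/* No room; this time wait for a new entry to be opened: */
		ret = bch2_journal_res_get(j, &res, jset_u64s(key_u64s), 0);
	if (ret)
		return ret;

	/* ... emit the key into the reservation ... */

	bch2_journal_res_put(j, &res);
	return 0;
}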
memset(&res, 0, sizeof(res)); - ret = bch2_journal_res_get(j, &res, u64s, u64s); + ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); if (ret) return ret; @@ -751,9 +783,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL, ca->mi.bucket_size, gc_phase(GC_PHASE_SB), - new_fs - ? BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE - : 0); + 0); if (c) { spin_unlock(&c->journal.lock); @@ -861,10 +891,6 @@ static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) { - spin_lock(&j->lock); - bch2_extent_drop_device(bkey_i_to_s_extent(&j->key), ca->dev_idx); - spin_unlock(&j->lock); - wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx)); } @@ -1000,8 +1026,6 @@ int bch2_fs_journal_init(struct journal *j) j->write_delay_ms = 1000; j->reclaim_delay_ms = 100; - bkey_extent_init(&j->key); - atomic64_set(&j->reservations.counter, ((union journal_res_state) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 5870392..0595597 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -269,12 +269,10 @@ static inline void bch2_journal_res_put(struct journal *j, } int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, - unsigned, unsigned); + unsigned); static inline int journal_res_get_fast(struct journal *j, - struct journal_res *res, - unsigned u64s_min, - unsigned u64s_max) + struct journal_res *res) { union journal_res_state old, new; u64 v = atomic64_read(&j->reservations.counter); @@ -286,37 +284,37 @@ static inline int journal_res_get_fast(struct journal *j, * Check if there is still room in the current journal * entry: */ - if (old.cur_entry_offset + u64s_min > j->cur_entry_u64s) + if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) return 0; - res->offset = old.cur_entry_offset; - res->u64s = min(u64s_max, j->cur_entry_u64s - - old.cur_entry_offset); - - journal_state_inc(&new); new.cur_entry_offset += res->u64s; + journal_state_inc(&new); } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - res->ref = true; - res->idx = new.idx; - res->seq = le64_to_cpu(j->buf[res->idx].data->seq); + res->ref = true; + res->idx = old.idx; + res->offset = old.cur_entry_offset; + res->seq = le64_to_cpu(j->buf[old.idx].data->seq); return 1; } +#define JOURNAL_RES_GET_NONBLOCK (1 << 0) + static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, - unsigned u64s_min, unsigned u64s_max) + unsigned u64s, unsigned flags) { int ret; EBUG_ON(res->ref); - EBUG_ON(u64s_max < u64s_min); EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); - if (journal_res_get_fast(j, res, u64s_min, u64s_max)) + res->u64s = u64s; + + if (journal_res_get_fast(j, res)) goto out; - ret = bch2_journal_res_get_slowpath(j, res, u64s_min, u64s_max); + ret = bch2_journal_res_get_slowpath(j, res, flags); if (ret) return ret; out: diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index c83e8eb..3840764 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -426,7 +426,7 @@ static int journal_read_buf_realloc(struct journal_read_buf *b, static int journal_read_bucket(struct bch_dev *ca, struct journal_read_buf *buf, struct journal_list *jlist, - unsigned bucket, u64 *seq, bool *entries_found) + unsigned bucket) { struct bch_fs *c = ca->fs; struct journal_device *ja = &ca->journal; @@ -511,7 +511,6 
@@ reread: switch (ret) { case JOURNAL_ENTRY_ADD_OK: - *entries_found = true; break; case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: break; @@ -519,9 +518,6 @@ reread: return ret; } - if (le64_to_cpu(j->seq) > *seq) - *seq = le64_to_cpu(j->seq); - sectors = vstruct_sectors(j, c->block_bits); next_block: pr_debug("next"); @@ -535,120 +531,51 @@ next_block: static void bch2_journal_read_device(struct closure *cl) { -#define read_bucket(b) \ - ({ \ - bool entries_found = false; \ - ret = journal_read_bucket(ca, &buf, jlist, b, &seq, \ - &entries_found); \ - if (ret) \ - goto err; \ - __set_bit(b, bitmap); \ - entries_found; \ - }) - struct journal_device *ja = container_of(cl, struct journal_device, read); struct bch_dev *ca = container_of(ja, struct bch_dev, journal); struct journal_list *jlist = container_of(cl->parent, struct journal_list, cl); - struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev); struct journal_read_buf buf = { NULL, 0 }; - - DECLARE_BITMAP(bitmap, ja->nr); - unsigned i, l, r; - u64 seq = 0; + u64 min_seq = U64_MAX; + unsigned i; int ret; if (!ja->nr) goto out; - bitmap_zero(bitmap, ja->nr); ret = journal_read_buf_realloc(&buf, PAGE_SIZE); if (ret) goto err; pr_debug("%u journal buckets", ja->nr); - /* - * If the device supports discard but not secure discard, we can't do - * the fancy fibonacci hash/binary search because the live journal - * entries might not form a contiguous range: - */ - for (i = 0; i < ja->nr; i++) - read_bucket(i); - goto search_done; - - if (!blk_queue_nonrot(q)) - goto linear_scan; - - /* - * Read journal buckets ordered by golden ratio hash to quickly - * find a sequence of buckets with valid journal entries - */ for (i = 0; i < ja->nr; i++) { - l = (i * 2654435769U) % ja->nr; - - if (test_bit(l, bitmap)) - break; - - if (read_bucket(l)) - goto bsearch; + ret = journal_read_bucket(ca, &buf, jlist, i); + if (ret) + goto err; } - /* - * If that fails, check all the buckets we haven't checked - * already - */ - pr_debug("falling back to linear search"); -linear_scan: - for (l = find_first_zero_bit(bitmap, ja->nr); - l < ja->nr; - l = find_next_zero_bit(bitmap, ja->nr, l + 1)) - if (read_bucket(l)) - goto bsearch; - - /* no journal entries on this device? */ - if (l == ja->nr) - goto out; -bsearch: - /* Binary search */ - r = find_next_bit(bitmap, ja->nr, l + 1); - pr_debug("starting binary search, l %u r %u", l, r); - - while (l + 1 < r) { - unsigned m = (l + r) >> 1; - u64 cur_seq = seq; - - read_bucket(m); + /* Find the journal bucket with the highest sequence number: */ + for (i = 0; i < ja->nr; i++) { + if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx]) + ja->cur_idx = i; - if (cur_seq != seq) - l = m; - else - r = m; + min_seq = min(ja->bucket_seq[i], min_seq); } -search_done: /* - * Find the journal bucket with the highest sequence number: - * * If there's duplicate journal entries in multiple buckets (which * definitely isn't supposed to happen, but...) 
- make sure to start * cur_idx at the last of those buckets, so we don't deadlock trying to * allocate */ - seq = 0; - - for (i = 0; i < ja->nr; i++) - if (ja->bucket_seq[i] >= seq && - ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) { - /* - * When journal_next_bucket() goes to allocate for - * the first time, it'll use the bucket after - * ja->cur_idx - */ - ja->cur_idx = i; - seq = ja->bucket_seq[i]; - } + while (ja->bucket_seq[ja->cur_idx] > min_seq && + ja->bucket_seq[ja->cur_idx] > + ja->bucket_seq[(ja->cur_idx + 1) % ja->nr]) + ja->cur_idx++; + + ja->sectors_free = 0; /* * Set last_idx to indicate the entire journal is full and needs to be @@ -656,17 +583,6 @@ search_done: * pinned when it first runs: */ ja->last_idx = (ja->cur_idx + 1) % ja->nr; - - /* - * Read buckets in reverse order until we stop finding more journal - * entries: - */ - for (i = (ja->cur_idx + ja->nr - 1) % ja->nr; - i != ja->cur_idx; - i = (i + ja->nr - 1) % ja->nr) - if (!test_bit(i, bitmap) && - !read_bucket(i)) - break; out: kvpfree(buf.data, buf.size); percpu_ref_put(&ca->io_ref); @@ -677,7 +593,6 @@ err: jlist->ret = ret; mutex_unlock(&jlist->lock); goto out; -#undef read_bucket } void bch2_journal_entries_free(struct list_head *list) @@ -865,7 +780,6 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) int ret = 0; list_for_each_entry_safe(i, n, list, list) { - j->replay_journal_seq = le64_to_cpu(i->j.seq); for_each_jset_key(k, _n, entry, &i->j) { @@ -875,7 +789,7 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) * allocation code handles replay for * BTREE_ID_ALLOC keys: */ - ret = bch2_alloc_replay_key(c, k->k.p); + ret = bch2_alloc_replay_key(c, k); } else { /* * We might cause compressed extents to be @@ -886,9 +800,9 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list) bch2_disk_reservation_init(c, 0); ret = bch2_btree_insert(c, entry->btree_id, k, - &disk_res, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_REPLAY); + &disk_res, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_REPLAY); } if (ret) { @@ -932,32 +846,18 @@ static void bch2_journal_add_btree_root(struct journal_buf *buf, } static unsigned journal_dev_buckets_available(struct journal *j, - struct bch_dev *ca) + struct journal_device *ja) { - struct journal_device *ja = &ca->journal; unsigned next = (ja->cur_idx + 1) % ja->nr; unsigned available = (ja->last_idx + ja->nr - next) % ja->nr; - /* - * Hack to avoid a deadlock during journal replay: - * journal replay might require setting a new btree - * root, which requires writing another journal entry - - * thus, if the journal is full (and this happens when - * replaying the first journal bucket's entries) we're - * screwed. 
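A worked example, not part of the patch, for the ring arithmetic in journal_dev_buckets_available() above; the concrete values are made up purely for illustration.

/*
 * With nr = 8 journal buckets, cur_idx = 2 (the bucket currently being
 * written) and last_idx = 6 (roughly, the oldest bucket the journal
 * still needs to keep):
 *
 *	next      = (2 + 1) % 8     = 3
 *	available = (6 + 8 - 3) % 8 = 3
 *
 * i.e. buckets 3, 4 and 5 may still be allocated before the write
 * pointer would catch up to last_idx.
 */
static unsigned journal_buckets_available_example(void)
{
	unsigned nr = 8, cur_idx = 2, last_idx = 6;
	unsigned next = (cur_idx + 1) % nr;

	return (last_idx + nr - next) % nr;	/* == 3 */
}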
- * - * So don't let the journal fill up unless we're in - * replay: - */ - if (test_bit(JOURNAL_REPLAY_DONE, &j->flags)) - available = max((int) available - 2, 0); - /* * Don't use the last bucket unless writing the new last_seq * will make another bucket available: */ - if (ja->bucket_seq[ja->last_idx] >= journal_last_seq(j)) - available = max((int) available - 1, 0); + if (available && + journal_last_seq(j) <= ja->bucket_seq[ja->last_idx]) + --available; return available; } @@ -967,7 +867,6 @@ int bch2_journal_entry_sectors(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_dev *ca; - struct bkey_s_extent e = bkey_i_to_s_extent(&j->key); unsigned sectors_available = UINT_MAX; unsigned i, nr_online = 0, nr_devs = 0; @@ -977,38 +876,39 @@ int bch2_journal_entry_sectors(struct journal *j) for_each_member_device_rcu(ca, c, i, &c->rw_devs[BCH_DATA_JOURNAL]) { struct journal_device *ja = &ca->journal; - unsigned buckets_required = 0; + unsigned buckets_this_device, sectors_this_device; if (!ja->nr) continue; - sectors_available = min_t(unsigned, sectors_available, - ca->mi.bucket_size); + buckets_this_device = journal_dev_buckets_available(j, ja); + sectors_this_device = ja->sectors_free; + + nr_online++; /* - * Note that we don't allocate the space for a journal entry - * until we write it out - thus, if we haven't started the write - * for the previous entry we have to make sure we have space for - * it too: + * We that we don't allocate the space for a journal entry + * until we write it out - thus, account for it here: */ - if (bch2_extent_has_device(e.c, ca->dev_idx)) { - if (j->prev_buf_sectors > ja->sectors_free) - buckets_required++; - - if (j->prev_buf_sectors + sectors_available > - ja->sectors_free) - buckets_required++; - } else { - if (j->prev_buf_sectors + sectors_available > - ca->mi.bucket_size) - buckets_required++; - - buckets_required++; + if (j->prev_buf_sectors >= sectors_this_device) { + if (!buckets_this_device) + continue; + + buckets_this_device--; + sectors_this_device = ca->mi.bucket_size; } - if (journal_dev_buckets_available(j, ca) >= buckets_required) - nr_devs++; - nr_online++; + sectors_this_device -= j->prev_buf_sectors; + + if (buckets_this_device) + sectors_this_device = ca->mi.bucket_size; + + if (!sectors_this_device) + continue; + + sectors_available = min(sectors_available, + sectors_this_device); + nr_devs++; } rcu_read_unlock(); @@ -1021,107 +921,111 @@ int bch2_journal_entry_sectors(struct journal *j) return sectors_available; } -/** - * journal_next_bucket - move on to the next journal bucket if possible - */ -static int journal_write_alloc(struct journal *j, struct journal_buf *w, - unsigned sectors) +static void __journal_write_alloc(struct journal *j, + struct journal_buf *w, + struct dev_alloc_list *devs_sorted, + unsigned sectors, + unsigned *replicas, + unsigned replicas_want) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bkey_s_extent e; - struct bch_extent_ptr *ptr; + struct bkey_i_extent *e = bkey_i_to_extent(&w->key); struct journal_device *ja; struct bch_dev *ca; - struct dev_alloc_list devs_sorted; - unsigned i, replicas, replicas_want = - READ_ONCE(c->opts.metadata_replicas); - - spin_lock(&j->lock); - e = bkey_i_to_s_extent(&j->key); - - /* - * Drop any pointers to devices that have been removed, are no longer - * empty, or filled up their current journal bucket: - * - * Note that a device may have had a small amount of free space (perhaps - * one sector) that wasn't 
enough for the smallest possible journal - * entry - that's why we drop pointers to devices <= current free space, - * i.e. whichever device was limiting the current journal entry size. - */ - bch2_extent_drop_ptrs(e, ptr, ({ - ca = bch_dev_bkey_exists(c, ptr->dev); - - ca->mi.state != BCH_MEMBER_STATE_RW || - ca->journal.sectors_free <= sectors; - })); - - extent_for_each_ptr(e, ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); + unsigned i; - BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW || - ca->journal.sectors_free <= sectors); - ca->journal.sectors_free -= sectors; - } - - replicas = bch2_extent_nr_ptrs(e.c); - - rcu_read_lock(); - devs_sorted = bch2_wp_alloc_list(c, &j->wp, - &c->rw_devs[BCH_DATA_JOURNAL]); + if (*replicas >= replicas_want) + return; - for (i = 0; i < devs_sorted.nr; i++) { - ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); + for (i = 0; i < devs_sorted->nr; i++) { + ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); if (!ca) continue; - if (!ca->mi.durability) - continue; - ja = &ca->journal; - if (!ja->nr) - continue; - - if (replicas >= replicas_want) - break; /* * Check that we can use this device, and aren't already using * it: */ - if (bch2_extent_has_device(e.c, ca->dev_idx) || - !journal_dev_buckets_available(j, ca) || - sectors > ca->mi.bucket_size) + if (!ca->mi.durability || + ca->mi.state != BCH_MEMBER_STATE_RW || + !ja->nr || + bch2_extent_has_device(extent_i_to_s_c(e), ca->dev_idx) || + sectors > ja->sectors_free) continue; - j->wp.next_alloc[ca->dev_idx] += U32_MAX; - bch2_wp_rescale(c, ca, &j->wp); - - ja->sectors_free = ca->mi.bucket_size - sectors; - ja->cur_idx = (ja->cur_idx + 1) % ja->nr; - ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); + bch2_dev_stripe_increment(c, ca, &j->wp.stripe); - extent_ptr_append(bkey_i_to_extent(&j->key), + extent_ptr_append(e, (struct bch_extent_ptr) { .offset = bucket_to_sector(ca, - ja->buckets[ja->cur_idx]), + ja->buckets[ja->cur_idx]) + + ca->mi.bucket_size - + ja->sectors_free, .dev = ca->dev_idx, }); - replicas += ca->mi.durability; + ja->sectors_free -= sectors; + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); + + *replicas += ca->mi.durability; + + if (*replicas >= replicas_want) + break; } - rcu_read_unlock(); +} - j->prev_buf_sectors = 0; +/** + * journal_next_bucket - move on to the next journal bucket if possible + */ +static int journal_write_alloc(struct journal *j, struct journal_buf *w, + unsigned sectors) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_device *ja; + struct bch_dev *ca; + struct dev_alloc_list devs_sorted; + unsigned i, replicas = 0, replicas_want = + READ_ONCE(c->opts.metadata_replicas); - bkey_copy(&w->key, &j->key); - spin_unlock(&j->lock); + rcu_read_lock(); - if (replicas < c->opts.metadata_replicas_required) - return -EROFS; + devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, + &c->rw_devs[BCH_DATA_JOURNAL]); - BUG_ON(!replicas); + spin_lock(&j->lock); + __journal_write_alloc(j, w, &devs_sorted, + sectors, &replicas, replicas_want); - return 0; + if (replicas >= replicas_want) + goto done; + + for (i = 0; i < devs_sorted.nr; i++) { + ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); + if (!ca) + continue; + + ja = &ca->journal; + + if (sectors > ja->sectors_free && + sectors <= ca->mi.bucket_size && + journal_dev_buckets_available(j, ja)) { + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + ja->sectors_free = ca->mi.bucket_size; + } + } + + __journal_write_alloc(j, w, &devs_sorted, + sectors, &replicas, replicas_want); +done: + 
if (replicas >= replicas_want) + j->prev_buf_sectors = 0; + + spin_unlock(&j->lock); + rcu_read_unlock(); + + return replicas >= replicas_want ? 0 : -EROFS; } static void journal_write_compact(struct jset *jset) @@ -1376,9 +1280,6 @@ void bch2_journal_write(struct closure *cl) } no_io: - extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) - ptr->offset += sectors; - bch2_bucket_seq_cleanup(c); continue_at(cl, journal_write_done, system_highpri_wq); diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 978aba7..9ac65d0 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -125,7 +125,8 @@ void bch2_journal_reclaim_fast(struct journal *j) * Unpin journal entries whose reference counts reached zero, meaning * all btree nodes got written out */ - while (!atomic_read(&fifo_peek_front(&j->pin).count)) { + while (!fifo_empty(&j->pin) && + !atomic_read(&fifo_peek_front(&j->pin).count)) { BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); BUG_ON(!fifo_pop(&j->pin, temp)); popped = true; diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index 2670248..a593368 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -184,7 +184,6 @@ struct journal { struct list_head seq_blacklist; struct journal_seq_blacklist *new_blacklist; - BKEY_PADDED(key); struct write_point wp; spinlock_t err_lock; diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index 775d6a6..449cd5b 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -278,11 +278,37 @@ int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) case Opt_background_compression: ret = bch2_check_set_has_compressed_data(c, v); break; + case Opt_erasure_code: + if (v && + !(c->sb.features & (1ULL << BCH_FEATURE_EC))) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->features[0] |= + cpu_to_le64(1ULL << BCH_FEATURE_EC); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + break; } return ret; } +int bch2_opts_check_may_set(struct bch_fs *c) +{ + unsigned i; + int ret; + + for (i = 0; i < bch2_opts_nr; i++) { + ret = bch2_opt_check_may_set(c, i, + bch2_opt_get_by_id(&c->opts, i)); + if (ret) + return ret; + } + + return 0; +} + int bch2_parse_mount_opts(struct bch_opts *opts, char *options) { char *opt, *name, *val; diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index bdf1e4f..8ffae3d 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -110,6 +110,9 @@ enum opt_type { BCH_OPT(promote_target, u16, OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_PROMOTE_TARGET, 0) \ + BCH_OPT(erasure_code, u16, OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_ERASURE_CODE, false) \ BCH_OPT(inodes_32bit, u8, OPT_RUNTIME, \ OPT_BOOL(), \ BCH_SB_INODE_32BIT, false) \ @@ -266,6 +269,7 @@ void bch2_opt_to_text(struct printbuf *, struct bch_fs *, const struct bch_option *, u64, unsigned); int bch2_opt_check_may_set(struct bch_fs *, int, u64); +int bch2_opts_check_may_set(struct bch_fs *); int bch2_parse_mount_opts(struct bch_opts *, char *); /* inode opts: */ @@ -277,7 +281,8 @@ int bch2_parse_mount_opts(struct bch_opts *, char *); BCH_INODE_OPT(data_replicas, 8) \ BCH_INODE_OPT(promote_target, 16) \ BCH_INODE_OPT(foreground_target, 16) \ - BCH_INODE_OPT(background_target, 16) + BCH_INODE_OPT(background_target, 16) \ + BCH_INODE_OPT(erasure_code, 16) struct bch_io_opts { #define BCH_INODE_OPT(_name, _bits) unsigned _name##_defined:1; diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index c5d9dc4..0e3c321 100644 --- a/libbcachefs/recovery.c +++ 
b/libbcachefs/recovery.c @@ -6,6 +6,7 @@ #include "btree_update_interior.h" #include "btree_io.h" #include "dirent.h" +#include "ec.h" #include "error.h" #include "fsck.h" #include "journal_io.h" @@ -212,6 +213,11 @@ int bch2_fs_recovery(struct bch_fs *c) set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); + err = "cannot allocate memory"; + ret = bch2_fs_ec_start(c); + if (ret) + goto err; + bch_verbose(c, "starting mark and sweep:"); err = "error in recovery"; ret = bch2_initial_gc(c, &journal); diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index a7a4e28..0ba5ce5 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -79,9 +79,33 @@ static void extent_to_replicas(struct bkey_s_c k, r->nr_required = 1; - extent_for_each_ptr_decode(e, p, entry) - if (!p.ptr.cached) - r->devs[r->nr_devs++] = p.ptr.dev; + extent_for_each_ptr_decode(e, p, entry) { + if (p.ptr.cached) + continue; + + if (p.ec_nr) { + r->nr_devs = 0; + break; + } + + r->devs[r->nr_devs++] = p.ptr.dev; + } + } +} + +static void stripe_to_replicas(struct bkey_s_c k, + struct bch_replicas_entry *r) +{ + if (k.k->type == BCH_STRIPE) { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + const struct bch_extent_ptr *ptr; + + r->nr_required = s.v->nr_blocks - s.v->nr_redundant; + + for (ptr = s.v->ptrs; + ptr < s.v->ptrs + s.v->nr_blocks; + ptr++) + r->devs[r->nr_devs++] = ptr->dev; } } @@ -100,6 +124,10 @@ static void bkey_to_replicas(enum bkey_type type, e->data_type = BCH_DATA_USER; extent_to_replicas(k, e); break; + case BKEY_TYPE_EC: + e->data_type = BCH_DATA_USER; + stripe_to_replicas(k, e); + break; default: break; } diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 8352357..7192007 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "checksum.h" #include "disk_groups.h" +#include "ec.h" #include "error.h" #include "io.h" #include "journal.h" diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 5b27ead..0eb6b7e 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -19,6 +19,7 @@ #include "compress.h" #include "debug.h" #include "disk_groups.h" +#include "ec.h" #include "error.h" #include "fs.h" #include "fs-io.h" @@ -395,6 +396,7 @@ static void bch2_fs_free(struct bch_fs *c) bch2_fs_quota_exit(c); bch2_fs_fsio_exit(c); + bch2_fs_ec_exit(c); bch2_fs_encryption_exit(c); bch2_fs_io_exit(c); bch2_fs_btree_cache_exit(c); @@ -403,7 +405,7 @@ static void bch2_fs_free(struct bch_fs *c) bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); percpu_free_rwsem(&c->usage_lock); - free_percpu(c->usage_percpu); + free_percpu(c->usage[0]); mempool_exit(&c->btree_iters_pool); mempool_exit(&c->btree_bounce_pool); bioset_exit(&c->btree_bio); @@ -576,6 +578,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->fsck_errors); mutex_init(&c->fsck_error_lock); + INIT_LIST_HEAD(&c->ec_new_stripe_list); + mutex_init(&c->ec_new_stripe_lock); + mutex_init(&c->ec_stripes_lock); + spin_lock_init(&c->ec_stripes_heap_lock); + seqcount_init(&c->gc_pos_lock); c->copy_gc_enabled = 1; @@ -631,7 +638,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) max(offsetof(struct btree_read_bio, bio), offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || - !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) || + !(c->usage[0] = alloc_percpu(struct bch_fs_usage)) || percpu_init_rwsem(&c->usage_lock) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, 
 					btree_bytes(c)) ||
@@ -644,6 +651,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	    bch2_fs_io_init(c) ||
 	    bch2_fs_encryption_init(c) ||
 	    bch2_fs_compress_init(c) ||
+	    bch2_fs_ec_init(c) ||
 	    bch2_fs_fsio_init(c))
 		goto err;
 
@@ -715,6 +723,10 @@ const char *bch2_fs_start(struct bch_fs *c)
 	if (ret)
 		goto err;
 
+	ret = bch2_opts_check_may_set(c);
+	if (ret)
+		goto err;
+
 	err = "dynamic fault";
 	if (bch2_fs_init_fault("fs_start"))
 		goto err;
@@ -1054,8 +1066,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
 		return ret;
 
 	mutex_lock(&c->sb_lock);
-	bch2_mark_dev_superblock(ca->fs, ca,
-				 BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+	bch2_mark_dev_superblock(ca->fs, ca, 0);
 	mutex_unlock(&c->sb_lock);
 
 	bch2_dev_sysfs_online(c, ca);
@@ -1340,7 +1351,7 @@ static void dev_usage_clear(struct bch_dev *ca)
 
 	for_each_possible_cpu(cpu) {
 		struct bch_dev_usage *p =
-			per_cpu_ptr(ca->usage_percpu, cpu);
+			per_cpu_ptr(ca->usage[0], cpu);
 		memset(p, 0, sizeof(*p));
 	}
 
@@ -1401,8 +1412,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
 	 * allocate the journal, reset all the marks, then remark after we
 	 * attach...
 	 */
-	bch2_mark_dev_superblock(ca->fs, ca,
-				 BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+	bch2_mark_dev_superblock(ca->fs, ca, 0);
 
 	err = "journal alloc failed";
 	ret = bch2_dev_journal_alloc(ca);
@@ -1461,8 +1471,7 @@ have_slot:
 	ca->disk_sb.sb->dev_idx	= dev_idx;
 	bch2_dev_attach(c, ca, dev_idx);
 
-	bch2_mark_dev_superblock(c, ca,
-				 BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+	bch2_mark_dev_superblock(c, ca, 0);
 
 	bch2_write_super(c);
 	mutex_unlock(&c->sb_lock);
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index f793cfb..0c3bdcd 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -18,6 +18,7 @@
 #include "btree_gc.h"
 #include "buckets.h"
 #include "disk_groups.h"
+#include "ec.h"
 #include "inode.h"
 #include "journal.h"
 #include "keylist.h"
@@ -187,6 +188,8 @@ sysfs_pd_controller_attribute(rebalance);
 read_attribute(rebalance_work);
 rw_attribute(promote_whole_extents);
 
+read_attribute(new_stripes);
+
 rw_attribute(pd_controllers_update_seconds);
 
 read_attribute(meta_replicas_have);
@@ -241,6 +244,8 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
 			pr_buf(&out, "\t%s:\t\t%llu\n",
 			       bch2_data_types[type],
 			       stats.replicas[replicas].data[type]);
+		pr_buf(&out, "\terasure coded:\t%llu\n",
+		       stats.replicas[replicas].ec_data);
 		pr_buf(&out, "\treserved:\t%llu\n",
 		       stats.replicas[replicas].persistent_reserved);
 	}
@@ -309,6 +314,41 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
 			compressed_sectors_uncompressed << 9);
 }
 
+static ssize_t bch2_new_stripes(struct bch_fs *c, char *buf)
+{
+	char *out = buf, *end = buf + PAGE_SIZE;
+	struct ec_stripe_head *h;
+	struct ec_stripe_new *s;
+
+	mutex_lock(&c->ec_new_stripe_lock);
+	list_for_each_entry(h, &c->ec_new_stripe_list, list) {
+		out += scnprintf(out, end - out,
+				 "target %u algo %u redundancy %u:\n",
+				 h->target, h->algo, h->redundancy);
+
+		if (h->s)
+			out += scnprintf(out, end - out,
+					 "\tpending: blocks %u allocated %u\n",
+					 h->s->blocks.nr,
+					 bitmap_weight(h->s->blocks_allocated,
+						       h->s->blocks.nr));
+
+		mutex_lock(&h->lock);
+		list_for_each_entry(s, &h->stripes, list)
+			out += scnprintf(out, end - out,
+					 "\tin flight: blocks %u allocated %u pin %u\n",
+					 s->blocks.nr,
+					 bitmap_weight(s->blocks_allocated,
+						       s->blocks.nr),
+					 atomic_read(&s->pin));
+		mutex_unlock(&h->lock);
+
+	}
+	mutex_unlock(&c->ec_new_stripe_lock);
+
+	return out - buf;
+}
+
 SHOW(bch2_fs)
 {
 	struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
@@ -368,6 +408,9 @@ SHOW(bch2_fs)
 	if (attr == &sysfs_compression_stats)
 		return bch2_compression_stats(c, buf);
 
+	if (attr == &sysfs_new_stripes)
+		return bch2_new_stripes(c, buf);
+
 #define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name);
 	BCH_DEBUG_PARAMS()
 #undef BCH_DEBUG_PARAM
@@ -434,7 +477,7 @@ STORE(__bch2_fs)
 		bch2_coalesce(c);
 
 	if (attr == &sysfs_trigger_gc)
-		bch2_gc(c);
+		bch2_gc(c, NULL, false);
 
 	if (attr == &sysfs_prune_cache) {
 		struct shrink_control sc;
@@ -536,6 +579,8 @@ struct attribute *bch2_fs_internal_files[] = {
 	&sysfs_rebalance_work,
 	sysfs_pd_controller_files(rebalance),
 
+	&sysfs_new_stripes,
+
 	&sysfs_internal_uuid,
 
 #define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
@@ -764,6 +809,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
 		"    meta:           %llu\n"
 		"    user:           %llu\n"
 		"    cached:         %llu\n"
+		"    erasure coded:  %llu\n"
 		"    available:      %lli\n"
 		"sectors:\n"
 		"    sb:             %llu\n"
@@ -787,6 +833,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
 		stats.buckets[BCH_DATA_BTREE],
 		stats.buckets[BCH_DATA_USER],
 		stats.buckets[BCH_DATA_CACHED],
+		stats.buckets_ec,
 		ca->mi.nbuckets - ca->mi.first_bucket - stats.buckets_unavailable,
 		stats.sectors[BCH_DATA_SB],
 		stats.sectors[BCH_DATA_JOURNAL],
-- 
2.39.2