Update bcachefs sources to da7fefde29 bcachefs: shim for userspace raid library
author Kent Overstreet <kent.overstreet@gmail.com>
Fri, 23 Nov 2018 08:04:34 +0000 (03:04 -0500)
committer Kent Overstreet <kent.overstreet@gmail.com>
Fri, 23 Nov 2018 08:05:20 +0000 (03:05 -0500)
41 files changed:
.bcachefs_revision
include/linux/blkdev.h
libbcachefs/alloc_background.c
libbcachefs/alloc_background.h
libbcachefs/alloc_foreground.c
libbcachefs/alloc_foreground.h
libbcachefs/alloc_types.h
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bkey.h
libbcachefs/bkey_methods.c
libbcachefs/btree_gc.c
libbcachefs/btree_gc.h
libbcachefs/btree_iter.c
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_leaf.c
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/buckets_types.h
libbcachefs/compress.c
libbcachefs/disk_groups.h
libbcachefs/ec.c [new file with mode: 0644]
libbcachefs/ec.h [new file with mode: 0644]
libbcachefs/ec_types.h [new file with mode: 0644]
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/extents_types.h
libbcachefs/fs-io.c
libbcachefs/io.c
libbcachefs/journal.c
libbcachefs/journal.h
libbcachefs/journal_io.c
libbcachefs/journal_reclaim.c
libbcachefs/journal_types.h
libbcachefs/opts.c
libbcachefs/opts.h
libbcachefs/recovery.c
libbcachefs/replicas.c
libbcachefs/super-io.c
libbcachefs/super.c
libbcachefs/sysfs.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 48cf256fa03ab78e8a709ece837d52894d156d44..abb9e489a2dedaa6712b26ec3bf3acd4295f643d 100644
@@ -1 +1 @@
-a9f14c773fb122a4b283fc7b79d9f98703a18890
+da7fefde294e3c56359ee498a62a77182a4733cd
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1d5581dc1b918e735bc56e7659775e6e51ca6e42..e4982f9673bb493c50ea1d3d33301ea78e07e5ff 100644
@@ -6,6 +6,8 @@
 #include <linux/kobject.h>
 #include <linux/types.h>
 
+#define BIO_MAX_PAGES  256
+
 typedef unsigned fmode_t;
 
 struct bio;
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 41ea73aeb33639e64cad616d7b27c4b01fceda6e..899291633f98f87fadaf2832cade166d91131458 100644
@@ -9,6 +9,7 @@
 #include "buckets.h"
 #include "clock.h"
 #include "debug.h"
+#include "ec.h"
 #include "error.h"
 #include "journal_io.h"
 
@@ -82,7 +83,8 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
        case BCH_ALLOC: {
                struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
 
-               if (bch_alloc_val_u64s(a.v) != bkey_val_u64s(a.k))
+               /* allow for unknown fields */
+               if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v))
                        return "incorrect value size";
                break;
        }
@@ -235,6 +237,7 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
        __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key;
        struct bucket *g;
        struct bkey_i_alloc *a;
+       int ret;
        u8 *d;
 
        percpu_down_read_preempt_disable(&c->usage_lock);
@@ -258,32 +261,50 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
 
        bch2_btree_iter_set_pos(iter, a->k.p);
 
-       return bch2_btree_insert_at(c, NULL, journal_seq,
-                                   BTREE_INSERT_NOFAIL|
-                                   BTREE_INSERT_USE_RESERVE|
-                                   BTREE_INSERT_USE_ALLOC_RESERVE|
-                                   flags,
-                                   BTREE_INSERT_ENTRY(iter, &a->k_i));
+       ret = bch2_btree_insert_at(c, NULL, journal_seq,
+                                  BTREE_INSERT_NOFAIL|
+                                  BTREE_INSERT_USE_RESERVE|
+                                  BTREE_INSERT_USE_ALLOC_RESERVE|
+                                  flags,
+                                  BTREE_INSERT_ENTRY(iter, &a->k_i));
+
+       if (!ret && ca->buckets_written)
+               set_bit(b, ca->buckets_written);
+
+       return ret;
 }
 
-int bch2_alloc_replay_key(struct bch_fs *c, struct bpos pos)
+int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
 {
        struct bch_dev *ca;
        struct btree_iter iter;
        int ret;
 
-       if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode])
+       if (k->k.p.inode >= c->sb.nr_devices ||
+           !c->devs[k->k.p.inode])
                return 0;
 
-       ca = bch_dev_bkey_exists(c, pos.inode);
+       ca = bch_dev_bkey_exists(c, k->k.p.inode);
 
-       if (pos.offset >= ca->mi.nbuckets)
+       if (k->k.p.offset >= ca->mi.nbuckets)
                return 0;
 
-       bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN,
-                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, k->k.p,
+                            BTREE_ITER_INTENT);
+
+       ret = bch2_btree_iter_traverse(&iter);
+       if (ret)
+               goto err;
+
+       /* check buckets_written with btree node locked: */
 
-       ret = __bch2_alloc_write_key(c, ca, pos.offset, &iter, NULL, 0);
+       ret = test_bit(k->k.p.offset, ca->buckets_written)
+               ? 0
+               : bch2_btree_insert_at(c, NULL, NULL,
+                                      BTREE_INSERT_NOFAIL|
+                                      BTREE_INSERT_JOURNAL_REPLAY,
+                                      BTREE_INSERT_ENTRY(&iter, k));
+err:
        bch2_btree_iter_unlock(&iter);
        return ret;
 }
@@ -909,12 +930,6 @@ static int bch2_allocator_thread(void *arg)
                pr_debug("free_inc now empty");
 
                do {
-                       if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
-                               up_read(&c->gc_lock);
-                               bch_err(ca, "gc failure");
-                               goto stop;
-                       }
-
                        /*
                         * Find some buckets that we can invalidate, either
                         * they're completely unused, or only contain clean data
@@ -1112,6 +1127,24 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
        }
        mutex_unlock(&c->btree_reserve_cache_lock);
 
+       while (1) {
+               struct open_bucket *ob;
+
+               spin_lock(&c->freelist_lock);
+               if (!ca->open_buckets_partial_nr) {
+                       spin_unlock(&c->freelist_lock);
+                       break;
+               }
+               ob = c->open_buckets +
+                       ca->open_buckets_partial[--ca->open_buckets_partial_nr];
+               ob->on_partial_list = false;
+               spin_unlock(&c->freelist_lock);
+
+               bch2_open_bucket_put(c, ob);
+       }
+
+       bch2_ec_stop_dev(c, ca);
+
        /*
         * Wake up threads that were blocked on allocation, so they can notice
         * the device can no longer be removed and the capacity has changed:
@@ -1254,9 +1287,6 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
        bool invalidating_data = false;
        int ret = 0;
 
-       if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
-               return -1;
-
        if (test_alloc_startup(c)) {
                invalidating_data = true;
                goto not_enough;
@@ -1264,51 +1294,47 @@ static int __bch2_fs_allocator_start(struct bch_fs *c)
 
        /* Scan for buckets that are already invalidated: */
        for_each_rw_member(ca, c, dev_iter) {
-               struct btree_iter iter;
+               struct bucket_array *buckets;
                struct bucket_mark m;
-               struct bkey_s_c k;
 
-               for_each_btree_key(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0), 0, k) {
-                       if (k.k->type != BCH_ALLOC)
-                               continue;
+               down_read(&ca->bucket_lock);
+               percpu_down_read_preempt_disable(&c->usage_lock);
 
-                       bu = k.k->p.offset;
-                       m = READ_ONCE(bucket(ca, bu)->mark);
+               buckets = bucket_array(ca);
+
+               for (bu = buckets->first_bucket;
+                    bu < buckets->nbuckets; bu++) {
+                       m = READ_ONCE(buckets->b[bu].mark);
 
-                       if (!is_available_bucket(m) || m.cached_sectors)
+                       if (!m.gen_valid ||
+                           !is_available_bucket(m) ||
+                           m.cached_sectors)
                                continue;
 
-                       percpu_down_read_preempt_disable(&c->usage_lock);
                        bch2_mark_alloc_bucket(c, ca, bu, true,
-                                       gc_pos_alloc(c, NULL),
-                                       BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
-                                       BCH_BUCKET_MARK_GC_LOCK_HELD);
-                       percpu_up_read_preempt_enable(&c->usage_lock);
+                                       gc_pos_alloc(c, NULL), 0);
 
                        fifo_push(&ca->free_inc, bu);
 
-                       if (fifo_full(&ca->free_inc))
+                       discard_invalidated_buckets(c, ca);
+
+                       if (fifo_full(&ca->free[RESERVE_BTREE]))
                                break;
                }
-               bch2_btree_iter_unlock(&iter);
+               percpu_up_read_preempt_enable(&c->usage_lock);
+               up_read(&ca->bucket_lock);
        }
 
        /* did we find enough buckets? */
        for_each_rw_member(ca, c, dev_iter)
-               if (fifo_used(&ca->free_inc) < ca->free[RESERVE_BTREE].size) {
+               if (!fifo_full(&ca->free[RESERVE_BTREE])) {
                        percpu_ref_put(&ca->io_ref);
                        goto not_enough;
                }
 
        return 0;
 not_enough:
-       pr_debug("did not find enough empty buckets; issuing discards");
-
-       /* clear out free_inc, we'll be using it again below: */
-       for_each_rw_member(ca, c, dev_iter)
-               discard_invalidated_buckets(c, ca);
-
-       pr_debug("scanning for reclaimable buckets");
+       pr_debug("not enough empty buckets; scanning for reclaimable buckets");
 
        for_each_rw_member(ca, c, dev_iter) {
                find_reclaimable_buckets(c, ca);
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
index 2de9357c7b019fc7b9c2ab645c53ad0198c03b8c..6911fa69b0770df168178d98f8f7665d0bdef065 100644
@@ -16,7 +16,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 }
 
 int bch2_alloc_read(struct bch_fs *, struct list_head *);
-int bch2_alloc_replay_key(struct bch_fs *, struct bpos);
+int bch2_alloc_replay_key(struct bch_fs *, struct bkey_i *);
 
 static inline void bch2_wake_allocator(struct bch_dev *ca)
 {
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index 06859960d906910daf09f14b33e78aeac1b8dcf0..91ab33690063edb8bf5fa2869755606ff29cc668 100644
@@ -61,6 +61,7 @@
 #include "clock.h"
 #include "debug.h"
 #include "disk_groups.h"
+#include "ec.h"
 #include "io.h"
 
 #include <linux/math64.h>
@@ -94,6 +95,11 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
 {
        struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
 
+       if (ob->ec) {
+               bch2_ec_bucket_written(c, ob);
+               return;
+       }
+
        percpu_down_read_preempt_disable(&c->usage_lock);
        spin_lock(&ob->lock);
 
@@ -113,6 +119,19 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
        closure_wake_up(&c->open_buckets_wait);
 }
 
+void bch2_open_bucket_write_error(struct bch_fs *c,
+                                 struct open_buckets *obs,
+                                 unsigned dev)
+{
+       struct open_bucket *ob;
+       unsigned i;
+
+       open_bucket_for_each(c, obs, ob, i)
+               if (ob->ptr.dev == dev &&
+                   ob->ec)
+                       bch2_ec_bucket_cancel(c, ob);
+}
+
 static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
 {
        struct open_bucket *ob;
@@ -128,15 +147,17 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
 }
 
 static void open_bucket_free_unused(struct bch_fs *c,
-                                   struct write_point *wp,
-                                   struct open_bucket *ob)
+                                   struct open_bucket *ob,
+                                   bool may_realloc)
 {
        struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
 
        BUG_ON(ca->open_buckets_partial_nr >=
               ARRAY_SIZE(ca->open_buckets_partial));
 
-       if (wp->type == BCH_DATA_USER) {
+       if (ca->open_buckets_partial_nr <
+           ARRAY_SIZE(ca->open_buckets_partial) &&
+           may_realloc) {
                spin_lock(&c->freelist_lock);
                ob->on_partial_list = true;
                ca->open_buckets_partial[ca->open_buckets_partial_nr++] =
@@ -284,18 +305,18 @@ out:
        return ob;
 }
 
-static int __dev_alloc_cmp(struct write_point *wp,
-                          unsigned l, unsigned r)
+static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
+                           unsigned l, unsigned r)
 {
-       return ((wp->next_alloc[l] > wp->next_alloc[r]) -
-               (wp->next_alloc[l] < wp->next_alloc[r]));
+       return ((stripe->next_alloc[l] > stripe->next_alloc[r]) -
+               (stripe->next_alloc[l] < stripe->next_alloc[r]));
 }
 
-#define dev_alloc_cmp(l, r) __dev_alloc_cmp(wp, l, r)
+#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r)
 
-struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
-                                        struct write_point *wp,
-                                        struct bch_devs_mask *devs)
+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
+                                         struct dev_stripe_state *stripe,
+                                         struct bch_devs_mask *devs)
 {
        struct dev_alloc_list ret = { .nr = 0 };
        struct bch_dev *ca;
@@ -304,14 +325,14 @@ struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c,
        for_each_member_device_rcu(ca, c, i, devs)
                ret.devs[ret.nr++] = i;
 
-       bubble_sort(ret.devs, ret.nr, dev_alloc_cmp);
+       bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
        return ret;
 }
 
-void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
-                    struct write_point *wp)
+void bch2_dev_stripe_increment(struct bch_fs *c, struct bch_dev *ca,
+                              struct dev_stripe_state *stripe)
 {
-       u64 *v = wp->next_alloc + ca->dev_idx;
+       u64 *v = stripe->next_alloc + ca->dev_idx;
        u64 free_space = dev_buckets_free(c, ca);
        u64 free_space_inv = free_space
                ? div64_u64(1ULL << 48, free_space)
@@ -323,26 +344,30 @@ void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca,
        else
                *v = U64_MAX;
 
-       for (v = wp->next_alloc;
-            v < wp->next_alloc + ARRAY_SIZE(wp->next_alloc); v++)
+       for (v = stripe->next_alloc;
+            v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++)
                *v = *v < scale ? 0 : *v - scale;
 }
 
+#define BUCKET_MAY_ALLOC_PARTIAL       (1 << 0)
+#define BUCKET_ALLOC_USE_DURABILITY    (1 << 1)
+
 static int bch2_bucket_alloc_set(struct bch_fs *c,
                                 struct open_buckets *ptrs,
-                                struct write_point *wp,
+                                struct dev_stripe_state *stripe,
                                 struct bch_devs_mask *devs_may_alloc,
                                 unsigned nr_replicas,
                                 unsigned *nr_effective,
                                 bool *have_cache,
                                 enum alloc_reserve reserve,
+                                unsigned flags,
                                 struct closure *cl)
 {
        struct dev_alloc_list devs_sorted =
-               bch2_wp_alloc_list(c, wp, devs_may_alloc);
+               bch2_dev_alloc_list(c, stripe, devs_may_alloc);
        struct bch_dev *ca;
        bool alloc_failure = false;
-       unsigned i;
+       unsigned i, durability;
 
        BUG_ON(*nr_effective >= nr_replicas);
 
@@ -353,13 +378,11 @@ static int bch2_bucket_alloc_set(struct bch_fs *c,
                if (!ca)
                        continue;
 
-               if (!ca->mi.durability &&
-                   (*have_cache ||
-                    wp->type != BCH_DATA_USER))
+               if (!ca->mi.durability && *have_cache)
                        continue;
 
                ob = bch2_bucket_alloc(c, ca, reserve,
-                                      wp->type == BCH_DATA_USER, cl);
+                               flags & BUCKET_MAY_ALLOC_PARTIAL, cl);
                if (IS_ERR(ob)) {
                        enum bucket_alloc_ret ret = -PTR_ERR(ob);
 
@@ -374,13 +397,16 @@ static int bch2_bucket_alloc_set(struct bch_fs *c,
                        continue;
                }
 
+               durability = (flags & BUCKET_ALLOC_USE_DURABILITY)
+                       ? ca->mi.durability : 1;
+
                __clear_bit(ca->dev_idx, devs_may_alloc->d);
-               *nr_effective   += ca->mi.durability;
-               *have_cache     |= !ca->mi.durability;
+               *nr_effective   += durability;
+               *have_cache     |= !durability;
 
                ob_push(c, ptrs, ob);
 
-               bch2_wp_rescale(c, ca, wp);
+               bch2_dev_stripe_increment(c, ca, stripe);
 
                if (*nr_effective >= nr_replicas)
                        return 0;
@@ -389,15 +415,150 @@ static int bch2_bucket_alloc_set(struct bch_fs *c,
        return alloc_failure ? -ENOSPC : -EROFS;
 }
 
+/* Allocate from stripes: */
+
+/*
+ * XXX: use a higher watermark for allocating open buckets here:
+ */
+static int ec_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
+{
+       struct bch_devs_mask devs;
+       struct open_bucket *ob;
+       unsigned i, nr_have = 0, nr_data =
+               min_t(unsigned, h->nr_active_devs,
+                     EC_STRIPE_MAX) - h->redundancy;
+       bool have_cache = true;
+       int ret = 0;
+
+       BUG_ON(h->blocks.nr > nr_data);
+       BUG_ON(h->parity.nr > h->redundancy);
+
+       devs = h->devs;
+
+       open_bucket_for_each(c, &h->parity, ob, i)
+               __clear_bit(ob->ptr.dev, devs.d);
+       open_bucket_for_each(c, &h->blocks, ob, i)
+               __clear_bit(ob->ptr.dev, devs.d);
+
+       percpu_down_read_preempt_disable(&c->usage_lock);
+       rcu_read_lock();
+
+       if (h->parity.nr < h->redundancy) {
+               nr_have = h->parity.nr;
+
+               ret = bch2_bucket_alloc_set(c, &h->parity,
+                                           &h->parity_stripe,
+                                           &devs,
+                                           h->redundancy,
+                                           &nr_have,
+                                           &have_cache,
+                                           RESERVE_NONE,
+                                           0,
+                                           NULL);
+               if (ret)
+                       goto err;
+       }
+
+       if (h->blocks.nr < nr_data) {
+               nr_have = h->blocks.nr;
+
+               ret = bch2_bucket_alloc_set(c, &h->blocks,
+                                           &h->block_stripe,
+                                           &devs,
+                                           nr_data,
+                                           &nr_have,
+                                           &have_cache,
+                                           RESERVE_NONE,
+                                           0,
+                                           NULL);
+               if (ret)
+                       goto err;
+       }
+
+       rcu_read_unlock();
+       percpu_up_read_preempt_enable(&c->usage_lock);
+
+       return bch2_ec_stripe_new_alloc(c, h);
+err:
+       rcu_read_unlock();
+       percpu_up_read_preempt_enable(&c->usage_lock);
+       return -1;
+}
+
+/*
+ * if we can't allocate a new stripe because there are already too many
+ * partially filled stripes, force allocating from an existing stripe even when
+ * it's to a device we don't want:
+ */
+
+static void bucket_alloc_from_stripe(struct bch_fs *c,
+                                    struct open_buckets *ptrs,
+                                    struct write_point *wp,
+                                    struct bch_devs_mask *devs_may_alloc,
+                                    u16 target,
+                                    unsigned erasure_code,
+                                    unsigned nr_replicas,
+                                    unsigned *nr_effective,
+                                    bool *have_cache)
+{
+       struct dev_alloc_list devs_sorted;
+       struct ec_stripe_head *h;
+       struct open_bucket *ob;
+       struct bch_dev *ca;
+       unsigned i, ec_idx;
+
+       if (!erasure_code)
+               return;
+
+       if (nr_replicas < 2)
+               return;
+
+       if (ec_open_bucket(c, ptrs))
+               return;
+
+       h = bch2_ec_stripe_head_get(c, target, erasure_code, nr_replicas - 1);
+       if (!h)
+               return;
+
+       if (!h->s && ec_stripe_alloc(c, h))
+               goto out_put_head;
+
+       rcu_read_lock();
+       devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
+       rcu_read_unlock();
+
+       for (i = 0; i < devs_sorted.nr; i++)
+               open_bucket_for_each(c, &h->s->blocks, ob, ec_idx)
+                       if (ob->ptr.dev == devs_sorted.devs[i] &&
+                           !test_and_set_bit(ec_idx, h->s->blocks_allocated))
+                               goto got_bucket;
+       goto out_put_head;
+got_bucket:
+       ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+
+       ob->ec_idx      = ec_idx;
+       ob->ec          = h->s;
+
+       __clear_bit(ob->ptr.dev, devs_may_alloc->d);
+       *nr_effective   += ca->mi.durability;
+       *have_cache     |= !ca->mi.durability;
+
+       ob_push(c, ptrs, ob);
+       atomic_inc(&h->s->pin);
+out_put_head:
+       bch2_ec_stripe_head_put(h);
+}
+
 /* Sector allocator */
 
-static int get_buckets_from_writepoint(struct bch_fs *c,
-                                      struct open_buckets *ptrs,
-                                      struct write_point *wp,
-                                      struct bch_devs_mask *devs_may_alloc,
-                                      unsigned nr_replicas,
-                                      unsigned *nr_effective,
-                                      bool *have_cache)
+static void get_buckets_from_writepoint(struct bch_fs *c,
+                                       struct open_buckets *ptrs,
+                                       struct write_point *wp,
+                                       struct bch_devs_mask *devs_may_alloc,
+                                       unsigned nr_replicas,
+                                       unsigned *nr_effective,
+                                       bool *have_cache,
+                                       bool need_ec)
 {
        struct open_buckets ptrs_skip = { .nr = 0 };
        struct open_bucket *ob;
@@ -409,7 +570,8 @@ static int get_buckets_from_writepoint(struct bch_fs *c,
                if (*nr_effective < nr_replicas &&
                    test_bit(ob->ptr.dev, devs_may_alloc->d) &&
                    (ca->mi.durability ||
-                    (wp->type == BCH_DATA_USER && !*have_cache))) {
+                    (wp->type == BCH_DATA_USER && !*have_cache)) &&
+                   (ob->ec || !need_ec)) {
                        __clear_bit(ob->ptr.dev, devs_may_alloc->d);
                        *nr_effective   += ca->mi.durability;
                        *have_cache     |= !ca->mi.durability;
@@ -420,8 +582,6 @@ static int get_buckets_from_writepoint(struct bch_fs *c,
                }
        }
        wp->ptrs = ptrs_skip;
-
-       return *nr_effective < nr_replicas ? -ENOSPC : 0;
 }
 
 static int open_bucket_add_buckets(struct bch_fs *c,
@@ -429,22 +589,25 @@ static int open_bucket_add_buckets(struct bch_fs *c,
                                   struct write_point *wp,
                                   struct bch_devs_list *devs_have,
                                   u16 target,
+                                  unsigned erasure_code,
                                   unsigned nr_replicas,
                                   unsigned *nr_effective,
                                   bool *have_cache,
                                   enum alloc_reserve reserve,
-                                  struct closure *cl)
+                                  struct closure *_cl)
 {
        struct bch_devs_mask devs;
-       const struct bch_devs_mask *t;
        struct open_bucket *ob;
-       unsigned i;
+       struct closure *cl = NULL;
+       unsigned i, flags = BUCKET_ALLOC_USE_DURABILITY;
        int ret;
 
-       percpu_down_read_preempt_disable(&c->usage_lock);
-       rcu_read_lock();
+       if (wp->type == BCH_DATA_USER)
+               flags |= BUCKET_MAY_ALLOC_PARTIAL;
 
-       devs = c->rw_devs[wp->type];
+       rcu_read_lock();
+       devs = target_rw_devs(c, wp->type, target);
+       rcu_read_unlock();
 
        /* Don't allocate from devices we already have pointers to: */
        for (i = 0; i < devs_have->nr; i++)
@@ -453,50 +616,83 @@ static int open_bucket_add_buckets(struct bch_fs *c,
        open_bucket_for_each(c, ptrs, ob, i)
                __clear_bit(ob->ptr.dev, devs.d);
 
-       t = bch2_target_to_mask(c, target);
-       if (t)
-               bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
+       if (erasure_code) {
+               get_buckets_from_writepoint(c, ptrs, wp, &devs,
+                                           nr_replicas, nr_effective,
+                                           have_cache, true);
+               if (*nr_effective >= nr_replicas)
+                       return 0;
 
-       ret = get_buckets_from_writepoint(c, ptrs, wp, &devs,
-                               nr_replicas, nr_effective, have_cache);
-       if (!ret)
-               goto out;
+               bucket_alloc_from_stripe(c, ptrs, wp, &devs,
+                                        target, erasure_code,
+                                        nr_replicas, nr_effective,
+                                        have_cache);
+               if (*nr_effective >= nr_replicas)
+                       return 0;
+       }
+
+       get_buckets_from_writepoint(c, ptrs, wp, &devs,
+                                   nr_replicas, nr_effective,
+                                   have_cache, false);
+       if (*nr_effective >= nr_replicas)
+               return 0;
+
+       percpu_down_read_preempt_disable(&c->usage_lock);
+       rcu_read_lock();
 
+retry_blocking:
        /*
         * Try nonblocking first, so that if one device is full we'll try from
         * other devices:
         */
-       ret = bch2_bucket_alloc_set(c, ptrs, wp, &devs,
+       ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs,
                                nr_replicas, nr_effective, have_cache,
-                               reserve, NULL);
-       if (!ret || ret == -EROFS || !cl)
-               goto out;
+                               reserve, flags, cl);
+       if (ret && ret != -EROFS && !cl && _cl) {
+               cl = _cl;
+               goto retry_blocking;
+       }
 
-       ret = bch2_bucket_alloc_set(c, ptrs, wp, &devs,
-                               nr_replicas, nr_effective, have_cache,
-                               reserve, cl);
-out:
        rcu_read_unlock();
        percpu_up_read_preempt_enable(&c->usage_lock);
 
        return ret;
 }
 
-void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
-                         struct write_point *wp)
+void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca,
+                               struct open_buckets *obs,
+                               enum bch_data_type data_type)
 {
        struct open_buckets ptrs = { .nr = 0 };
-       struct open_bucket *ob;
-       unsigned i;
+       struct open_bucket *ob, *ob2;
+       unsigned i, j;
 
-       mutex_lock(&wp->lock);
-       open_bucket_for_each(c, &wp->ptrs, ob, i)
-               if (!ca || ob->ptr.dev == ca->dev_idx)
-                       open_bucket_free_unused(c, wp, ob);
+       open_bucket_for_each(c, obs, ob, i) {
+               bool drop = !ca || ob->ptr.dev == ca->dev_idx;
+
+               if (!drop && ob->ec) {
+                       mutex_lock(&ob->ec->lock);
+                       open_bucket_for_each(c, &ob->ec->blocks, ob2, j)
+                               drop |= ob2->ptr.dev == ca->dev_idx;
+                       open_bucket_for_each(c, &ob->ec->parity, ob2, j)
+                               drop |= ob2->ptr.dev == ca->dev_idx;
+                       mutex_unlock(&ob->ec->lock);
+               }
+
+               if (drop)
+                       bch2_open_bucket_put(c, ob);
                else
                        ob_push(c, &ptrs, ob);
+       }
 
-       wp->ptrs = ptrs;
+       *obs = ptrs;
+}
+
+void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca,
+                         struct write_point *wp)
+{
+       mutex_lock(&wp->lock);
+       bch2_open_buckets_stop_dev(c, ca, &wp->ptrs, wp->type);
        mutex_unlock(&wp->lock);
 }
 
@@ -629,6 +825,7 @@ out:
  */
 struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
                                unsigned target,
+                               unsigned erasure_code,
                                struct write_point_specifier write_point,
                                struct bch_devs_list *devs_have,
                                unsigned nr_replicas,
@@ -648,26 +845,37 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
        BUG_ON(!nr_replicas || !nr_replicas_required);
 retry:
        write_points_nr = c->write_points_nr;
+
        wp = writepoint_find(c, write_point.v);
 
+       /* metadata may not allocate on cache devices: */
+       if (wp->type != BCH_DATA_USER)
+               have_cache = true;
+
        if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
-               ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, target,
+               ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+                                             target, erasure_code,
                                              nr_replicas, &nr_effective,
                                              &have_cache, reserve, cl);
        } else {
-               ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, target,
+               ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+                                             target, erasure_code,
                                              nr_replicas, &nr_effective,
                                              &have_cache, reserve, NULL);
                if (!ret)
                        goto alloc_done;
 
-               ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, 0,
+               ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have,
+                                             0, erasure_code,
                                              nr_replicas, &nr_effective,
                                              &have_cache, reserve, cl);
        }
 alloc_done:
        BUG_ON(!ret && nr_effective < nr_replicas);
 
+       if (erasure_code && !ec_open_bucket(c, &ptrs))
+               pr_debug("failed to get ec bucket: ret %u", ret);
+
        if (ret == -EROFS &&
            nr_effective >= nr_replicas_required)
                ret = 0;
@@ -677,7 +885,7 @@ alloc_done:
 
        /* Free buckets we didn't use: */
        open_bucket_for_each(c, &wp->ptrs, ob, i)
-               open_bucket_free_unused(c, wp, ob);
+               open_bucket_free_unused(c, ob, wp->type == BCH_DATA_USER);
 
        wp->ptrs = ptrs;
 
@@ -696,7 +904,8 @@ err:
                if (ptrs.nr < ARRAY_SIZE(ptrs.v))
                        ob_push(c, &ptrs, ob);
                else
-                       open_bucket_free_unused(c, wp, ob);
+                       open_bucket_free_unused(c, ob,
+                                       wp->type == BCH_DATA_USER);
        wp->ptrs = ptrs;
 
        mutex_unlock(&wp->lock);
diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h
index 729afc922b7f869b09580c9cff6112a853a320dc..a332e9d70def3c463191e6d812d6a5c929ab12b3 100644
@@ -16,11 +16,11 @@ struct dev_alloc_list {
        u8              devs[BCH_SB_MEMBERS_MAX];
 };
 
-struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *,
-                                        struct write_point *,
-                                        struct bch_devs_mask *);
-void bch2_wp_rescale(struct bch_fs *, struct bch_dev *,
-                    struct write_point *);
+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
+                                         struct dev_stripe_state *,
+                                         struct bch_devs_mask *);
+void bch2_dev_stripe_increment(struct bch_fs *, struct bch_dev *,
+                              struct dev_stripe_state *);
 
 long bch2_bucket_alloc_new_fs(struct bch_dev *);
 
@@ -42,6 +42,22 @@ static inline void ob_push(struct bch_fs *c, struct open_buckets *obs,
             ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true);        \
             (_i)++)
 
+static inline struct open_bucket *ec_open_bucket(struct bch_fs *c,
+                                                struct open_buckets *obs)
+{
+       struct open_bucket *ob;
+       unsigned i;
+
+       open_bucket_for_each(c, obs, ob, i)
+               if (ob->ec)
+                       return ob;
+
+       return NULL;
+}
+
+void bch2_open_bucket_write_error(struct bch_fs *,
+                       struct open_buckets *, unsigned);
+
 void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
 
 static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
@@ -75,7 +91,7 @@ static inline void bch2_open_bucket_get(struct bch_fs *c,
 }
 
 struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
-                                            unsigned,
+                                            unsigned, unsigned,
                                             struct write_point_specifier,
                                             struct bch_devs_list *,
                                             unsigned, unsigned,
@@ -87,6 +103,9 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
                                    struct bkey_i_extent *, unsigned);
 void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
 
+void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *,
+                               struct open_buckets *, enum bch_data_type);
+
 void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *,
                          struct write_point *);
 
diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h
index 110663ffc17052786d00412a3e92acc7066595ea..6f17f094c21e47f6547d61fed45097c0f766649e 100644
@@ -7,6 +7,8 @@
 #include "clock_types.h"
 #include "fifo.h"
 
+struct ec_bucket_buf;
+
 /* There's two of these clocks, one for reads and one for writes: */
 struct bucket_clock {
        /*
@@ -55,8 +57,10 @@ struct open_bucket {
        u8                      freelist;
        bool                    valid;
        bool                    on_partial_list;
+       u8                      ec_idx;
        unsigned                sectors_free;
        struct bch_extent_ptr   ptr;
+       struct ec_stripe_new    *ec;
 };
 
 #define OPEN_BUCKET_LIST_MAX   15
@@ -66,18 +70,23 @@ struct open_buckets {
        u8                      v[OPEN_BUCKET_LIST_MAX];
 };
 
+struct dev_stripe_state {
+       u64                     next_alloc[BCH_SB_MEMBERS_MAX];
+};
+
 struct write_point {
        struct hlist_node       node;
        struct mutex            lock;
        u64                     last_used;
        unsigned long           write_point;
        enum bch_data_type      type;
+       bool                    is_ec;
 
        /* calculated based on how many pointers we're actually going to use: */
        unsigned                sectors_free;
 
        struct open_buckets     ptrs;
-       u64                     next_alloc[BCH_SB_MEMBERS_MAX];
+       struct dev_stripe_state stripe;
 };
 
 struct write_point_specifier {
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index e23f45e88e96a11c77f91dbc82273c2eb154b818..05891a011b8409c0942ba0db5eb7de409d7dd4ba 100644
 
 #include <linux/dynamic_fault.h>
 
-#define bch2_fs_init_fault(name)                                               \
+#define bch2_fs_init_fault(name)                                       \
        dynamic_fault("bcachefs:bch_fs_init:" name)
 #define bch2_meta_read_fault(name)                                     \
         dynamic_fault("bcachefs:meta:read:" name)
@@ -270,7 +270,10 @@ do {                                                                       \
        BCH_DEBUG_PARAM(test_alloc_startup,                             \
                "Force allocator startup to use the slowpath where it"  \
                "can't find enough free buckets without invalidating"   \
-               "cached data")
+               "cached data")                                          \
+       BCH_DEBUG_PARAM(force_reconstruct_read,                         \
+               "Force reads to use the reconstruct path, when reading" \
+               "from erasure coded extents")
 
 #define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
 
@@ -308,6 +311,7 @@ enum bch_time_stats {
 #include "btree_types.h"
 #include "buckets_types.h"
 #include "clock_types.h"
+#include "ec_types.h"
 #include "journal_types.h"
 #include "keylist_types.h"
 #include "quota_types.h"
@@ -330,13 +334,16 @@ enum gc_phase {
        GC_PHASE_START,
        GC_PHASE_SB,
 
-#define DEF_BTREE_ID(kwd, val, name) GC_PHASE_BTREE_##kwd,
-       DEFINE_BCH_BTREE_IDS()
-#undef DEF_BTREE_ID
+       GC_PHASE_BTREE_EC,
+       GC_PHASE_BTREE_EXTENTS,
+       GC_PHASE_BTREE_INODES,
+       GC_PHASE_BTREE_DIRENTS,
+       GC_PHASE_BTREE_XATTRS,
+       GC_PHASE_BTREE_ALLOC,
+       GC_PHASE_BTREE_QUOTAS,
 
        GC_PHASE_PENDING_DELETE,
        GC_PHASE_ALLOC,
-       GC_PHASE_DONE
 };
 
 struct gc_pos {
@@ -381,14 +388,14 @@ struct bch_dev {
         * gc_lock, for device resize - holding any is sufficient for access:
         * Or rcu_read_lock(), but only for ptr_stale():
         */
-       struct bucket_array __rcu *buckets;
+       struct bucket_array __rcu *buckets[2];
        unsigned long           *buckets_dirty;
+       unsigned long           *buckets_written;
        /* most out of date gen in the btree */
        u8                      *oldest_gens;
        struct rw_semaphore     bucket_lock;
 
-       struct bch_dev_usage __percpu *usage_percpu;
-       struct bch_dev_usage    usage_cached;
+       struct bch_dev_usage __percpu *usage[2];
 
        /* Allocator: */
        struct task_struct __rcu *alloc_thread;
@@ -466,7 +473,6 @@ enum {
 
        /* errors: */
        BCH_FS_ERROR,
-       BCH_FS_GC_FAILURE,
 
        /* misc: */
        BCH_FS_BDEV_MOUNTED,
@@ -602,8 +608,8 @@ struct bch_fs {
 
        atomic64_t              sectors_available;
 
-       struct bch_fs_usage __percpu *usage_percpu;
-       struct bch_fs_usage     usage_cached;
+       struct bch_fs_usage __percpu *usage[2];
+
        struct percpu_rw_semaphore usage_lock;
 
        struct closure_waitlist freelist_wait;
@@ -644,9 +650,6 @@ struct bch_fs {
         *
         * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.)
         *
-        * gc_cur_phase == GC_PHASE_DONE indicates that gc is finished/not
-        * currently running, and gc marks are currently valid
-        *
         * Protected by gc_pos_lock. Only written to by GC thread, so GC thread
         * can read without a lock.
         */
@@ -681,6 +684,21 @@ struct bch_fs {
        /* REBALANCE */
        struct bch_fs_rebalance rebalance;
 
+       /* ERASURE CODING */
+       struct list_head        ec_new_stripe_list;
+       struct mutex            ec_new_stripe_lock;
+
+       GENRADIX(struct ec_stripe) ec_stripes;
+       struct mutex            ec_stripes_lock;
+
+       ec_stripes_heap         ec_stripes_heap;
+       spinlock_t              ec_stripes_heap_lock;
+
+       struct bio_set          ec_bioset;
+
+       struct work_struct      ec_stripe_delete_work;
+       struct llist_head       ec_stripe_delete_list;
+
        /* VFS IO PATH - fs-io.c */
        struct bio_set          writepage_bioset;
        struct bio_set          dio_write_bioset;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 56fef9e41eb5f372af3f9c44c1ebf62a5ad183f8..c462ab277b608cbab857a11fb0d1ae9cbb87b5af 100644
@@ -233,6 +233,9 @@ struct bkey_packed {
 } __attribute__((packed, aligned(8)));
 
 #define BKEY_U64s                      (sizeof(struct bkey) / sizeof(__u64))
+#define BKEY_U64s_MAX                  U8_MAX
+#define BKEY_VAL_U64s_MAX              (BKEY_U64s_MAX - BKEY_U64s)
+
 #define KEY_PACKED_BITS_START          24
 
 #define KEY_FORMAT_LOCAL_BTREE         0
@@ -460,8 +463,9 @@ enum bch_compression_type {
        x(ptr,                  0)              \
        x(crc32,                1)              \
        x(crc64,                2)              \
-       x(crc128,               3)
-#define BCH_EXTENT_ENTRY_MAX   4
+       x(crc128,               3)              \
+       x(stripe_ptr,           4)
+#define BCH_EXTENT_ENTRY_MAX   5
 
 enum bch_extent_entry_type {
 #define x(f, n) BCH_EXTENT_ENTRY_##f = n,
@@ -552,7 +556,7 @@ struct bch_extent_ptr {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
        __u64                   type:1,
                                cached:1,
-                               erasure_coded:1,
+                               unused:1,
                                reservation:1,
                                offset:44, /* 8 petabytes */
                                dev:8,
@@ -562,23 +566,35 @@ struct bch_extent_ptr {
                                dev:8,
                                offset:44,
                                reservation:1,
-                               erasure_coded:1,
+                               unused:1,
                                cached:1,
                                type:1;
 #endif
 } __attribute__((packed, aligned(8)));
 
-struct bch_extent_reservation {
+struct bch_extent_stripe_ptr {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
        __u64                   type:5,
-                               unused:23,
+                               block:8,
+                               idx:51;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+       __u64                   idx:51,
+                               block:8,
+                               type:5;
+#endif
+};
+
+struct bch_extent_reservation {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+       __u64                   type:6,
+                               unused:22,
                                replicas:4,
                                generation:32;
 #elif defined (__BIG_ENDIAN_BITFIELD)
        __u64                   generation:32,
                                replicas:4,
-                               unused:23,
-                               type:5;
+                               unused:22,
+                               type:6;
 #endif
 };
 
@@ -701,7 +717,8 @@ BKEY_VAL_TYPE(inode_generation,     BCH_INODE_GENERATION);
        BCH_INODE_FIELD(bi_data_replicas,               8)      \
        BCH_INODE_FIELD(bi_promote_target,              16)     \
        BCH_INODE_FIELD(bi_foreground_target,           16)     \
-       BCH_INODE_FIELD(bi_background_target,           16)
+       BCH_INODE_FIELD(bi_background_target,           16)     \
+       BCH_INODE_FIELD(bi_erasure_code,                16)
 
 #define BCH_INODE_FIELDS_INHERIT()                             \
        BCH_INODE_FIELD(bi_data_checksum)                       \
@@ -711,7 +728,8 @@ BKEY_VAL_TYPE(inode_generation,     BCH_INODE_GENERATION);
        BCH_INODE_FIELD(bi_data_replicas)                       \
        BCH_INODE_FIELD(bi_promote_target)                      \
        BCH_INODE_FIELD(bi_foreground_target)                   \
-       BCH_INODE_FIELD(bi_background_target)
+       BCH_INODE_FIELD(bi_background_target)                   \
+       BCH_INODE_FIELD(bi_erasure_code)
 
 enum {
        /*
@@ -871,6 +889,27 @@ struct bch_quota {
 } __attribute__((packed, aligned(8)));
 BKEY_VAL_TYPE(quota,   BCH_QUOTA);
 
+/* Erasure coding */
+
+enum {
+       BCH_STRIPE              = 128,
+};
+
+struct bch_stripe {
+       struct bch_val          v;
+       __le16                  sectors;
+       __u8                    algorithm;
+       __u8                    nr_blocks;
+       __u8                    nr_redundant;
+
+       __u8                    csum_granularity_bits;
+       __u8                    csum_type;
+       __u8                    pad;
+
+       struct bch_extent_ptr   ptrs[0];
+} __attribute__((packed, aligned(8)));
+BKEY_VAL_TYPE(stripe,  BCH_STRIPE);
+
 /* Optional/variable size superblock sections: */
 
 struct bch_sb_field {
@@ -1060,7 +1099,7 @@ struct bch_sb_field_quota {
 struct bch_disk_group {
        __u8                    label[BCH_SB_LABEL_SIZE];
        __le64                  flags[2];
-};
+} __attribute__((packed, aligned(8)));
 
 LE64_BITMASK(BCH_GROUP_DELETED,                struct bch_disk_group, flags[0], 0,  1)
 LE64_BITMASK(BCH_GROUP_DATA_ALLOWED,   struct bch_disk_group, flags[0], 1,  6)
@@ -1069,7 +1108,7 @@ LE64_BITMASK(BCH_GROUP_PARENT,            struct bch_disk_group, flags[0], 6, 24)
 struct bch_sb_field_disk_groups {
        struct bch_sb_field     field;
        struct bch_disk_group   entries[0];
-};
+} __attribute__((packed, aligned(8)));
 
 /*
  * On clean shutdown, store btree roots and current journal sequence number in
@@ -1235,12 +1274,15 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
                                        struct bch_sb, flags[2],  0,  4);
 LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES,  struct bch_sb, flags[2],  4, 64);
 
+LE64_BITMASK(BCH_SB_ERASURE_CODE,      struct bch_sb, flags[3],  0, 16);
+
 /* Features: */
 enum bch_sb_features {
        BCH_FEATURE_LZ4                 = 0,
        BCH_FEATURE_GZIP                = 1,
        BCH_FEATURE_ZSTD                = 2,
        BCH_FEATURE_ATOMIC_NLINK        = 3, /* should have gone under compat */
+       BCH_FEATURE_EC                  = 4,
        BCH_FEATURE_NR,
 };
 
@@ -1407,7 +1449,8 @@ LE32_BITMASK(JSET_BIG_ENDIAN,     struct jset, flags, 4, 5);
        DEF_BTREE_ID(DIRENTS,   2, "dirents")                   \
        DEF_BTREE_ID(XATTRS,    3, "xattrs")                    \
        DEF_BTREE_ID(ALLOC,     4, "alloc")                     \
-       DEF_BTREE_ID(QUOTAS,    5, "quotas")
+       DEF_BTREE_ID(QUOTAS,    5, "quotas")                    \
+       DEF_BTREE_ID(EC,        6, "erasure_coding")
 
 #define DEF_BTREE_ID(kwd, val, name) BTREE_ID_##kwd = val,
 
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index bd1d21b0e49b17f609757091526add4826e0b71d..28bf646c207d66e022967e052975a02d3c0373ce 100644
@@ -579,6 +579,8 @@ BKEY_VAL_ACCESSORS(alloc,           BCH_ALLOC);
 
 BKEY_VAL_ACCESSORS(quota,              BCH_QUOTA);
 
+BKEY_VAL_ACCESSORS(stripe,             BCH_STRIPE);
+
 /* byte order helpers */
 
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index 43bcbb08367d89eed3f3e502fbb86e10c31b2bc0..97d72d2bb89a395613429c4e2d5f445b1d9362d2 100644
@@ -4,6 +4,7 @@
 #include "btree_types.h"
 #include "alloc_background.h"
 #include "dirent.h"
+#include "ec.h"
 #include "error.h"
 #include "extents.h"
 #include "inode.h"
@@ -17,6 +18,7 @@ const struct bkey_ops bch2_bkey_ops[] = {
        [BKEY_TYPE_XATTRS]      = bch2_bkey_xattr_ops,
        [BKEY_TYPE_ALLOC]       = bch2_bkey_alloc_ops,
        [BKEY_TYPE_QUOTAS]      = bch2_bkey_quota_ops,
+       [BKEY_TYPE_EC]          = bch2_bkey_ec_ops,
        [BKEY_TYPE_BTREE]       = bch2_bkey_btree_ops,
 };
 
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 6b67da90bcea160f5e1f51db60a8483eb300146a..9fe438d0cbc76c0fc4db00b182f2f557ab00cdae 100644
@@ -14,6 +14,7 @@
 #include "buckets.h"
 #include "clock.h"
 #include "debug.h"
+#include "ec.h"
 #include "error.h"
 #include "extents.h"
 #include "journal.h"
@@ -113,6 +114,7 @@ static bool bkey_type_needs_gc(enum bkey_type type)
        switch (type) {
        case BKEY_TYPE_BTREE:
        case BKEY_TYPE_EXTENTS:
+       case BKEY_TYPE_EC:
                return true;
        default:
                return false;
@@ -153,6 +155,17 @@ static u8 ptr_gens_recalc_oldest(struct bch_fs *c,
                }
                }
                break;
+       case BKEY_TYPE_EC:
+               switch (k.k->type) {
+               case BCH_STRIPE: {
+                       struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+
+                       for (ptr = s.v->ptrs;
+                            ptr < s.v->ptrs + s.v->nr_blocks;
+                            ptr++)
+                               ptr_gen_recalc_oldest(c, ptr, &max_stale);
+               }
+               }
        default:
                break;
        }
@@ -214,6 +227,21 @@ static int ptr_gens_check(struct bch_fs *c, enum bkey_type type,
                }
                }
                break;
+       case BKEY_TYPE_EC:
+               switch (k.k->type) {
+               case BCH_STRIPE: {
+                       struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+
+                       for (ptr = s.v->ptrs;
+                            ptr < s.v->ptrs + s.v->nr_blocks;
+                            ptr++) {
+                               ret = ptr_gen_check(c, type, ptr);
+                               if (ret)
+                                       return ret;
+                       }
+               }
+               }
+               break;
        default:
                break;
        }
@@ -229,8 +257,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type,
 {
        struct gc_pos pos = { 0 };
        unsigned flags =
-               BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
-               BCH_BUCKET_MARK_GC_LOCK_HELD|
+               BCH_BUCKET_MARK_GC|
                (initial ? BCH_BUCKET_MARK_NOATOMIC : 0);
        int ret = 0;
 
@@ -359,15 +386,27 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
        return 0;
 }
 
+static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
+{
+       return  (int) btree_id_to_gc_phase(l) -
+               (int) btree_id_to_gc_phase(r);
+}
+
 static int bch2_gc_btrees(struct bch_fs *c, struct list_head *journal,
                          bool initial)
 {
+       enum btree_id ids[BTREE_ID_NR];
        unsigned i;
 
+       for (i = 0; i < BTREE_ID_NR; i++)
+               ids[i] = i;
+       bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
+
        for (i = 0; i < BTREE_ID_NR; i++) {
-               enum bkey_type type = bkey_type(0, i);
+               enum btree_id id = ids[i];
+               enum bkey_type type = bkey_type(0, id);
 
-               int ret = bch2_gc_btree(c, i, initial);
+               int ret = bch2_gc_btree(c, id, initial);
                if (ret)
                        return ret;
 
@@ -441,9 +480,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
                                      BCH_DATA_SB, flags);
        }
 
-       if (c)
-               spin_lock(&c->journal.lock);
-
        for (i = 0; i < ca->journal.nr; i++) {
                b = ca->journal.buckets[i];
                bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL,
@@ -453,7 +489,6 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
 
        if (c) {
                percpu_up_read_preempt_enable(&c->usage_lock);
-               spin_unlock(&c->journal.lock);
        } else {
                preempt_enable();
        }
@@ -468,9 +503,7 @@ static void bch2_mark_superblocks(struct bch_fs *c)
        gc_pos_set(c, gc_phase(GC_PHASE_SB));
 
        for_each_online_member(ca, c, i)
-               bch2_mark_dev_superblock(c, ca,
-                                        BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
-                                        BCH_BUCKET_MARK_GC_LOCK_HELD);
+               bch2_mark_dev_superblock(c, ca, BCH_BUCKET_MARK_GC);
        mutex_unlock(&c->sb_lock);
 }
 
@@ -478,7 +511,6 @@ static void bch2_mark_superblocks(struct bch_fs *c)
 static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
 {
        struct gc_pos pos = { 0 };
-       struct bch_fs_usage stats = { 0 };
        struct btree_update *as;
        struct pending_btree_node_free *d;
 
@@ -490,13 +522,8 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
                        bch2_mark_key(c, BKEY_TYPE_BTREE,
                                      bkey_i_to_s_c(&d->key),
                                      true, 0,
-                                     pos, &stats, 0,
-                                     BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
-                                     BCH_BUCKET_MARK_GC_LOCK_HELD);
-       /*
-        * Don't apply stats - pending deletes aren't tracked in
-        * bch_alloc_stats:
-        */
+                                     pos, NULL, 0,
+                                     BCH_BUCKET_MARK_GC);
 
        mutex_unlock(&c->btree_interior_update_lock);
 }
@@ -517,8 +544,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
                fifo_for_each_entry(i, &ca->free_inc, iter)
                        bch2_mark_alloc_bucket(c, ca, i, true,
                                               gc_pos_alloc(c, NULL),
-                                              BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
-                                              BCH_BUCKET_MARK_GC_LOCK_HELD);
+                                              BCH_BUCKET_MARK_GC);
 
 
 
@@ -526,8 +552,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
                        fifo_for_each_entry(i, &ca->free[j], iter)
                                bch2_mark_alloc_bucket(c, ca, i, true,
                                                       gc_pos_alloc(c, NULL),
-                                                      BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
-                                                      BCH_BUCKET_MARK_GC_LOCK_HELD);
+                                                      BCH_BUCKET_MARK_GC);
        }
 
        spin_unlock(&c->freelist_lock);
@@ -541,8 +566,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
                        ca = bch_dev_bkey_exists(c, ob->ptr.dev);
                        bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true,
                                               gc_pos_alloc(c, ob),
-                                              BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE|
-                                              BCH_BUCKET_MARK_GC_LOCK_HELD);
+                                              BCH_BUCKET_MARK_GC);
                }
                spin_unlock(&ob->lock);
        }
@@ -550,121 +574,310 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c)
        percpu_up_read_preempt_enable(&c->usage_lock);
 }
 
-static void bch2_gc_start(struct bch_fs *c)
+static void bch2_gc_free(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       unsigned i;
+
+       for_each_member_device(ca, c, i) {
+               kvpfree(rcu_dereference_protected(ca->buckets[1], 1),
+                       sizeof(struct bucket_array) +
+                       ca->mi.nbuckets * sizeof(struct bucket));
+               ca->buckets[1] = NULL;
+
+               free_percpu(ca->usage[1]);
+               ca->usage[1] = NULL;
+       }
+
+       free_percpu(c->usage[1]);
+       c->usage[1] = NULL;
+}
+
+static void bch2_gc_done_nocheck(struct bch_fs *c)
 {
        struct bch_dev *ca;
-       struct bucket_array *buckets;
-       struct bucket_mark new;
        unsigned i;
-       size_t b;
        int cpu;
 
-       percpu_down_write(&c->usage_lock);
+       for_each_member_device(ca, c, i) {
+               struct bucket_array *src = __bucket_array(ca, 1);
 
-       /*
-        * Indicates to buckets code that gc is now in progress - done under
-        * usage_lock to avoid racing with bch2_mark_key():
-        */
-       __gc_pos_set(c, gc_phase(GC_PHASE_START));
+               memcpy(__bucket_array(ca, 0), src,
+                      sizeof(struct bucket_array) +
+                      sizeof(struct bucket) * src->nbuckets);
+       };
 
-       /* Save a copy of the existing bucket stats while we recompute them: */
        for_each_member_device(ca, c, i) {
-               ca->usage_cached = __bch2_dev_usage_read(ca);
+               struct bch_dev_usage *p;
+
                for_each_possible_cpu(cpu) {
-                       struct bch_dev_usage *p =
-                               per_cpu_ptr(ca->usage_percpu, cpu);
+                       p = per_cpu_ptr(ca->usage[0], cpu);
                        memset(p, 0, sizeof(*p));
                }
+
+               preempt_disable();
+               *this_cpu_ptr(ca->usage[0]) = __bch2_dev_usage_read(ca, 1);
+               preempt_enable();
+       }
+
+       {
+               struct bch_fs_usage src = __bch2_fs_usage_read(c, 1);
+               struct bch_fs_usage *p;
+
+               for_each_possible_cpu(cpu) {
+                       p = per_cpu_ptr(c->usage[0], cpu);
+                       memset(p, 0, offsetof(typeof(*p), online_reserved));
+               }
+
+               preempt_disable();
+               memcpy(this_cpu_ptr(c->usage[0]),
+                      &src,
+                      offsetof(typeof(*p), online_reserved));
+               preempt_enable();
+       }
+
+}
+
+static void bch2_gc_done(struct bch_fs *c, bool initial)
+{
+       struct bch_dev *ca;
+       unsigned i;
+       int cpu;
+
+#define copy_field(_f, _msg, ...)                                      \
+       if (dst._f != src._f) {                                         \
+               pr_info(_msg ": got %llu, should be %llu, fixing"       \
+                       , ##__VA_ARGS__, dst._f, src._f);               \
+               dst._f = src._f;                                        \
+       }
+#define copy_bucket_field(_f)                                          \
+       if (dst->b[b].mark._f != src->b[b].mark._f) {                   \
+               pr_info("dev %u bucket %zu has wrong " #_f              \
+                       ": got %u, should be %u, fixing",               \
+                       i, b, dst->b[b].mark._f, src->b[b].mark._f);    \
+               dst->b[b]._mark._f = src->b[b].mark._f;                 \
+       }
+#define copy_dev_field(_f, _msg, ...)                                  \
+       copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
+#define copy_fs_field(_f, _msg, ...)                                   \
+       copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__)
+
+       percpu_down_write(&c->usage_lock);
+
+       if (initial) {
+               bch2_gc_done_nocheck(c);
+               goto out;
        }
 
-       c->usage_cached = __bch2_fs_usage_read(c);
-       for_each_possible_cpu(cpu) {
-               struct bch_fs_usage *p =
-                       per_cpu_ptr(c->usage_percpu, cpu);
+       for_each_member_device(ca, c, i) {
+               struct bucket_array *dst = __bucket_array(ca, 0);
+               struct bucket_array *src = __bucket_array(ca, 1);
+               size_t b;
+
+               if (initial) {
+                       memcpy(dst, src,
+                              sizeof(struct bucket_array) +
+                              sizeof(struct bucket) * dst->nbuckets);
+               }
 
-               memset(p->replicas, 0, sizeof(p->replicas));
-               memset(p->buckets, 0, sizeof(p->buckets));
+               for (b = 0; b < src->nbuckets; b++) {
+                       copy_bucket_field(gen);
+                       copy_bucket_field(data_type);
+                       copy_bucket_field(owned_by_allocator);
+                       copy_bucket_field(stripe);
+                       copy_bucket_field(dirty_sectors);
+                       copy_bucket_field(cached_sectors);
+               }
+       };
+
+       for_each_member_device(ca, c, i) {
+               struct bch_dev_usage dst = __bch2_dev_usage_read(ca, 0);
+               struct bch_dev_usage src = __bch2_dev_usage_read(ca, 1);
+               struct bch_dev_usage *p;
+               unsigned b;
+
+               for (b = 0; b < BCH_DATA_NR; b++)
+                       copy_dev_field(buckets[b],
+                                      "buckets[%s]", bch2_data_types[b]);
+               copy_dev_field(buckets_alloc, "buckets_alloc");
+               copy_dev_field(buckets_ec, "buckets_ec");
+
+               for (b = 0; b < BCH_DATA_NR; b++)
+                       copy_dev_field(sectors[b],
+                                      "sectors[%s]", bch2_data_types[b]);
+               copy_dev_field(sectors_fragmented,
+                              "sectors_fragmented");
+
+               for_each_possible_cpu(cpu) {
+                       p = per_cpu_ptr(ca->usage[0], cpu);
+                       memset(p, 0, sizeof(*p));
+               }
+
+               preempt_disable();
+               p = this_cpu_ptr(ca->usage[0]);
+               *p = dst;
+               preempt_enable();
        }
 
+       {
+               struct bch_fs_usage dst = __bch2_fs_usage_read(c, 0);
+               struct bch_fs_usage src = __bch2_fs_usage_read(c, 1);
+               struct bch_fs_usage *p;
+               unsigned r, b;
+
+               for (r = 0; r < BCH_REPLICAS_MAX; r++) {
+                       for (b = 0; b < BCH_DATA_NR; b++)
+                               copy_fs_field(replicas[r].data[b],
+                                             "replicas[%i].data[%s]",
+                                             r, bch2_data_types[b]);
+                       copy_fs_field(replicas[r].ec_data,
+                                     "replicas[%i].ec_data", r);
+                       copy_fs_field(replicas[r].persistent_reserved,
+                                     "replicas[%i].persistent_reserved", r);
+               }
+
+               for (b = 0; b < BCH_DATA_NR; b++)
+                       copy_fs_field(buckets[b],
+                                     "buckets[%s]", bch2_data_types[b]);
+
+               for_each_possible_cpu(cpu) {
+                       p = per_cpu_ptr(c->usage[0], cpu);
+                       memset(p, 0, offsetof(typeof(*p), online_reserved));
+               }
+
+               preempt_disable();
+               p = this_cpu_ptr(c->usage[0]);
+               memcpy(p, &dst, offsetof(typeof(*p), online_reserved));
+               preempt_enable();
+       }
+out:
        percpu_up_write(&c->usage_lock);
 
-       /* Clear bucket marks: */
+#undef copy_field
+#undef copy_fs_field
+#undef copy_dev_field
+#undef copy_bucket_field
+}
+
+static int bch2_gc_start(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       unsigned i;
+
+       BUG_ON(c->usage[1]);
+
+       c->usage[1] = alloc_percpu(struct bch_fs_usage);
+       if (!c->usage[1])
+               return -ENOMEM;
+
        for_each_member_device(ca, c, i) {
-               down_read(&ca->bucket_lock);
-               buckets = bucket_array(ca);
-
-               for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
-                       bucket_cmpxchg(buckets->b + b, new, ({
-                               new.owned_by_allocator  = 0;
-                               new.data_type           = 0;
-                               new.cached_sectors      = 0;
-                               new.dirty_sectors       = 0;
-                       }));
-                       ca->oldest_gens[b] = new.gen;
+               BUG_ON(ca->buckets[1]);
+               BUG_ON(ca->usage[1]);
+
+               ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
+                               ca->mi.nbuckets * sizeof(struct bucket),
+                               GFP_KERNEL|__GFP_ZERO);
+               if (!ca->buckets[1]) {
+                       percpu_ref_put(&ca->ref);
+                       return -ENOMEM;
+               }
+
+               ca->usage[1] = alloc_percpu(struct bch_dev_usage);
+               if (!ca->usage[1]) {
+                       percpu_ref_put(&ca->ref);
+                       return -ENOMEM;
                }
-               up_read(&ca->bucket_lock);
        }
+
+       percpu_down_write(&c->usage_lock);
+
+       for_each_member_device(ca, c, i) {
+               struct bucket_array *dst = __bucket_array(ca, 1);
+               struct bucket_array *src = __bucket_array(ca, 0);
+               size_t b;
+
+               dst->first_bucket       = src->first_bucket;
+               dst->nbuckets           = src->nbuckets;
+
+               for (b = 0; b < src->nbuckets; b++)
+                       dst->b[b]._mark.gen = src->b[b].mark.gen;
+       };
+
+       percpu_up_write(&c->usage_lock);
+
+       return 0;
 }
 
 /**
- * bch_gc - recompute bucket marks and oldest_gen, rewrite btree nodes
+ * bch2_gc - walk _all_ references to buckets, and recompute them:
+ *
+ * Order matters here:
+ *  - Concurrent GC relies on the fact that we have a total ordering for
+ *    everything that GC walks - see  gc_will_visit_node(),
+ *    gc_will_visit_root()
+ *
+ *  - also, references move around in the course of index updates and
+ *    various other crap: everything needs to agree on the ordering
+ *    references are allowed to move around in - e.g., we're allowed to
+ *    start with a reference owned by an open_bucket (the allocator) and
+ *    move it to the btree, but not the reverse.
+ *
+ *    This is necessary to ensure that gc doesn't miss references that
+ *    move around - if references move backwards in the ordering GC
+ *    uses, GC could skip past them
  */
-void bch2_gc(struct bch_fs *c)
+int bch2_gc(struct bch_fs *c, struct list_head *journal, bool initial)
 {
        struct bch_dev *ca;
        u64 start_time = local_clock();
-       unsigned i;
+       unsigned i, iter = 0;
        int ret;
 
-       /*
-        * Walk _all_ references to buckets, and recompute them:
-        *
-        * Order matters here:
-        *  - Concurrent GC relies on the fact that we have a total ordering for
-        *    everything that GC walks - see  gc_will_visit_node(),
-        *    gc_will_visit_root()
-        *
-        *  - also, references move around in the course of index updates and
-        *    various other crap: everything needs to agree on the ordering
-        *    references are allowed to move around in - e.g., we're allowed to
-        *    start with a reference owned by an open_bucket (the allocator) and
-        *    move it to the btree, but not the reverse.
-        *
-        *    This is necessary to ensure that gc doesn't miss references that
-        *    move around - if references move backwards in the ordering GC
-        *    uses, GC could skip past them
-        */
        trace_gc_start(c);
 
-       /*
-        * Do this before taking gc_lock - bch2_disk_reservation_get() blocks on
-        * gc_lock if sectors_available goes to 0:
-        */
-       bch2_recalc_sectors_available(c);
-
        down_write(&c->gc_lock);
-       if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
+again:
+       ret = bch2_gc_start(c);
+       if (ret)
                goto out;
 
-       bch2_gc_start(c);
-
        bch2_mark_superblocks(c);
 
-       ret = bch2_gc_btrees(c, NULL, false);
-       if (ret) {
-               bch_err(c, "btree gc failed: %d", ret);
-               set_bit(BCH_FS_GC_FAILURE, &c->flags);
+       ret = bch2_gc_btrees(c, journal, initial);
+       if (ret)
                goto out;
-       }
 
        bch2_mark_pending_btree_node_frees(c);
        bch2_mark_allocator_buckets(c);
 
-       /* Indicates that gc is no longer in progress: */
-       gc_pos_set(c, gc_phase(GC_PHASE_DONE));
        c->gc_count++;
 out:
+       if (!ret && test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
+               /*
+                * XXX: make sure gens we fixed got saved
+                */
+               if (iter++ <= 2) {
+                       bch_info(c, "Fixed gens, restarting mark and sweep:");
+                       clear_bit(BCH_FS_FIXED_GENS, &c->flags);
+                       goto again;
+               }
+
+               bch_info(c, "Unable to fix bucket gens, looping");
+               ret = -EINVAL;
+       }
+
+       if (!ret)
+               bch2_gc_done(c, initial);
+
+       /* Indicates that gc is no longer in progress: */
+       __gc_pos_set(c, gc_phase(GC_PHASE_START));
+
+       bch2_gc_free(c);
        up_write(&c->gc_lock);
+
+       if (!ret && initial)
+               set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
+
        trace_gc_end(c);
        bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
 
@@ -680,6 +893,7 @@ out:
         * allocator thread - issue wakeup in case they blocked on gc_lock:
         */
        closure_wake_up(&c->freelist_wait);
+       return ret;
 }
 
 /* Btree coalescing */
@@ -995,9 +1209,6 @@ void bch2_coalesce(struct bch_fs *c)
 {
        enum btree_id id;
 
-       if (test_bit(BCH_FS_GC_FAILURE, &c->flags))
-               return;
-
        down_read(&c->gc_lock);
        trace_gc_coalesce_start(c);
 
@@ -1009,7 +1220,6 @@ void bch2_coalesce(struct bch_fs *c)
                if (ret) {
                        if (ret != -ESHUTDOWN)
                                bch_err(c, "btree coalescing failed: %d", ret);
-                       set_bit(BCH_FS_GC_FAILURE, &c->flags);
                        return;
                }
        }
@@ -1024,6 +1234,7 @@ static int bch2_gc_thread(void *arg)
        struct io_clock *clock = &c->io_clock[WRITE];
        unsigned long last = atomic_long_read(&clock->now);
        unsigned last_kick = atomic_read(&c->kick_gc);
+       int ret;
 
        set_freezable();
 
@@ -1057,7 +1268,9 @@ static int bch2_gc_thread(void *arg)
                last = atomic_long_read(&clock->now);
                last_kick = atomic_read(&c->kick_gc);
 
-               bch2_gc(c);
+               ret = bch2_gc(c, NULL, false);
+               if (ret)
+                       bch_err(c, "btree gc failed: %i", ret);
 
                debug_check_no_locks_held();
        }
@@ -1098,30 +1311,7 @@ int bch2_gc_thread_start(struct bch_fs *c)
 
 int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
 {
-       unsigned iter = 0;
-       int ret = 0;
-
-       down_write(&c->gc_lock);
-again:
-       bch2_gc_start(c);
-
-       bch2_mark_superblocks(c);
-
-       ret = bch2_gc_btrees(c, journal, true);
-       if (ret)
-               goto err;
-
-       if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
-               if (iter++ > 2) {
-                       bch_info(c, "Unable to fix bucket gens, looping");
-                       ret = -EINVAL;
-                       goto err;
-               }
-
-               bch_info(c, "Fixed gens, restarting initial mark and sweep:");
-               clear_bit(BCH_FS_FIXED_GENS, &c->flags);
-               goto again;
-       }
+       int ret = bch2_gc(c, journal, true);
 
        /*
         * Skip past versions that might have possibly been used (as nonces),
@@ -1130,9 +1320,5 @@ again:
        if (c->sb.encryption_type)
                atomic64_add(1 << 16, &c->key_version);
 
-       gc_pos_set(c, gc_phase(GC_PHASE_DONE));
-       set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
-err:
-       up_write(&c->gc_lock);
        return ret;
 }
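
The bulk of the btree_gc.c changes above switch GC from clobbering the live bucket marks in place to recomputing everything into a second copy (buckets[1], usage[1]) and then having bch2_gc_done() walk both copies, logging and repairing any field that disagrees via the copy_*_field macros. A minimal userspace sketch of that compare-and-fix pass, using invented bucket fields rather than the real bcachefs types:

#include <stdio.h>

/* simplified stand-ins for the real bcachefs types */
struct bucket_mark { unsigned gen, dirty_sectors, cached_sectors; };
struct bucket      { struct bucket_mark mark; };

#define copy_bucket_field(_f)                                           \
	if (dst[b].mark._f != src[b].mark._f) {                         \
		printf("bucket %zu has wrong " #_f                      \
		       ": got %u, should be %u, fixing\n",              \
		       b, dst[b].mark._f, src[b].mark._f);              \
		dst[b].mark._f = src[b].mark._f;                        \
	}

/* dst = live marks, src = marks recomputed by gc */
static void gc_done(struct bucket *dst, const struct bucket *src, size_t nbuckets)
{
	for (size_t b = 0; b < nbuckets; b++) {
		copy_bucket_field(gen);
		copy_bucket_field(dirty_sectors);
		copy_bucket_field(cached_sectors);
	}
}

int main(void)
{
	struct bucket live[2] = { { { 4, 100, 0 } }, { { 7, 0, 8 } } };
	struct bucket shadow[2] = { { { 4, 100, 0 } }, { { 8, 0, 0 } } };

	gc_done(live, shadow, 2);	/* reports and repairs bucket 1 */
	return 0;
}

The same pattern is applied to the per-device and filesystem-wide usage counters in the hunk above.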
index 101a6a890bad59eeba2846b9fd3c7dfd90074b6c..d7809c2e7507c794b3b07104c9c440bc42eb2dfe 100644 (file)
@@ -6,7 +6,7 @@
 enum bkey_type;
 
 void bch2_coalesce(struct bch_fs *);
-void bch2_gc(struct bch_fs *);
+int bch2_gc(struct bch_fs *, struct list_head *, bool);
 void bch2_gc_thread_stop(struct bch_fs *);
 int bch2_gc_thread_start(struct bch_fs *);
 int bch2_initial_gc(struct bch_fs *, struct list_head *);
@@ -54,11 +54,22 @@ static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
        return 0;
 }
 
+static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id)
+{
+       switch (id) {
+#define DEF_BTREE_ID(n, v, s) case BTREE_ID_##n: return GC_PHASE_BTREE_##n;
+       DEFINE_BCH_BTREE_IDS()
+#undef DEF_BTREE_ID
+       default:
+               BUG();
+       }
+}
+
 static inline struct gc_pos gc_pos_btree(enum btree_id id,
                                         struct bpos pos, unsigned level)
 {
        return (struct gc_pos) {
-               .phase  = GC_PHASE_BTREE_EXTENTS + id,
+               .phase  = btree_id_to_gc_phase(id),
                .pos    = pos,
                .level  = level,
        };
@@ -93,14 +104,14 @@ static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *o
        };
 }
 
-static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos)
+static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
 {
        unsigned seq;
        bool ret;
 
        do {
                seq = read_seqcount_begin(&c->gc_pos_lock);
-               ret = gc_pos_cmp(c->gc_pos, pos) < 0;
+               ret = gc_pos_cmp(pos, c->gc_pos) <= 0;
        } while (read_seqcount_retry(&c->gc_pos_lock, seq));
 
        return ret;
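
The header change flips the helper from gc_will_visit() to gc_visited(): an update is mirrored into GC's shadow counters only when GC's current position has already passed the position being marked, instead of being skipped when GC was still going to get there. A rough sketch of that decision, with a made-up gc_pos type and plain longs standing in for the per-cpu usage counters:

#include <stdbool.h>
#include <stdio.h>

/* invented stand-in: a gc position is just a phase number here */
struct gc_pos { int phase; };

static int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
{
	return (l.phase > r.phase) - (l.phase < r.phase);
}

/* usage[0] = live stats, usage[1] = stats being rebuilt by gc */
static long usage[2];

static bool gc_visited(struct gc_pos gc_cur, struct gc_pos pos)
{
	/* gc has already walked pos if pos <= current gc position */
	return gc_pos_cmp(pos, gc_cur) <= 0;
}

static void mark(struct gc_pos gc_cur, struct gc_pos pos, long sectors, bool gc_flag)
{
	if (!gc_flag)
		usage[0] += sectors;		/* always keep the live copy current */

	if (gc_flag || gc_visited(gc_cur, pos))
		usage[1] += sectors;		/* keep gc's copy in sync behind its position */
}

int main(void)
{
	struct gc_pos cur = { 5 };

	mark(cur, (struct gc_pos){ 3 }, 16, false);	/* behind gc: both copies */
	mark(cur, (struct gc_pos){ 9 }, 16, false);	/* ahead of gc: live copy only */
	printf("live=%ld gc=%ld\n", usage[0], usage[1]);
	return 0;
}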
index 1eae181dc9ff436a70cdbc97cc7c798a17b84573..ae1d4f85a16e5801b0e0e8feb996988d9c18b659 100644 (file)
@@ -817,7 +817,7 @@ static inline int btree_iter_lock_root(struct btree_iter *iter,
                         */
                        iter->level = depth_want;
                        iter->l[iter->level].b = NULL;
-                       return 0;
+                       return 1;
                }
 
                lock_type = __btree_lock_want(iter, iter->level);
@@ -1044,6 +1044,9 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
                        ? btree_iter_down(iter)
                        : btree_iter_lock_root(iter, depth_want);
                if (unlikely(ret)) {
+                       if (ret == 1)
+                               return 0;
+
                        iter->level = depth_want;
                        iter->l[iter->level].b = BTREE_ITER_NOT_END;
                        return ret;
index 4d34bdca317fd5ab542e43e58d0732fae334e729..537b8da74ece37cd0cad84a5315d1296c8e71acc 100644 (file)
@@ -159,7 +159,6 @@ static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b,
 {
        struct bch_fs *c = as->c;
        struct pending_btree_node_free *d;
-       unsigned replicas;
 
        /*
         * btree_update lock is only needed here to avoid racing with
@@ -177,15 +176,6 @@ found:
        BUG_ON(d->index_update_done);
        d->index_update_done = true;
 
-       /*
-        * Btree nodes are accounted as freed in bch_alloc_stats when they're
-        * freed from the index:
-        */
-       replicas = bch2_extent_nr_dirty_ptrs(k);
-       if (replicas)
-               stats->replicas[replicas - 1].data[BCH_DATA_BTREE] -=
-                       c->opts.btree_node_size * replicas;
-
        /*
         * We're dropping @k from the btree, but it's still live until the
         * index update is persistent so we need to keep a reference around for
@@ -207,15 +197,16 @@ found:
         * bch2_mark_key() compares the current gc pos to the pos we're
         * moving this reference from, hence one comparison here:
         */
-       if (gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
-               struct bch_fs_usage tmp = { 0 };
+       if (gc_pos_cmp(c->gc_pos, b
+                      ? gc_pos_btree_node(b)
+                      : gc_pos_btree_root(as->btree_id)) >= 0 &&
+           gc_pos_cmp(c->gc_pos, gc_phase(GC_PHASE_PENDING_DELETE)) < 0) {
+               struct gc_pos pos = { 0 };
 
                bch2_mark_key(c, BKEY_TYPE_BTREE,
                              bkey_i_to_s_c(&d->key),
-                             false, 0, b
-                             ? gc_pos_btree_node(b)
-                             : gc_pos_btree_root(as->btree_id),
-                             &tmp, 0, 0);
+                             false, 0, pos,
+                             NULL, 0, BCH_BUCKET_MARK_GC);
                /*
                 * Don't apply tmp - pending deletes aren't tracked in
                 * bch_alloc_stats:
@@ -286,19 +277,13 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
 static void bch2_btree_node_free_ondisk(struct bch_fs *c,
                                        struct pending_btree_node_free *pending)
 {
-       struct bch_fs_usage stats = { 0 };
-
        BUG_ON(!pending->index_update_done);
 
        bch2_mark_key(c, BKEY_TYPE_BTREE,
                      bkey_i_to_s_c(&pending->key),
                      false, 0,
                      gc_phase(GC_PHASE_PENDING_DELETE),
-                     &stats, 0, 0);
-       /*
-        * Don't apply stats - pending deletes aren't tracked in
-        * bch_alloc_stats:
-        */
+                     NULL, 0, 0);
 }
 
 static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
@@ -339,7 +324,7 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
        mutex_unlock(&c->btree_reserve_cache_lock);
 
 retry:
-       wp = bch2_alloc_sectors_start(c, c->opts.foreground_target,
+       wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0,
                                      writepoint_ptr(&c->btree_write_point),
                                      &devs_have,
                                      res->nr_replicas,
@@ -637,12 +622,12 @@ static void btree_update_wait_on_journal(struct closure *cl)
        int ret;
 
        ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
-       if (ret < 0)
-               goto err;
-       if (!ret) {
+       if (ret == -EAGAIN) {
                continue_at(cl, btree_update_wait_on_journal, system_wq);
                return;
        }
+       if (ret < 0)
+               goto err;
 
        bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
 err:
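
In bch2_btree_node_free_index() above, the dropped key is now remarked with BCH_BUCKET_MARK_GC only when the current GC position has already passed the node (or root) the key lived in but has not yet reached GC_PHASE_PENDING_DELETE; outside that window GC either still sees the key in the btree or no longer cares. A toy version of the window test, with phases reduced to plain integers:

#include <stdbool.h>
#include <stdio.h>

/* illustrative phase numbers only; the real gc_pos also carries bpos + level */
enum { PHASE_BTREE_NODE = 10, PHASE_PENDING_DELETE = 20 };

static bool must_mark_for_gc(int gc_phase, int node_phase)
{
	/*
	 * GC already walked the node the key is being removed from,
	 * but hasn't reached the pending-delete phase yet - so if we
	 * don't count the key into GC's copy now, GC will miss it.
	 */
	return gc_phase >= node_phase &&
	       gc_phase <  PHASE_PENDING_DELETE;
}

int main(void)
{
	printf("%d\n", must_mark_for_gc(12, PHASE_BTREE_NODE));	/* 1: in the window */
	printf("%d\n", must_mark_for_gc(5,  PHASE_BTREE_NODE));	/* 0: gc will still visit the node */
	printf("%d\n", must_mark_for_gc(25, PHASE_BTREE_NODE));	/* 0: gc already past pending deletes */
	return 0;
}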
index 288d7ca66947baf450153d363e2435a8f363f36c..e8d6e07825bf583048bf924cc80027f2e5e80cc1 100644 (file)
@@ -343,19 +343,40 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
        trans_for_each_entry(trans, i)
                BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
 
-       u64s = 0;
-       trans_for_each_entry(trans, i)
-               u64s += jset_u64s(i->k->k.u64s);
-
        memset(&trans->journal_res, 0, sizeof(trans->journal_res));
 
-       ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)
-               ? bch2_journal_res_get(&c->journal,
-                                     &trans->journal_res,
-                                     u64s, u64s)
-               : 0;
-       if (ret)
-               return ret;
+       if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+               u64s = 0;
+               trans_for_each_entry(trans, i)
+                       u64s += jset_u64s(i->k->k.u64s);
+
+               while ((ret = bch2_journal_res_get(&c->journal,
+                                       &trans->journal_res, u64s,
+                                       JOURNAL_RES_GET_NONBLOCK)) == -EAGAIN) {
+                       struct btree_iter *iter = trans->entries[0].iter;
+                       struct closure cl;
+
+                       bch2_btree_iter_unlock(iter);
+
+                       closure_init_stack(&cl);
+
+                       while ((ret = bch2_journal_open_seq_async(&c->journal,
+                                                       trans->journal_res.seq,
+                                                       &cl)) == -EAGAIN)
+                               closure_sync(&cl);
+
+                       if (ret)
+                               return ret;
+
+                       if (!bch2_btree_iter_relock(iter)) {
+                               trans_restart(" (iter relock after journal res get blocked)");
+                               return -EINTR;
+                       }
+               }
+
+               if (ret)
+                       return ret;
+       }
 
        multi_lock_write(c, trans);
 
index 86d57f3b4ac8807e88478edd9645b994f0860a1c..603776303ac5d5c68485ed70c27aecc2b8f3a0d0 100644 (file)
@@ -68,6 +68,7 @@
 #include "btree_gc.h"
 #include "btree_update.h"
 #include "buckets.h"
+#include "ec.h"
 #include "error.h"
 #include "movinggc.h"
 
@@ -83,8 +84,7 @@ static inline u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
 
 static void bch2_fs_stats_verify(struct bch_fs *c)
 {
-       struct bch_fs_usage stats =
-               __bch2_fs_usage_read(c);
+       struct bch_fs_usage stats = __bch2_fs_usage_read(c, 0);
        unsigned i, j;
 
        for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) {
@@ -207,43 +207,24 @@ do {                                                                      \
        _acc;                                                           \
 })
 
-#define bch2_usage_read_cached(_c, _cached, _uncached)                 \
-({                                                                     \
-       typeof(_cached) _ret;                                           \
-       unsigned _seq;                                                  \
-                                                                       \
-       do {                                                            \
-               _seq = read_seqcount_begin(&(_c)->gc_pos_lock);         \
-               _ret = (_c)->gc_pos.phase == GC_PHASE_DONE              \
-                       ? bch2_usage_read_raw(_uncached)                        \
-                       : (_cached);                                    \
-       } while (read_seqcount_retry(&(_c)->gc_pos_lock, _seq));        \
-                                                                       \
-       _ret;                                                           \
-})
-
-struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca)
+struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca, bool gc)
 {
-       return bch2_usage_read_raw(ca->usage_percpu);
+       return bch2_usage_read_raw(ca->usage[gc]);
 }
 
 struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
 {
-       return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu);
+       return bch2_usage_read_raw(ca->usage[0]);
 }
 
-struct bch_fs_usage
-__bch2_fs_usage_read(struct bch_fs *c)
+struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *c, bool gc)
 {
-       return bch2_usage_read_raw(c->usage_percpu);
+       return bch2_usage_read_raw(c->usage[gc]);
 }
 
-struct bch_fs_usage
-bch2_fs_usage_read(struct bch_fs *c)
+struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *c)
 {
-       return bch2_usage_read_cached(c,
-                                    c->usage_cached,
-                                    c->usage_percpu);
+       return bch2_usage_read_raw(c->usage[0]);
 }
 
 struct fs_usage_sum {
@@ -269,6 +250,7 @@ static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats)
        for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) {
                sum.data        += stats.replicas[i].data[BCH_DATA_BTREE];
                sum.data        += stats.replicas[i].data[BCH_DATA_USER];
+               sum.data        += stats.replicas[i].ec_data;
                sum.cached      += stats.replicas[i].data[BCH_DATA_CACHED];
                sum.reserved    += stats.replicas[i].persistent_reserved;
        }
@@ -324,13 +306,11 @@ static inline enum bch_data_type bucket_type(struct bucket_mark m)
                : m.data_type;
 }
 
-static bool bucket_became_unavailable(struct bch_fs *c,
-                                     struct bucket_mark old,
+static bool bucket_became_unavailable(struct bucket_mark old,
                                      struct bucket_mark new)
 {
        return is_available_bucket(old) &&
-              !is_available_bucket(new) &&
-              (!c || c->gc_pos.phase == GC_PHASE_DONE);
+              !is_available_bucket(new);
 }
 
 void bch2_fs_usage_apply(struct bch_fs *c,
@@ -360,12 +340,14 @@ void bch2_fs_usage_apply(struct bch_fs *c,
 
        percpu_down_read_preempt_disable(&c->usage_lock);
        /* online_reserved not subject to gc: */
-       this_cpu_ptr(c->usage_percpu)->online_reserved +=
+       this_cpu_ptr(c->usage[0])->online_reserved +=
                stats->online_reserved;
        stats->online_reserved = 0;
 
-       if (!gc_will_visit(c, gc_pos))
-               bch2_usage_add(this_cpu_ptr(c->usage_percpu), stats);
+       bch2_usage_add(this_cpu_ptr(c->usage[0]), stats);
+
+       if (gc_visited(c, gc_pos))
+               bch2_usage_add(this_cpu_ptr(c->usage[1]), stats);
 
        bch2_fs_stats_verify(c);
        percpu_up_read_preempt_enable(&c->usage_lock);
@@ -374,8 +356,9 @@ void bch2_fs_usage_apply(struct bch_fs *c,
 }
 
 static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
-                                 struct bch_fs_usage *stats,
-                                 struct bucket_mark old, struct bucket_mark new)
+                                 struct bch_fs_usage *fs_usage,
+                                 struct bucket_mark old, struct bucket_mark new,
+                                 bool gc)
 {
        struct bch_dev_usage *dev_usage;
 
@@ -387,16 +370,22 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
                bch2_data_types[old.data_type],
                bch2_data_types[new.data_type]);
 
-       stats->buckets[bucket_type(old)] -= ca->mi.bucket_size;
-       stats->buckets[bucket_type(new)] += ca->mi.bucket_size;
-
-       dev_usage = this_cpu_ptr(ca->usage_percpu);
+       dev_usage = this_cpu_ptr(ca->usage[gc]);
 
-       dev_usage->buckets[bucket_type(old)]--;
-       dev_usage->buckets[bucket_type(new)]++;
+       if (bucket_type(old) != bucket_type(new)) {
+               if (bucket_type(old)) {
+                       fs_usage->buckets[bucket_type(old)] -= ca->mi.bucket_size;
+                       dev_usage->buckets[bucket_type(old)]--;
+               } else {
+                       fs_usage->buckets[bucket_type(new)] += ca->mi.bucket_size;
+                       dev_usage->buckets[bucket_type(new)]++;
+               }
+       }
 
        dev_usage->buckets_alloc +=
                (int) new.owned_by_allocator - (int) old.owned_by_allocator;
+       dev_usage->buckets_ec +=
+               (int) new.stripe - (int) old.stripe;
        dev_usage->buckets_unavailable +=
                is_unavailable_bucket(new) - is_unavailable_bucket(old);
 
@@ -417,21 +406,18 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
 ({                                                             \
        struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \
                                                                \
-       bch2_dev_usage_update(c, ca, stats, _old, new);         \
+       bch2_dev_usage_update(c, ca, stats, _old, new, gc);     \
        _old;                                                   \
 })
 
-void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
-                           size_t b, struct bucket_mark *old)
+static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
+                                    size_t b, struct bucket_mark *old,
+                                    bool gc)
 {
-       struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu);
-       struct bucket *g;
+       struct bch_fs_usage *stats = this_cpu_ptr(c->usage[gc]);
+       struct bucket *g = __bucket(ca, b, gc);
        struct bucket_mark new;
 
-       percpu_rwsem_assert_held(&c->usage_lock);
-
-       g = bucket(ca, b);
-
        *old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
                BUG_ON(!is_available_bucket(new));
 
@@ -442,38 +428,49 @@ void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
                new.gen++;
        }));
 
-       /*
-        * This isn't actually correct yet, since fs usage is still
-        * uncompressed sectors:
-        */
        stats->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors;
+}
+
+void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
+                           size_t b, struct bucket_mark *old)
+{
+       percpu_rwsem_assert_held(&c->usage_lock);
+
+       __bch2_invalidate_bucket(c, ca, b, old, false);
 
        if (!old->owned_by_allocator && old->cached_sectors)
                trace_invalidate(ca, bucket_to_sector(ca, b),
                                 old->cached_sectors);
 }
 
-void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
-                           size_t b, bool owned_by_allocator,
-                           struct gc_pos pos, unsigned flags)
+static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+                                    size_t b, bool owned_by_allocator,
+                                    bool gc)
 {
-       struct bch_fs_usage *stats = this_cpu_ptr(c->usage_percpu);
-       struct bucket *g;
+       struct bch_fs_usage *stats = this_cpu_ptr(c->usage[gc]);
+       struct bucket *g = __bucket(ca, b, gc);
        struct bucket_mark old, new;
 
-       percpu_rwsem_assert_held(&c->usage_lock);
-       g = bucket(ca, b);
-
-       if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
-           gc_will_visit(c, pos))
-               return;
-
        old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
                new.owned_by_allocator  = owned_by_allocator;
        }));
 
-       BUG_ON(!owned_by_allocator && !old.owned_by_allocator &&
-              c->gc_pos.phase == GC_PHASE_DONE);
+       BUG_ON(!gc &&
+              !owned_by_allocator && !old.owned_by_allocator);
+}
+
+void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+                           size_t b, bool owned_by_allocator,
+                           struct gc_pos pos, unsigned flags)
+{
+       percpu_rwsem_assert_held(&c->usage_lock);
+
+       if (!(flags & BCH_BUCKET_MARK_GC))
+               __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, false);
+
+       if ((flags & BCH_BUCKET_MARK_GC) ||
+           gc_visited(c, pos))
+               __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, true);
 }
 
 #define checked_add(a, b)                                      \
@@ -483,35 +480,47 @@ do {                                                              \
        BUG_ON((a) != _res);                                    \
 } while (0)
 
+static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+                                       size_t b, enum bch_data_type type,
+                                       unsigned sectors, bool gc)
+{
+       struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
+       struct bucket *g = __bucket(ca, b, gc);
+       struct bucket_mark old, new;
+
+       BUG_ON(type != BCH_DATA_SB &&
+              type != BCH_DATA_JOURNAL);
+
+       old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
+               new.data_type   = type;
+               checked_add(new.dirty_sectors, sectors);
+       }));
+
+       fs_usage->replicas[0].data[type] += sectors;
+}
+
 void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
                               size_t b, enum bch_data_type type,
                               unsigned sectors, struct gc_pos pos,
                               unsigned flags)
 {
-       struct bch_fs_usage *stats;
-       struct bucket *g;
-       struct bucket_mark old, new;
-
        BUG_ON(type != BCH_DATA_SB &&
               type != BCH_DATA_JOURNAL);
 
        if (likely(c)) {
                percpu_rwsem_assert_held(&c->usage_lock);
 
-               if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
-                   gc_will_visit(c, pos))
-                       return;
-
-               stats = this_cpu_ptr(c->usage_percpu);
-
-               g = bucket(ca, b);
-               old = bucket_data_cmpxchg(c, ca, stats, g, new, ({
-                       new.data_type = type;
-                       checked_add(new.dirty_sectors, sectors);
-               }));
-
-               stats->replicas[0].data[type] += sectors;
+               if (!(flags & BCH_BUCKET_MARK_GC))
+                       __bch2_mark_metadata_bucket(c, ca, b, type, sectors,
+                                                   false);
+               if ((flags & BCH_BUCKET_MARK_GC) ||
+                   gc_visited(c, pos))
+                       __bch2_mark_metadata_bucket(c, ca, b, type, sectors,
+                                                   true);
        } else {
+               struct bucket *g;
+               struct bucket_mark old, new;
+
                rcu_read_lock();
 
                g = bucket(ca, b);
@@ -522,9 +531,6 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
 
                rcu_read_unlock();
        }
-
-       BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
-              bucket_became_unavailable(c, old, new));
 }
 
 static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors)
@@ -569,23 +575,15 @@ static void bch2_mark_pointer(struct bch_fs *c,
                              struct extent_ptr_decoded p,
                              s64 sectors, enum bch_data_type data_type,
                              struct bch_fs_usage *fs_usage,
-                             u64 journal_seq, unsigned flags)
+                             u64 journal_seq, unsigned flags,
+                             bool gc)
 {
        struct bucket_mark old, new;
        struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
-       struct bucket *g = PTR_BUCKET(ca, &p.ptr);
+       size_t b = PTR_BUCKET_NR(ca, &p.ptr);
+       struct bucket *g = __bucket(ca, b, gc);
        u64 v;
 
-       if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) {
-               if (journal_seq)
-                       bucket_cmpxchg(g, new, ({
-                               new.journal_seq_valid   = 1;
-                               new.journal_seq         = journal_seq;
-                       }));
-
-               return;
-       }
-
        v = atomic64_read(&g->_mark.v);
        do {
                new.v.counter = old.v.counter = v;
@@ -627,17 +625,59 @@ static void bch2_mark_pointer(struct bch_fs *c,
                              old.v.counter,
                              new.v.counter)) != old.v.counter);
 
-       bch2_dev_usage_update(c, ca, fs_usage, old, new);
+       bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+
+       BUG_ON(!gc && bucket_became_unavailable(old, new));
+}
+
+static void bch2_mark_stripe_ptr(struct bch_fs *c,
+                                struct bch_extent_stripe_ptr p,
+                                s64 sectors, unsigned flags,
+                                s64 *adjusted_disk_sectors,
+                                unsigned *redundancy)
+{
+       struct ec_stripe *m;
+       unsigned old, new, nr_data;
+       int blocks_nonempty_delta;
+       s64 parity_sectors;
+
+       m = genradix_ptr(&c->ec_stripes, p.idx);
+       if (WARN_ON(!m))
+               return;
+
+       if (WARN_ON(!m->alive))
+               return;
+
+       nr_data = m->nr_blocks - m->nr_redundant;
+
+       parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data);
+
+       if (sectors < 0)
+               parity_sectors = -parity_sectors;
+
+       *adjusted_disk_sectors += parity_sectors;
+
+       *redundancy = max_t(unsigned, *redundancy, m->nr_redundant + 1);
+
+       new = atomic_add_return(sectors, &m->block_sectors[p.block]);
+       old = new - sectors;
+
+       blocks_nonempty_delta = (int) !!new - (int) !!old;
+       if (!blocks_nonempty_delta)
+               return;
+
+       atomic_add(blocks_nonempty_delta, &m->blocks_nonempty);
+
+       BUG_ON(atomic_read(&m->blocks_nonempty) < 0);
 
-       BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) &&
-              bucket_became_unavailable(c, old, new));
+       bch2_stripes_heap_update(c, m, p.idx);
 }
 
 static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
                             s64 sectors, enum bch_data_type data_type,
-                            struct gc_pos pos,
                             struct bch_fs_usage *stats,
-                            u64 journal_seq, unsigned flags)
+                            u64 journal_seq, unsigned flags,
+                            bool gc)
 {
        BUG_ON(!sectors);
 
@@ -649,28 +689,43 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
                struct extent_ptr_decoded p;
                s64 cached_sectors      = 0;
                s64 dirty_sectors       = 0;
+               s64 ec_sectors          = 0;
                unsigned replicas       = 0;
+               unsigned ec_redundancy  = 0;
+               unsigned i;
 
                extent_for_each_ptr_decode(e, p, entry) {
                        s64 disk_sectors = ptr_disk_sectors(e, p, sectors);
+                       s64 adjusted_disk_sectors = disk_sectors;
 
                        bch2_mark_pointer(c, e, p, disk_sectors, data_type,
-                                         stats, journal_seq, flags);
+                                         stats, journal_seq, flags, gc);
 
+                       if (!p.ptr.cached)
+                               for (i = 0; i < p.ec_nr; i++)
+                                       bch2_mark_stripe_ptr(c, p.ec[i],
+                                                       disk_sectors, flags,
+                                                       &adjusted_disk_sectors,
+                                                       &ec_redundancy);
                        if (!p.ptr.cached)
                                replicas++;
 
                        if (p.ptr.cached)
-                               cached_sectors  += disk_sectors;
+                               cached_sectors  += adjusted_disk_sectors;
+                       else if (!p.ec_nr)
+                               dirty_sectors   += adjusted_disk_sectors;
                        else
-                               dirty_sectors   += disk_sectors;
+                               ec_sectors      += adjusted_disk_sectors;
                }
 
                replicas        = clamp_t(unsigned,     replicas,
                                          1, ARRAY_SIZE(stats->replicas));
+               ec_redundancy   = clamp_t(unsigned,     ec_redundancy,
+                                         1, ARRAY_SIZE(stats->replicas));
 
                stats->replicas[0].data[BCH_DATA_CACHED]        += cached_sectors;
                stats->replicas[replicas - 1].data[data_type]   += dirty_sectors;
+               stats->replicas[ec_redundancy - 1].ec_data      += ec_sectors;
                break;
        }
        case BCH_RESERVATION: {
@@ -686,64 +741,129 @@ static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
        }
 }
 
-void bch2_mark_key(struct bch_fs *c,
-                  enum bkey_type type, struct bkey_s_c k,
-                  bool inserting, s64 sectors,
-                  struct gc_pos pos,
-                  struct bch_fs_usage *stats,
-                  u64 journal_seq, unsigned flags)
+static void bucket_set_stripe(struct bch_fs *c,
+                             const struct bch_stripe *v,
+                             bool enabled,
+                             struct bch_fs_usage *fs_usage,
+                             u64 journal_seq,
+                             bool gc)
 {
-       /*
-        * synchronization w.r.t. GC:
-        *
-        * Normally, bucket sector counts/marks are updated on the fly, as
-        * references are added/removed from the btree, the lists of buckets the
-        * allocator owns, other metadata buckets, etc.
-        *
-        * When GC is in progress and going to mark this reference, we do _not_
-        * mark this reference here, to avoid double counting - GC will count it
-        * when it gets to it.
-        *
-        * To know whether we should mark a given reference (GC either isn't
-        * running, or has already marked references at this position) we
-        * construct a total order for everything GC walks. Then, we can simply
-        * compare the position of the reference we're marking - @pos - with
-        * GC's current position. If GC is going to mark this reference, GC's
-        * current position will be less than @pos; if GC's current position is
-        * greater than @pos GC has either already walked this position, or
-        * isn't running.
-        *
-        * To avoid racing with GC's position changing, we have to deal with
-        *  - GC's position being set to GC_POS_MIN when GC starts:
-        *    usage_lock guards against this
-        *  - GC's position overtaking @pos: we guard against this with
-        *    whatever lock protects the data structure the reference lives in
-        *    (e.g. the btree node lock, or the relevant allocator lock).
-        */
+       unsigned i;
 
-       percpu_down_read_preempt_disable(&c->usage_lock);
-       if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) &&
-           gc_will_visit(c, pos))
-               flags |= BCH_BUCKET_MARK_GC_WILL_VISIT;
+       for (i = 0; i < v->nr_blocks; i++) {
+               const struct bch_extent_ptr *ptr = v->ptrs + i;
+               struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+               size_t b = PTR_BUCKET_NR(ca, ptr);
+               struct bucket *g = __bucket(ca, b, gc);
+               struct bucket_mark new, old;
+
+               BUG_ON(ptr_stale(ca, ptr));
+
+               old = bucket_cmpxchg(g, new, ({
+                       new.stripe                      = enabled;
+                       if (journal_seq) {
+                               new.journal_seq_valid   = 1;
+                               new.journal_seq         = journal_seq;
+                       }
+               }));
+
+               BUG_ON(old.stripe == enabled);
+
+               bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+       }
+}
+
+static void bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
+                            bool inserting,
+                            struct bch_fs_usage *fs_usage,
+                            u64 journal_seq, unsigned flags,
+                            bool gc)
+{
+       switch (k.k->type) {
+       case BCH_STRIPE: {
+               struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+               size_t idx = s.k->p.offset;
+               struct ec_stripe *m = genradix_ptr(&c->ec_stripes, idx);
+               unsigned i;
 
-       if (!stats)
-               stats = this_cpu_ptr(c->usage_percpu);
+               BUG_ON(!m);
+               BUG_ON(m->alive == inserting);
 
+               BUG_ON(atomic_read(&m->blocks_nonempty));
+
+               for (i = 0; i < EC_STRIPE_MAX; i++)
+                       BUG_ON(atomic_read(&m->block_sectors[i]));
+
+               if (inserting) {
+                       m->sectors      = le16_to_cpu(s.v->sectors);
+                       m->algorithm    = s.v->algorithm;
+                       m->nr_blocks    = s.v->nr_blocks;
+                       m->nr_redundant = s.v->nr_redundant;
+               }
+
+               if (inserting)
+                       bch2_stripes_heap_insert(c, m, idx);
+               else
+                       bch2_stripes_heap_del(c, m, idx);
+
+               bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc);
+               break;
+       }
+       }
+}
+
+static void __bch2_mark_key(struct bch_fs *c,
+                           enum bkey_type type, struct bkey_s_c k,
+                           bool inserting, s64 sectors,
+                           struct bch_fs_usage *stats,
+                           u64 journal_seq, unsigned flags,
+                           bool gc)
+{
        switch (type) {
        case BKEY_TYPE_BTREE:
                bch2_mark_extent(c, k, inserting
                                 ?  c->opts.btree_node_size
                                 : -c->opts.btree_node_size,
                                 BCH_DATA_BTREE,
-                                pos, stats, journal_seq, flags);
+                                stats, journal_seq, flags, gc);
                break;
        case BKEY_TYPE_EXTENTS:
                bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
-                                pos, stats, journal_seq, flags);
+                                stats, journal_seq, flags, gc);
+               break;
+       case BKEY_TYPE_EC:
+               bch2_mark_stripe(c, k, inserting,
+                                stats, journal_seq, flags, gc);
                break;
        default:
                break;
        }
+}
+
+void bch2_mark_key(struct bch_fs *c,
+                  enum bkey_type type, struct bkey_s_c k,
+                  bool inserting, s64 sectors,
+                  struct gc_pos pos,
+                  struct bch_fs_usage *stats,
+                  u64 journal_seq, unsigned flags)
+{
+       percpu_down_read_preempt_disable(&c->usage_lock);
+
+       if (!(flags & BCH_BUCKET_MARK_GC)) {
+               if (!stats)
+                       stats = this_cpu_ptr(c->usage[0]);
+
+               __bch2_mark_key(c, type, k, inserting, sectors,
+                               stats, journal_seq, flags, false);
+       }
+
+       if ((flags & BCH_BUCKET_MARK_GC) ||
+           gc_visited(c, pos)) {
+               __bch2_mark_key(c, type, k, inserting, sectors,
+                               this_cpu_ptr(c->usage[1]),
+                               journal_seq, flags, true);
+       }
+
        percpu_up_read_preempt_enable(&c->usage_lock);
 }
 
@@ -819,28 +939,20 @@ void bch2_mark_update(struct btree_insert *trans,
 
 /* Disk reservations: */
 
-static u64 __recalc_sectors_available(struct bch_fs *c)
+static u64 bch2_recalc_sectors_available(struct bch_fs *c)
 {
        int cpu;
 
        for_each_possible_cpu(cpu)
-               per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0;
+               per_cpu_ptr(c->usage[0], cpu)->available_cache = 0;
 
        return avail_factor(bch2_fs_sectors_free(c, bch2_fs_usage_read(c)));
 }
 
-/* Used by gc when it's starting: */
-void bch2_recalc_sectors_available(struct bch_fs *c)
-{
-       percpu_down_write(&c->usage_lock);
-       atomic64_set(&c->sectors_available, __recalc_sectors_available(c));
-       percpu_up_write(&c->usage_lock);
-}
-
 void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
 {
        percpu_down_read_preempt_disable(&c->usage_lock);
-       this_cpu_sub(c->usage_percpu->online_reserved,
+       this_cpu_sub(c->usage[0]->online_reserved,
                     res->sectors);
 
        bch2_fs_stats_verify(c);
@@ -860,7 +972,7 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
        int ret;
 
        percpu_down_read_preempt_disable(&c->usage_lock);
-       stats = this_cpu_ptr(c->usage_percpu);
+       stats = this_cpu_ptr(c->usage[0]);
 
        if (sectors <= stats->available_cache)
                goto out;
@@ -908,7 +1020,7 @@ recalculate:
        }
 
        percpu_down_write(&c->usage_lock);
-       sectors_available = __recalc_sectors_available(c);
+       sectors_available = bch2_recalc_sectors_available(c);
 
        if (sectors <= sectors_available ||
            (flags & BCH_DISK_RESERVATION_NOFAIL)) {
@@ -949,6 +1061,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 {
        struct bucket_array *buckets = NULL, *old_buckets = NULL;
        unsigned long *buckets_dirty = NULL;
+       unsigned long *buckets_written = NULL;
        u8 *oldest_gens = NULL;
        alloc_fifo      free[RESERVE_NR];
        alloc_fifo      free_inc;
@@ -962,7 +1075,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
        size_t copygc_reserve   = max_t(size_t, 2, nbuckets >> 7);
        size_t free_inc_nr      = max(max_t(size_t, 1, nbuckets >> 12),
                                      btree_reserve);
-       bool resize = ca->buckets != NULL,
+       bool resize = ca->buckets[0] != NULL,
             start_copygc = ca->copygc_thread != NULL;
        int ret = -ENOMEM;
        unsigned i;
@@ -980,6 +1093,9 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
            !(buckets_dirty     = kvpmalloc(BITS_TO_LONGS(nbuckets) *
                                            sizeof(unsigned long),
                                            GFP_KERNEL|__GFP_ZERO)) ||
+           !(buckets_written   = kvpmalloc(BITS_TO_LONGS(nbuckets) *
+                                           sizeof(unsigned long),
+                                           GFP_KERNEL|__GFP_ZERO)) ||
            !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) ||
            !init_fifo(&free[RESERVE_MOVINGGC],
                       copygc_reserve, GFP_KERNEL) ||
@@ -1014,13 +1130,17 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
                memcpy(buckets_dirty,
                       ca->buckets_dirty,
                       BITS_TO_LONGS(n) * sizeof(unsigned long));
+               memcpy(buckets_written,
+                      ca->buckets_written,
+                      BITS_TO_LONGS(n) * sizeof(unsigned long));
        }
 
-       rcu_assign_pointer(ca->buckets, buckets);
+       rcu_assign_pointer(ca->buckets[0], buckets);
        buckets = old_buckets;
 
        swap(ca->oldest_gens, oldest_gens);
        swap(ca->buckets_dirty, buckets_dirty);
+       swap(ca->buckets_written, buckets_written);
 
        if (resize)
                percpu_up_write(&c->usage_lock);
@@ -1060,6 +1180,8 @@ err:
                free_fifo(&free[i]);
        kvpfree(buckets_dirty,
                BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
+       kvpfree(buckets_written,
+               BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
        kvpfree(oldest_gens,
                nbuckets * sizeof(u8));
        if (buckets)
@@ -1077,19 +1199,21 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
        free_fifo(&ca->free_inc);
        for (i = 0; i < RESERVE_NR; i++)
                free_fifo(&ca->free[i]);
+       kvpfree(ca->buckets_written,
+               BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
        kvpfree(ca->buckets_dirty,
                BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
        kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
-       kvpfree(rcu_dereference_protected(ca->buckets, 1),
+       kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
                sizeof(struct bucket_array) +
                ca->mi.nbuckets * sizeof(struct bucket));
 
-       free_percpu(ca->usage_percpu);
+       free_percpu(ca->usage[0]);
 }
 
 int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
 {
-       if (!(ca->usage_percpu = alloc_percpu(struct bch_dev_usage)))
+       if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
                return -ENOMEM;
 
        return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);;
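
Among the buckets.c changes, bch2_mark_stripe_ptr() charges an extent that lives in an erasure-coded stripe its proportional share of the stripe's parity: roughly parity = DIV_ROUND_UP(|sectors| * nr_redundant, nr_data), with the sign of the original delta. For a 6+2 stripe and a 120-sector write that is DIV_ROUND_UP(120 * 2, 6) = 40 extra sectors, 160 total. A standalone sketch of just that arithmetic (the stripe bookkeeping and heap updates are left out):

#include <stdio.h>
#include <stdlib.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* charge a data delta its share of stripe parity (sketch of bch2_mark_stripe_ptr()) */
static long long adjusted_disk_sectors(long long sectors,
				       unsigned nr_blocks, unsigned nr_redundant)
{
	unsigned nr_data = nr_blocks - nr_redundant;
	long long parity = DIV_ROUND_UP(llabs(sectors) * nr_redundant, nr_data);

	if (sectors < 0)
		parity = -parity;

	return sectors + parity;
}

int main(void)
{
	/* 6+2 stripe, 120 data sectors written: 120 + ceil(120*2/6) = 160 */
	printf("%lld\n", adjusted_disk_sectors(120, 8, 2));
	/* and the matching negative delta when the extent is dropped again */
	printf("%lld\n", adjusted_disk_sectors(-120, 8, 2));
	return 0;
}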
index e84247d55aa4022f9057667e6d6261b40e7426df..76ebe2ecb56ff834a442bf586150b4b21e2e0f90 100644 (file)
        _old;                                                   \
 })
 
-static inline struct bucket_array *bucket_array(struct bch_dev *ca)
+static inline struct bucket_array *__bucket_array(struct bch_dev *ca,
+                                                 bool gc)
 {
-       return rcu_dereference_check(ca->buckets,
+       return rcu_dereference_check(ca->buckets[gc],
                                     !ca->fs ||
                                     percpu_rwsem_is_held(&ca->fs->usage_lock) ||
                                     lockdep_is_held(&ca->fs->gc_lock) ||
                                     lockdep_is_held(&ca->bucket_lock));
 }
 
-static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
+static inline struct bucket_array *bucket_array(struct bch_dev *ca)
+{
+       return __bucket_array(ca, false);
+}
+
+static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc)
 {
-       struct bucket_array *buckets = bucket_array(ca);
+       struct bucket_array *buckets = __bucket_array(ca, gc);
 
        BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
        return buckets->b + b;
 }
 
+static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
+{
+       return __bucket(ca, b, false);
+}
+
 static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca,
                                         size_t b, int rw)
 {
@@ -128,7 +139,7 @@ static inline bool bucket_unused(struct bucket_mark mark)
 
 /* Device usage: */
 
-struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *);
+struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *, bool);
 struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *);
 
 static inline u64 __dev_buckets_available(struct bch_dev *ca,
@@ -167,7 +178,7 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
 
 /* Filesystem usage: */
 
-struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *);
+struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *, bool);
 struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *);
 void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
                         struct disk_reservation *, struct gc_pos);
@@ -184,6 +195,7 @@ static inline bool is_available_bucket(struct bucket_mark mark)
 {
        return (!mark.owned_by_allocator &&
                !mark.dirty_sectors &&
+               !mark.stripe &&
                !mark.nouse);
 }
 
@@ -205,17 +217,13 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
                               struct gc_pos, unsigned);
 
 #define BCH_BUCKET_MARK_NOATOMIC               (1 << 0)
-#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE   (1 << 1)
-#define BCH_BUCKET_MARK_GC_WILL_VISIT          (1 << 2)
-#define BCH_BUCKET_MARK_GC_LOCK_HELD           (1 << 3)
+#define BCH_BUCKET_MARK_GC                     (1 << 1)
 
 void bch2_mark_key(struct bch_fs *, enum bkey_type, struct bkey_s_c,
                   bool, s64, struct gc_pos,
                   struct bch_fs_usage *, u64, unsigned);
 void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *);
 
-void bch2_recalc_sectors_available(struct bch_fs *);
-
 void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *);
 
 static inline void bch2_disk_reservation_put(struct bch_fs *c,
index 6f7d3a23d2f09862456e4dba2f2cd1d1c87ef747..0b1bd95419ca0f97e45c5d301e496c10c490f181 100644 (file)
@@ -18,7 +18,8 @@ struct bucket_mark {
                                gen_valid:1,
                                owned_by_allocator:1,
                                nouse:1,
-                               journal_seq_valid:1;
+                               journal_seq_valid:1,
+                               stripe:1;
                u16             dirty_sectors;
                u16             cached_sectors;
 
@@ -52,6 +53,7 @@ struct bucket_array {
 struct bch_dev_usage {
        u64                     buckets[BCH_DATA_NR];
        u64                     buckets_alloc;
+       u64                     buckets_ec;
        u64                     buckets_unavailable;
 
        /* _compressed_ sectors: */
@@ -61,15 +63,18 @@ struct bch_dev_usage {
 
 struct bch_fs_usage {
        /* all fields are in units of 512 byte sectors: */
-       u64                     online_reserved;
-       u64                     available_cache;
 
        struct {
                u64             data[BCH_DATA_NR];
+               u64             ec_data;
                u64             persistent_reserved;
        }                       replicas[BCH_REPLICAS_MAX];
 
        u64                     buckets[BCH_DATA_NR];
+
+       /* fields starting here aren't touched by gc: */
+       u64                     online_reserved;
+       u64                     available_cache;
 };
 
 /*
index 6379905bad7b4ee341e9fea7244a2c94b28dc8f7..e74fc1f8aee5cbd77863b8ab5390790917da0955 100644 (file)
@@ -601,11 +601,13 @@ have_compressed:
                        goto out;
        }
 
-       ret = mempool_init_kmalloc_pool(
-                       &c->decompress_workspace,
-                       1, decompress_workspace_size);
-       if (ret)
-               goto out;
+       if (!mempool_initialized(&c->decompress_workspace)) {
+               ret = mempool_init_kmalloc_pool(
+                               &c->decompress_workspace,
+                               1, decompress_workspace_size);
+               if (ret)
+                       goto out;
+       }
 out:
        pr_verbose_init(c->opts, "ret %i", ret);
        return ret;
index d6047283ddea7b40ca6f111f6059c2a7e1197340..b90b0ef5c85f8beecbed11710b0bff57d31d3bcb 100644 (file)
@@ -54,6 +54,19 @@ static inline struct target target_decode(unsigned target)
 }
 
 const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned);
+
+static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c,
+                                                 enum bch_data_type data_type,
+                                                 u16 target)
+{
+       struct bch_devs_mask devs = c->rw_devs[data_type];
+       const struct bch_devs_mask *t = bch2_target_to_mask(c, target);
+
+       if (t)
+               bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX);
+       return devs;
+}
+
 bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned);
 
 int bch2_disk_path_find(struct bch_sb_handle *, const char *);
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
new file mode 100644 (file)
index 0000000..02c51ea
--- /dev/null
@@ -0,0 +1,1283 @@
+
+/* erasure coding */
+
+#include "bcachefs.h"
+#include "alloc_foreground.h"
+#include "bset.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "buckets.h"
+#include "disk_groups.h"
+#include "ec.h"
+#include "error.h"
+#include "io.h"
+#include "keylist.h"
+#include "super-io.h"
+#include "util.h"
+
+#include <linux/sort.h>
+
+#ifdef __KERNEL__
+
+#include <linux/raid/pq.h>
+#include <linux/raid/xor.h>
+
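+/*
+ * Recover the block at @failed_idx as the XOR of all the other blocks,
+ * feeding xor_blocks() up to MAX_XOR_BLOCKS sources at a time:
+ */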
+static void raid5_recov(unsigned disks, unsigned failed_idx,
+                       size_t size, void **data)
+{
+       unsigned i = 2, nr;
+
+       BUG_ON(failed_idx >= disks);
+
+       swap(data[0], data[failed_idx]);
+       memcpy(data[0], data[1], size);
+
+       while (i < disks) {
+               nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS);
+               xor_blocks(nr, size, data[0], data + i);
+               i += nr;
+       }
+
+       swap(data[0], data[failed_idx]);
+}
+
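+/*
+ * Generate the redundancy blocks: a single XOR parity block when np == 1,
+ * the full RAID6 P/Q syndrome when np == 2:
+ */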
+static void raid_gen(int nd, int np, size_t size, void **v)
+{
+       if (np >= 1)
+               raid5_recov(nd + np, nd, size, v);
+       if (np >= 2)
+               raid6_call.gen_syndrome(nd + np, size, v);
+       BUG_ON(np > 2);
+}
+
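+/*
+ * Reconstruct @nr failed blocks (sorted indices in @ir) from the surviving
+ * data and redundancy blocks:
+ */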
+static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
+{
+       switch (nr) {
+       case 0:
+               break;
+       case 1:
+               if (ir[0] < nd + 1)
+                       raid5_recov(nd + 1, ir[0], size, v);
+               else
+                       raid6_call.gen_syndrome(nd + np, size, v);
+               break;
+       case 2:
+               if (ir[1] < nd) {
+                       /* data + data failure */
+                       raid6_2data_recov(nd + np, size, ir[0], ir[1], v);
+               } else if (ir[0] < nd) {
+                       /* data + p/q failure */
+
+                       if (ir[1] == nd) /* data + p failure */
+                               raid6_datap_recov(nd + np, size, ir[0], v);
+                       else { /* data + q failure */
+                               raid5_recov(nd + 1, ir[0], size, v);
+                               raid6_call.gen_syndrome(nd + np, size, v);
+                       }
+               } else {
+                       raid_gen(nd, np, size, v);
+               }
+               break;
+       default:
+               BUG();
+       }
+}
+
+#else
+
+#include <raid/raid.h>
+
+#endif
+
+struct ec_bio {
+       struct bch_dev          *ca;
+       struct ec_stripe_buf    *buf;
+       size_t                  idx;
+       struct bio              bio;
+};
+
+/* Stripes btree keys: */
+
+static unsigned stripe_csums_per_device(const struct bch_stripe *s)
+{
+       return DIV_ROUND_UP(le16_to_cpu(s->sectors),
+                           1 << s->csum_granularity_bits);
+}
+
+static unsigned stripe_val_u64s(const struct bch_stripe *s)
+{
+       unsigned bytes = sizeof(struct bch_stripe) +
+               sizeof(struct bch_extent_ptr) * s->nr_blocks +
+               bch_crc_bytes[s->csum_type] * s->nr_blocks * stripe_csums_per_device(s);
+       return DIV_ROUND_UP(bytes, sizeof(u64));
+}
+
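+/*
+ * Checksums are stored after the pointers in the stripe value: one per
+ * csum-granularity chunk, per block:
+ */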
+static void *stripe_csum(struct bch_stripe *s, unsigned dev, unsigned csum_idx)
+{
+       unsigned csum_bytes = bch_crc_bytes[s->csum_type];
+       void *csums = s->ptrs + s->nr_blocks;
+
+       BUG_ON(!csum_bytes);
+
+       return csums + (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes;
+}
+
+const char *bch2_ec_key_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+       if (k.k->p.inode)
+               return "invalid stripe key";
+
+       switch (k.k->type) {
+       case BCH_STRIPE: {
+               const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+
+               if (bkey_val_bytes(k.k) < sizeof(*s))
+                       return "incorrect value size";
+
+               if (bkey_val_u64s(k.k) != stripe_val_u64s(s))
+                       return "incorrect value size";
+
+               return NULL;
+       }
+       default:
+               return "invalid type";
+       }
+}
+
+void bch2_ec_key_to_text(struct printbuf *out, struct bch_fs *c,
+                        struct bkey_s_c k)
+{
+       switch (k.k->type) {
+       case BCH_STRIPE: {
+               const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+               unsigned i;
+
+               pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
+                      s->algorithm,
+                      le16_to_cpu(s->sectors),
+                      s->nr_blocks - s->nr_redundant,
+                      s->nr_redundant,
+                      s->csum_type,
+                      1U << s->csum_granularity_bits);
+
+               for (i = 0; i < s->nr_blocks; i++)
+                       pr_buf(out, " %u:%llu", s->ptrs[i].dev,
+                              (u64) s->ptrs[i].offset);
+       }
+       }
+}
+
+static int ptr_matches_stripe(struct bch_fs *c,
+                             struct bch_stripe *v,
+                             const struct bch_extent_ptr *ptr)
+{
+       unsigned i;
+
+       for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) {
+               const struct bch_extent_ptr *ptr2 = v->ptrs + i;
+
+               if (ptr->dev == ptr2->dev &&
+                   ptr->gen == ptr2->gen &&
+                   ptr->offset >= ptr2->offset &&
+                   ptr->offset <  ptr2->offset + le16_to_cpu(v->sectors))
+                       return i;
+       }
+
+       return -1;
+}
+
+static int extent_matches_stripe(struct bch_fs *c,
+                                struct bch_stripe *v,
+                                struct bkey_s_c k)
+{
+       struct bkey_s_c_extent e;
+       const struct bch_extent_ptr *ptr;
+       int idx;
+
+       if (!bkey_extent_is_data(k.k))
+               return -1;
+
+       e = bkey_s_c_to_extent(k);
+
+       extent_for_each_ptr(e, ptr) {
+               idx = ptr_matches_stripe(c, v, ptr);
+               if (idx >= 0)
+                       return idx;
+       }
+
+       return -1;
+}
+
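+/*
+ * Initialize a stripe key from freshly allocated data and parity buckets,
+ * widening the checksum granularity until the value fits in a bkey:
+ */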
+static void ec_stripe_key_init(struct bch_fs *c,
+                              struct bkey_i_stripe *s,
+                              struct open_buckets *blocks,
+                              struct open_buckets *parity,
+                              unsigned stripe_size)
+{
+       struct open_bucket *ob;
+       unsigned i, u64s;
+
+       bkey_stripe_init(&s->k_i);
+       s->v.sectors                    = cpu_to_le16(stripe_size);
+       s->v.algorithm                  = 0;
+       s->v.nr_blocks                  = parity->nr + blocks->nr;
+       s->v.nr_redundant               = parity->nr;
+       s->v.csum_granularity_bits      = ilog2(c->sb.encoded_extent_max);
+       s->v.csum_type                  = BCH_CSUM_CRC32C;
+       s->v.pad                        = 0;
+
+       open_bucket_for_each(c, blocks, ob, i)
+               s->v.ptrs[i]                    = ob->ptr;
+
+       open_bucket_for_each(c, parity, ob, i)
+               s->v.ptrs[blocks->nr + i]       = ob->ptr;
+
+       while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) {
+               BUG_ON(1 << s->v.csum_granularity_bits >=
+                      le16_to_cpu(s->v.sectors) ||
+                      s->v.csum_granularity_bits == U8_MAX);
+               s->v.csum_granularity_bits++;
+       }
+
+       set_bkey_val_u64s(&s->k, u64s);
+}
+
+/* Checksumming: */
+
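+/* Compute a checksum for each csum-granularity chunk of every block: */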
+static void ec_generate_checksums(struct ec_stripe_buf *buf)
+{
+       struct bch_stripe *v = &buf->key.v;
+       unsigned csum_granularity = 1 << v->csum_granularity_bits;
+       unsigned csums_per_device = stripe_csums_per_device(v);
+       unsigned csum_bytes = bch_crc_bytes[v->csum_type];
+       unsigned i, j;
+
+       if (!csum_bytes)
+               return;
+
+       BUG_ON(buf->offset);
+       BUG_ON(buf->size != le16_to_cpu(v->sectors));
+
+       for (i = 0; i < v->nr_blocks; i++) {
+               for (j = 0; j < csums_per_device; j++) {
+                       unsigned offset = j << v->csum_granularity_bits;
+                       unsigned len = min(csum_granularity, buf->size - offset);
+
+                       struct bch_csum csum =
+                               bch2_checksum(NULL, v->csum_type,
+                                             null_nonce(),
+                                             buf->data[i] + (offset << 9),
+                                             len << 9);
+
+                       memcpy(stripe_csum(v, i, j), &csum, csum_bytes);
+               }
+       }
+}
+
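+/*
+ * Verify chunk checksums for the blocks we were able to read; a block with a
+ * bad checksum is marked invalid so it gets reconstructed instead:
+ */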
+static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
+{
+       struct bch_stripe *v = &buf->key.v;
+       unsigned csum_granularity = 1 << v->csum_granularity_bits;
+       unsigned csum_bytes = bch_crc_bytes[v->csum_type];
+       unsigned i;
+
+       if (!csum_bytes)
+               return;
+
+       for (i = 0; i < v->nr_blocks; i++) {
+               unsigned offset = buf->offset;
+               unsigned end = buf->offset + buf->size;
+
+               if (!test_bit(i, buf->valid))
+                       continue;
+
+               while (offset < end) {
+                       unsigned j = offset >> v->csum_granularity_bits;
+                       unsigned len = min(csum_granularity, end - offset);
+                       struct bch_csum csum;
+
+                       BUG_ON(offset & (csum_granularity - 1));
+                       BUG_ON(offset + len != le16_to_cpu(v->sectors) &&
+                              ((offset + len) & (csum_granularity - 1)));
+
+                       csum = bch2_checksum(NULL, v->csum_type,
+                                            null_nonce(),
+                                            buf->data[i] + ((offset - buf->offset) << 9),
+                                            len << 9);
+
+                       if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) {
+                               __bcache_io_error(c,
+                                       "checksum error while doing reconstruct read (%u:%u)",
+                                       i, j);
+                               clear_bit(i, buf->valid);
+                               break;
+                       }
+
+                       offset += len;
+               }
+       }
+}
+
+/* Erasure coding: */
+
+static void ec_generate_ec(struct ec_stripe_buf *buf)
+{
+       struct bch_stripe *v = &buf->key.v;
+       unsigned nr_data = v->nr_blocks - v->nr_redundant;
+       unsigned bytes = le16_to_cpu(v->sectors) << 9;
+
+       raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
+}
+
+static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr)
+{
+       return nr - bitmap_weight(buf->valid, nr);
+}
+
+static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
+{
+       return __ec_nr_failed(buf, buf->key.v.nr_blocks);
+}
+
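+/*
+ * Reconstruct the missing blocks from the redundancy blocks, failing if more
+ * blocks are missing than we have redundancy:
+ */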
+static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
+{
+       struct bch_stripe *v = &buf->key.v;
+       unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0;
+       unsigned nr_data = v->nr_blocks - v->nr_redundant;
+       unsigned bytes = buf->size << 9;
+
+       if (ec_nr_failed(buf) > v->nr_redundant) {
+               __bcache_io_error(c,
+                       "error doing reconstruct read: unable to read enough blocks");
+               return -1;
+       }
+
+       for (i = 0; i < nr_data; i++)
+               if (!test_bit(i, buf->valid))
+                       failed[nr_failed++] = i;
+
+       raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data);
+       return 0;
+}
+
+/* IO: */
+
+static void ec_block_endio(struct bio *bio)
+{
+       struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
+       struct bch_dev *ca = ec_bio->ca;
+       struct closure *cl = bio->bi_private;
+
+       if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding"))
+               clear_bit(ec_bio->idx, ec_bio->buf->valid);
+
+       bio_put(&ec_bio->bio);
+       percpu_ref_put(&ca->io_ref);
+       closure_put(cl);
+}
+
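+/*
+ * Read or write one block of the stripe buffer, splitting the IO into
+ * multiple bios if it exceeds BIO_MAX_PAGES worth of pages:
+ */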
+static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf,
+                       unsigned rw, unsigned idx, struct closure *cl)
+{
+       struct bch_stripe *v = &buf->key.v;
+       unsigned offset = 0, bytes = buf->size << 9;
+       struct bch_extent_ptr *ptr = &v->ptrs[idx];
+       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+       if (!bch2_dev_get_ioref(ca, rw)) {
+               clear_bit(idx, buf->valid);
+               return;
+       }
+
+       while (offset < bytes) {
+               unsigned nr_iovecs = min_t(size_t, BIO_MAX_PAGES,
+                                          DIV_ROUND_UP(bytes, PAGE_SIZE));
+               unsigned b = min_t(size_t, bytes - offset,
+                                  nr_iovecs << PAGE_SHIFT);
+               struct ec_bio *ec_bio;
+
+               ec_bio = container_of(bio_alloc_bioset(GFP_KERNEL, nr_iovecs,
+                                                      &c->ec_bioset),
+                                     struct ec_bio, bio);
+
+               ec_bio->ca                      = ca;
+               ec_bio->buf                     = buf;
+               ec_bio->idx                     = idx;
+
+               bio_set_dev(&ec_bio->bio, ca->disk_sb.bdev);
+               bio_set_op_attrs(&ec_bio->bio, rw, 0);
+
+               ec_bio->bio.bi_iter.bi_sector   = ptr->offset + buf->offset + (offset >> 9);
+               ec_bio->bio.bi_iter.bi_size     = b;
+               ec_bio->bio.bi_end_io           = ec_block_endio;
+               ec_bio->bio.bi_private          = cl;
+
+               bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset);
+
+               closure_get(cl);
+               percpu_ref_get(&ca->io_ref);
+
+               submit_bio(&ec_bio->bio);
+
+               offset += b;
+       }
+
+       percpu_ref_put(&ca->io_ref);
+}
+
+/* recovery read path: */
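+/*
+ * Reconstruct the data @rbio wanted: look up the stripe, read the relevant
+ * range of every block, verify checksums, recover the missing data and copy
+ * the requested range into the bio:
+ */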
+int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio)
+{
+       struct btree_iter iter;
+       struct ec_stripe_buf *buf;
+       struct closure cl;
+       struct bkey_s_c k;
+       struct bch_stripe *v;
+       unsigned stripe_idx;
+       unsigned offset, end;
+       unsigned i, nr_data, csum_granularity;
+       int ret = 0, idx;
+
+       closure_init_stack(&cl);
+
+       BUG_ON(!rbio->pick.idx ||
+              rbio->pick.idx - 1 >= rbio->pick.ec_nr);
+
+       stripe_idx = rbio->pick.ec[rbio->pick.idx - 1].idx;
+
+       buf = kzalloc(sizeof(*buf), GFP_NOIO);
+       if (!buf)
+               return -ENOMEM;
+
+       bch2_btree_iter_init(&iter, c, BTREE_ID_EC,
+                            POS(0, stripe_idx),
+                            BTREE_ITER_SLOTS);
+       k = bch2_btree_iter_peek_slot(&iter);
+       if (btree_iter_err(k) || k.k->type != BCH_STRIPE) {
+               __bcache_io_error(c,
+                       "error doing reconstruct read: stripe not found");
+               kfree(buf);
+               return bch2_btree_iter_unlock(&iter) ?: -EIO;
+       }
+
+       bkey_reassemble(&buf->key.k_i, k);
+       bch2_btree_iter_unlock(&iter);
+
+       v = &buf->key.v;
+
+       nr_data = v->nr_blocks - v->nr_redundant;
+
+       idx = ptr_matches_stripe(c, v, &rbio->pick.ptr);
+       BUG_ON(idx < 0);
+
+       csum_granularity = 1U << v->csum_granularity_bits;
+
+       offset  = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset;
+       end     = offset + bio_sectors(&rbio->bio);
+
+       BUG_ON(end > le16_to_cpu(v->sectors));
+
+       buf->offset     = round_down(offset, csum_granularity);
+       buf->size       = min_t(unsigned, le16_to_cpu(v->sectors),
+                               round_up(end, csum_granularity)) - buf->offset;
+
+       for (i = 0; i < v->nr_blocks; i++) {
+               buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO);
+               if (!buf->data[i]) {
+                       ret = -ENOMEM;
+                       goto err;
+               }
+       }
+
+       memset(buf->valid, 0xFF, sizeof(buf->valid));
+
+       for (i = 0; i < v->nr_blocks; i++) {
+               struct bch_extent_ptr *ptr = v->ptrs + i;
+               struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+               if (ptr_stale(ca, ptr)) {
+                       __bcache_io_error(c,
+                                         "error doing reconstruct read: stale pointer");
+                       clear_bit(i, buf->valid);
+                       continue;
+               }
+
+               ec_block_io(c, buf, REQ_OP_READ, i, &cl);
+       }
+
+       closure_sync(&cl);
+
+       if (ec_nr_failed(buf) > v->nr_redundant) {
+               __bcache_io_error(c,
+                       "error doing reconstruct read: unable to read enough blocks");
+               ret = -EIO;
+               goto err;
+       }
+
+       ec_validate_checksums(c, buf);
+
+       ret = ec_do_recov(c, buf);
+       if (ret)
+               goto err;
+
+       memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter,
+                     buf->data[idx] + ((offset - buf->offset) << 9));
+err:
+       for (i = 0; i < v->nr_blocks; i++)
+               kfree(buf->data[i]);
+       kfree(buf);
+       return ret;
+}
+
+/* ec_stripe bucket accounting: */
+
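+/*
+ * Make sure the in-memory stripes heap and radix tree have room for stripe
+ * @idx, growing them if necessary:
+ */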
+static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
+{
+       ec_stripes_heap n, *h = &c->ec_stripes_heap;
+
+       if (idx >= h->size) {
+               if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp))
+                       return -ENOMEM;
+
+               spin_lock(&c->ec_stripes_heap_lock);
+               if (n.size > h->size) {
+                       memcpy(n.data, h->data, h->used * sizeof(h->data[0]));
+                       n.used = h->used;
+                       swap(*h, n);
+               }
+               spin_unlock(&c->ec_stripes_heap_lock);
+
+               free_heap(&n);
+       }
+
+       if (!genradix_ptr_alloc(&c->ec_stripes, idx, gfp))
+               return -ENOMEM;
+
+       return 0;
+}
+
+static int ec_stripe_mem_alloc(struct bch_fs *c,
+                              struct btree_iter *iter)
+{
+       size_t idx = iter->pos.offset;
+
+       if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT))
+               return 0;
+
+       bch2_btree_iter_unlock(iter);
+
+       if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL))
+               return -EINTR;
+       return -ENOMEM;
+}
+
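+/*
+ * The heap is ordered by blocks_nonempty, so a stripe with no blocks holding
+ * live data (if one exists) will be at the top, ready to be deleted:
+ */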
+static ssize_t stripe_idx_to_delete(struct bch_fs *c)
+{
+       ec_stripes_heap *h = &c->ec_stripes_heap;
+
+       return h->data[0].blocks_nonempty == 0 ? h->data[0].idx : -1;
+}
+
+static inline int ec_stripes_heap_cmp(ec_stripes_heap *h,
+                                     struct ec_stripe_heap_entry l,
+                                     struct ec_stripe_heap_entry r)
+{
+       return ((l.blocks_nonempty > r.blocks_nonempty) -
+               (l.blocks_nonempty < r.blocks_nonempty));
+}
+
+static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h,
+                                                  size_t i)
+{
+       struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap);
+
+       genradix_ptr(&c->ec_stripes, h->data[i].idx)->heap_idx = i;
+}
+
+static void heap_verify_backpointer(struct bch_fs *c, size_t idx)
+{
+       ec_stripes_heap *h = &c->ec_stripes_heap;
+       struct ec_stripe *m = genradix_ptr(&c->ec_stripes, idx);
+
+       BUG_ON(!m->alive);
+       BUG_ON(m->heap_idx >= h->used);
+       BUG_ON(h->data[m->heap_idx].idx != idx);
+}
+
+static inline unsigned stripe_entry_blocks(struct ec_stripe *m)
+{
+       return atomic_read(&m->pin)
+               ? UINT_MAX : atomic_read(&m->blocks_nonempty);
+}
+
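+/*
+ * Re-sort a stripe's heap entry after its blocks_nonempty count changes; if
+ * an empty stripe ends up at the top of the heap, kick off the delete worker:
+ */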
+void bch2_stripes_heap_update(struct bch_fs *c,
+                             struct ec_stripe *m, size_t idx)
+{
+       ec_stripes_heap *h = &c->ec_stripes_heap;
+       bool queue_delete;
+       size_t i;
+
+       spin_lock(&c->ec_stripes_heap_lock);
+
+       if (!m->alive) {
+               spin_unlock(&c->ec_stripes_heap_lock);
+               return;
+       }
+
+       heap_verify_backpointer(c, idx);
+
+       h->data[m->heap_idx].blocks_nonempty =
+               stripe_entry_blocks(m);
+
+       i = m->heap_idx;
+       heap_sift_up(h,   i, ec_stripes_heap_cmp,
+                    ec_stripes_heap_set_backpointer);
+       heap_sift_down(h, i, ec_stripes_heap_cmp,
+                      ec_stripes_heap_set_backpointer);
+
+       heap_verify_backpointer(c, idx);
+
+       queue_delete = stripe_idx_to_delete(c) >= 0;
+       spin_unlock(&c->ec_stripes_heap_lock);
+
+       if (queue_delete)
+               schedule_work(&c->ec_stripe_delete_work);
+}
+
+void bch2_stripes_heap_del(struct bch_fs *c,
+                          struct ec_stripe *m, size_t idx)
+{
+       spin_lock(&c->ec_stripes_heap_lock);
+       heap_verify_backpointer(c, idx);
+
+       m->alive = false;
+       heap_del(&c->ec_stripes_heap, m->heap_idx,
+                ec_stripes_heap_cmp,
+                ec_stripes_heap_set_backpointer);
+       spin_unlock(&c->ec_stripes_heap_lock);
+}
+
+void bch2_stripes_heap_insert(struct bch_fs *c,
+                             struct ec_stripe *m, size_t idx)
+{
+       spin_lock(&c->ec_stripes_heap_lock);
+
+       BUG_ON(heap_full(&c->ec_stripes_heap));
+
+       heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) {
+                       .idx = idx,
+                       .blocks_nonempty = stripe_entry_blocks(m),
+               }),
+                ec_stripes_heap_cmp,
+                ec_stripes_heap_set_backpointer);
+       m->alive = true;
+
+       heap_verify_backpointer(c, idx);
+
+       spin_unlock(&c->ec_stripes_heap_lock);
+}
+
+static void ec_stripe_delete(struct bch_fs *c, unsigned idx)
+{
+       struct btree_iter iter;
+       struct bch_stripe *v = NULL;
+       struct bkey_s_c k;
+       struct bkey_i delete;
+       u64 journal_seq = 0;
+
+       bch2_btree_iter_init(&iter, c, BTREE_ID_EC,
+                            POS(0, idx),
+                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&iter);
+       if (btree_iter_err(k) || k.k->type != BCH_STRIPE)
+               goto out;
+
+       v = kmalloc(bkey_val_bytes(k.k), GFP_KERNEL);
+       BUG_ON(!v);
+       memcpy(v, bkey_s_c_to_stripe(k).v, bkey_val_bytes(k.k));
+
+       bkey_init(&delete.k);
+       delete.k.p = iter.pos;
+
+       bch2_btree_insert_at(c, NULL, &journal_seq,
+                            BTREE_INSERT_NOFAIL|
+                            BTREE_INSERT_USE_RESERVE|
+                            BTREE_INSERT_NOUNLOCK,
+                            BTREE_INSERT_ENTRY(&iter, &delete));
+out:
+       bch2_btree_iter_unlock(&iter);
+       kfree(v);
+}
+
+static void ec_stripe_delete_work(struct work_struct *work)
+{
+       struct bch_fs *c =
+               container_of(work, struct bch_fs, ec_stripe_delete_work);
+       ssize_t idx;
+
+       down_read(&c->gc_lock);
+
+       while (1) {
+               spin_lock(&c->ec_stripes_heap_lock);
+               idx = stripe_idx_to_delete(c);
+               spin_unlock(&c->ec_stripes_heap_lock);
+
+               if (idx < 0)
+                       break;
+
+               ec_stripe_delete(c, idx);
+       }
+
+       up_read(&c->gc_lock);
+}
+
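+/*
+ * Find an empty slot in the stripes btree and insert the new stripe key
+ * there, pinning the in-memory stripe until creation finishes:
+ */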
+static int ec_stripe_bkey_insert(struct bch_fs *c,
+                                struct bkey_i_stripe *stripe)
+{
+       struct ec_stripe *m;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       /* XXX: start pos hint */
+retry:
+       for_each_btree_key(&iter, c, BTREE_ID_EC, POS_MIN,
+                          BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) {
+               if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) {
+                       bch2_btree_iter_unlock(&iter);
+                       return -ENOSPC;
+               }
+
+               if (bkey_deleted(k.k))
+                       goto found_slot;
+       }
+
+       return bch2_btree_iter_unlock(&iter) ?: -ENOSPC;
+found_slot:
+       mutex_lock(&c->ec_stripes_lock);
+       ret = ec_stripe_mem_alloc(c, &iter);
+       mutex_unlock(&c->ec_stripes_lock);
+
+       if (ret == -EINTR)
+               goto retry;
+       if (ret)
+               return ret;
+
+       m = genradix_ptr(&c->ec_stripes, iter.pos.offset);
+       atomic_inc(&m->pin);
+
+       stripe->k.p = iter.pos;
+
+       ret = bch2_btree_insert_at(c, NULL, NULL,
+                                  BTREE_INSERT_NOFAIL|
+                                  BTREE_INSERT_USE_RESERVE,
+                                  BTREE_INSERT_ENTRY(&iter, &stripe->k_i));
+       bch2_btree_iter_unlock(&iter);
+
+       if (ret)
+               atomic_dec(&m->pin);
+
+       return ret;
+}
+
+/* stripe creation: */
+
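+/*
+ * Splice a stripe pointer entry into an extent immediately before @ptr,
+ * shifting @ptr and everything after it up:
+ */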
+static void extent_stripe_ptr_add(struct bkey_s_extent e,
+                                 struct ec_stripe_buf *s,
+                                 struct bch_extent_ptr *ptr,
+                                 unsigned block)
+{
+       struct bch_extent_stripe_ptr *dst = (void *) ptr;
+       union bch_extent_entry *end = extent_entry_last(e);
+
+       memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst);
+       e.k->u64s += sizeof(*dst) / sizeof(u64);
+
+       *dst = (struct bch_extent_stripe_ptr) {
+               .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
+               .block          = block,
+               .idx            = s->key.k.p.offset,
+       };
+}
+
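+/*
+ * Rewrite the extents that were written to the new stripe's data buckets so
+ * they carry a stripe pointer, marking pointers to other devices cached:
+ */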
+static int ec_stripe_update_ptrs(struct bch_fs *c,
+                                struct ec_stripe_buf *s,
+                                struct bkey *pos)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_s_extent e;
+       struct bch_extent_ptr *ptr;
+       BKEY_PADDED(k) tmp;
+       int ret = 0, dev, idx;
+
+       bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS,
+                            bkey_start_pos(pos),
+                            BTREE_ITER_INTENT);
+
+       while ((k = bch2_btree_iter_peek(&iter)).k &&
+              !btree_iter_err(k) &&
+              bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) {
+               idx = extent_matches_stripe(c, &s->key.v, k);
+               if (idx < 0) {
+                       bch2_btree_iter_next(&iter);
+                       continue;
+               }
+
+               dev = s->key.v.ptrs[idx].dev;
+
+               bkey_reassemble(&tmp.k, k);
+               e = bkey_i_to_s_extent(&tmp.k);
+
+               extent_for_each_ptr(e, ptr)
+                       if (ptr->dev != dev)
+                               ptr->cached = true;
+
+               ptr = (void *) bch2_extent_has_device(e.c, dev);
+               BUG_ON(!ptr);
+
+               extent_stripe_ptr_add(e, s, ptr, idx);
+
+               ret = bch2_btree_insert_at(c, NULL, NULL,
+                               BTREE_INSERT_ATOMIC|
+                               BTREE_INSERT_NOFAIL|
+                               BTREE_INSERT_USE_RESERVE,
+                               BTREE_INSERT_ENTRY(&iter, &tmp.k));
+               if (ret == -EINTR)
+                       ret = 0;
+               if (ret)
+                       break;
+       }
+
+       return bch2_btree_iter_unlock(&iter) ?: ret;
+}
+
+/*
+ * All data buckets of the new stripe have been written: create the stripe
+ */
+static void ec_stripe_create(struct ec_stripe_new *s)
+{
+       struct ec_stripe *ec_stripe;
+       struct bch_fs *c = s->c;
+       struct open_bucket *ob;
+       struct bkey_i *k;
+       struct bch_stripe *v = &s->stripe.key.v;
+       unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
+       struct closure cl;
+       int ret;
+
+       BUG_ON(s->h->s == s);
+
+       closure_init_stack(&cl);
+
+       if (s->err) {
+               bch_err(c, "error creating stripe: error writing data buckets");
+               goto err;
+       }
+
+       if (!percpu_ref_tryget(&c->writes))
+               goto err;
+
+       BUG_ON(bitmap_weight(s->blocks_allocated,
+                            s->blocks.nr) != s->blocks.nr);
+
+       ec_generate_ec(&s->stripe);
+
+       ec_generate_checksums(&s->stripe);
+
+       /* write p/q: */
+       for (i = nr_data; i < v->nr_blocks; i++)
+               ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl);
+
+       closure_sync(&cl);
+
+       for (i = nr_data; i < v->nr_blocks; i++)
+               if (!test_bit(i, s->stripe.valid)) {
+                       bch_err(c, "error creating stripe: error writing redundancy buckets");
+                       goto err_put_writes;
+               }
+
+       ret = ec_stripe_bkey_insert(c, &s->stripe.key);
+       if (ret) {
+               bch_err(c, "error creating stripe: error creating stripe key");
+               goto err_put_writes;
+       }
+
+       for_each_keylist_key(&s->keys, k) {
+               ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k);
+               if (ret)
+                       break;
+       }
+
+       ec_stripe = genradix_ptr(&c->ec_stripes, s->stripe.key.k.p.offset);
+
+       atomic_dec(&ec_stripe->pin);
+       bch2_stripes_heap_update(c, ec_stripe,
+                                s->stripe.key.k.p.offset);
+
+err_put_writes:
+       percpu_ref_put(&c->writes);
+err:
+       open_bucket_for_each(c, &s->blocks, ob, i) {
+               ob->ec = NULL;
+               __bch2_open_bucket_put(c, ob);
+       }
+
+       bch2_open_buckets_put(c, &s->parity);
+
+       bch2_keylist_free(&s->keys, s->inline_keys);
+
+       mutex_lock(&s->h->lock);
+       list_del(&s->list);
+       mutex_unlock(&s->h->lock);
+
+       for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
+               kvpfree(s->stripe.data[i], s->stripe.size << 9);
+       kfree(s);
+}
+
+static struct ec_stripe_new *ec_stripe_set_pending(struct ec_stripe_head *h)
+{
+       struct ec_stripe_new *s = h->s;
+
+       list_add(&s->list, &h->stripes);
+       h->s = NULL;
+
+       return s;
+}
+
+static void ec_stripe_new_put(struct ec_stripe_new *s)
+{
+       BUG_ON(atomic_read(&s->pin) <= 0);
+       if (atomic_dec_and_test(&s->pin))
+               ec_stripe_create(s);
+}
+
+/* have a full bucket - hand it off to be erasure coded: */
+void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob)
+{
+       struct ec_stripe_new *s = ob->ec;
+
+       if (ob->sectors_free)
+               s->err = -1;
+
+       ec_stripe_new_put(s);
+}
+
+void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob)
+{
+       struct ec_stripe_new *s = ob->ec;
+
+       s->err = -EIO;
+}
+
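+/*
+ * If this write point is writing to a stripe, return the position in the
+ * stripe buffer where the next write to the bucket will land:
+ */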
+void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
+{
+       struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
+       struct bch_dev *ca;
+       unsigned offset;
+
+       if (!ob)
+               return NULL;
+
+       ca      = bch_dev_bkey_exists(c, ob->ptr.dev);
+       offset  = ca->mi.bucket_size - ob->sectors_free;
+
+       return ob->ec->stripe.data[ob->ec_idx] + (offset << 9);
+}
+
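+/*
+ * Remember the extent just written through this write point, so its pointers
+ * can be rewritten to point at the stripe once the stripe is created:
+ */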
+void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
+                            struct bpos pos, unsigned sectors)
+{
+       struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs);
+       struct ec_stripe_new *ec;
+
+       if (!ob)
+               return;
+
+       ec = ob->ec;
+       mutex_lock(&ec->lock);
+
+       if (bch2_keylist_realloc(&ec->keys, ec->inline_keys,
+                                ARRAY_SIZE(ec->inline_keys),
+                                BKEY_U64s)) {
+               BUG();
+       }
+
+       bkey_init(&ec->keys.top->k);
+       ec->keys.top->k.p       = pos;
+       bch2_key_resize(&ec->keys.top->k, sectors);
+       bch2_keylist_push(&ec->keys);
+
+       mutex_unlock(&ec->lock);
+}
+
+static int unsigned_cmp(const void *_l, const void *_r)
+{
+       unsigned l = *((const unsigned *) _l);
+       unsigned r = *((const unsigned *) _r);
+
+       return (l > r) - (l < r);
+}
+
+/* pick most common bucket size: */
+static unsigned pick_blocksize(struct bch_fs *c,
+                              struct bch_devs_mask *devs)
+{
+       struct bch_dev *ca;
+       unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX];
+       struct {
+               unsigned nr, size;
+       } cur = { 0, 0 }, best = { 0, 0 };
+
+       for_each_member_device_rcu(ca, c, i, devs)
+               sizes[nr++] = ca->mi.bucket_size;
+
+       sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);
+
+       for (i = 0; i < nr; i++) {
+               if (sizes[i] != cur.size) {
+                       if (cur.nr > best.nr)
+                               best = cur;
+
+                       cur.nr = 0;
+                       cur.size = sizes[i];
+               }
+
+               cur.nr++;
+       }
+
+       if (cur.nr > best.nr)
+               best = cur;
+
+       return best.size;
+}
+
+int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h)
+{
+       struct ec_stripe_new *s;
+       unsigned i;
+
+       BUG_ON(h->parity.nr != h->redundancy);
+       BUG_ON(!h->blocks.nr);
+       BUG_ON(h->parity.nr + h->blocks.nr > EC_STRIPE_MAX);
+       lockdep_assert_held(&h->lock);
+
+       s = kzalloc(sizeof(*s), GFP_KERNEL);
+       if (!s)
+               return -ENOMEM;
+
+       mutex_init(&s->lock);
+       atomic_set(&s->pin, 1);
+       s->c            = c;
+       s->h            = h;
+       s->blocks       = h->blocks;
+       s->parity       = h->parity;
+
+       memset(&h->blocks, 0, sizeof(h->blocks));
+       memset(&h->parity, 0, sizeof(h->parity));
+
+       bch2_keylist_init(&s->keys, s->inline_keys);
+
+       s->stripe.offset        = 0;
+       s->stripe.size          = h->blocksize;
+       memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid));
+
+       ec_stripe_key_init(c, &s->stripe.key,
+                          &s->blocks, &s->parity,
+                          h->blocksize);
+
+       for (i = 0; i < s->stripe.key.v.nr_blocks; i++) {
+               s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL);
+               if (!s->stripe.data[i])
+                       goto err;
+       }
+
+       h->s = s;
+
+       return 0;
+err:
+       for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
+               kvpfree(s->stripe.data[i], s->stripe.size << 9);
+       kfree(s);
+       return -ENOMEM;
+}
+
+static struct ec_stripe_head *
+ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
+                        unsigned algo, unsigned redundancy)
+{
+       struct ec_stripe_head *h;
+       struct bch_dev *ca;
+       unsigned i;
+
+       h = kzalloc(sizeof(*h), GFP_KERNEL);
+       if (!h)
+               return NULL;
+
+       mutex_init(&h->lock);
+       mutex_lock(&h->lock);
+       INIT_LIST_HEAD(&h->stripes);
+
+       h->target       = target;
+       h->algo         = algo;
+       h->redundancy   = redundancy;
+
+       rcu_read_lock();
+       h->devs = target_rw_devs(c, BCH_DATA_USER, target);
+
+       for_each_member_device_rcu(ca, c, i, &h->devs)
+               if (!ca->mi.durability)
+                       __clear_bit(i, h->devs.d);
+
+       h->blocksize = pick_blocksize(c, &h->devs);
+
+       for_each_member_device_rcu(ca, c, i, &h->devs)
+               if (ca->mi.bucket_size == h->blocksize)
+                       h->nr_active_devs++;
+
+       rcu_read_unlock();
+       list_add(&h->list, &c->ec_new_stripe_list);
+       return h;
+}
+
+void bch2_ec_stripe_head_put(struct ec_stripe_head *h)
+{
+       struct ec_stripe_new *s = NULL;
+
+       if (h->s &&
+           bitmap_weight(h->s->blocks_allocated,
+                         h->s->blocks.nr) == h->s->blocks.nr)
+               s = ec_stripe_set_pending(h);
+
+       mutex_unlock(&h->lock);
+
+       if (s)
+               ec_stripe_new_put(s);
+}
+
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
+                                              unsigned target,
+                                              unsigned algo,
+                                              unsigned redundancy)
+{
+       struct ec_stripe_head *h;
+
+       if (!redundancy)
+               return NULL;
+
+       mutex_lock(&c->ec_new_stripe_lock);
+       list_for_each_entry(h, &c->ec_new_stripe_list, list)
+               if (h->target           == target &&
+                   h->algo             == algo &&
+                   h->redundancy       == redundancy) {
+                       mutex_lock(&h->lock);
+                       goto found;
+               }
+
+       h = ec_new_stripe_head_alloc(c, target, algo, redundancy);
+found:
+       mutex_unlock(&c->ec_new_stripe_lock);
+       return h;
+}
+
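+/*
+ * Stop using @ca for new stripes: drop its buckets from every stripe head
+ * and abort any partially filled stripe that was using them:
+ */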
+void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca)
+{
+       struct ec_stripe_head *h;
+       struct open_bucket *ob;
+       unsigned i;
+
+       mutex_lock(&c->ec_new_stripe_lock);
+       list_for_each_entry(h, &c->ec_new_stripe_list, list) {
+               struct ec_stripe_new *s = NULL;
+
+               mutex_lock(&h->lock);
+               bch2_open_buckets_stop_dev(c, ca,
+                                          &h->blocks,
+                                          BCH_DATA_USER);
+               bch2_open_buckets_stop_dev(c, ca,
+                                          &h->parity,
+                                          BCH_DATA_USER);
+
+               if (!h->s)
+                       goto unlock;
+
+               open_bucket_for_each(c, &h->s->blocks, ob, i)
+                       if (ob->ptr.dev == ca->dev_idx)
+                               goto found;
+               open_bucket_for_each(c, &h->s->parity, ob, i)
+                       if (ob->ptr.dev == ca->dev_idx)
+                               goto found;
+               goto unlock;
+found:
+               h->s->err = -1;
+               s = ec_stripe_set_pending(h);
+unlock:
+               mutex_unlock(&h->lock);
+
+               if (s)
+                       ec_stripe_new_put(s);
+       }
+       mutex_unlock(&c->ec_new_stripe_lock);
+}
+
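+/*
+ * At startup, find the highest allocated stripe index and size the in-memory
+ * heap and radix tree to cover it:
+ */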
+int bch2_fs_ec_start(struct bch_fs *c)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       size_t i, idx = 0;
+       int ret = 0;
+
+       bch2_btree_iter_init(&iter, c, BTREE_ID_EC, POS(0, U64_MAX), 0);
+
+       k = bch2_btree_iter_prev(&iter);
+       if (!IS_ERR_OR_NULL(k.k))
+               idx = k.k->p.offset + 1;
+       ret = bch2_btree_iter_unlock(&iter);
+       if (ret)
+               return ret;
+
+       if (!init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx),
+                      GFP_KERNEL))
+               return -ENOMEM;
+#if 0
+       ret = genradix_prealloc(&c->ec_stripes, idx, GFP_KERNEL);
+#else
+       for (i = 0; i < idx; i++)
+               if (!genradix_ptr_alloc(&c->ec_stripes, i, GFP_KERNEL))
+                       return -ENOMEM;
+#endif
+       return 0;
+}
+
+void bch2_fs_ec_exit(struct bch_fs *c)
+{
+       struct ec_stripe_head *h;
+
+       while (1) {
+               mutex_lock(&c->ec_new_stripe_lock);
+               h = list_first_entry_or_null(&c->ec_new_stripe_list,
+                                            struct ec_stripe_head, list);
+               if (h)
+                       list_del(&h->list);
+               mutex_unlock(&c->ec_new_stripe_lock);
+               if (!h)
+                       break;
+
+               BUG_ON(h->s);
+               BUG_ON(!list_empty(&h->stripes));
+               kfree(h);
+       }
+
+       free_heap(&c->ec_stripes_heap);
+       genradix_free(&c->ec_stripes);
+       bioset_exit(&c->ec_bioset);
+}
+
+int bch2_fs_ec_init(struct bch_fs *c)
+{
+       INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work);
+
+       return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio),
+                          BIOSET_NEED_BVECS);
+}
diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h
new file mode 100644 (file)
index 0000000..13b875a
--- /dev/null
@@ -0,0 +1,108 @@
+#ifndef _BCACHEFS_EC_H
+#define _BCACHEFS_EC_H
+
+#include "ec_types.h"
+#include "keylist_types.h"
+
+const char *bch2_ec_key_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_ec_key_to_text(struct printbuf *, struct bch_fs *,
+                        struct bkey_s_c);
+
+#define bch2_bkey_ec_ops (struct bkey_ops) {           \
+       .key_invalid    = bch2_ec_key_invalid,          \
+       .val_to_text    = bch2_ec_key_to_text,          \
+}
+
+struct bch_read_bio;
+
+struct ec_stripe_buf {
+       /* might not be buffering the entire stripe: */
+       unsigned                offset;
+       unsigned                size;
+       unsigned long           valid[BITS_TO_LONGS(EC_STRIPE_MAX)];
+
+       void                    *data[EC_STRIPE_MAX];
+
+       union {
+               struct bkey_i_stripe    key;
+               u64                     pad[255];
+       };
+};
+
+struct ec_stripe_head;
+
+struct ec_stripe_new {
+       struct bch_fs           *c;
+       struct ec_stripe_head   *h;
+       struct mutex            lock;
+       struct list_head        list;
+
+       /* counts in-flight writes; the stripe is created when pin == 0 */
+       atomic_t                pin;
+
+       int                     err;
+
+       unsigned long           blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)];
+
+       struct open_buckets     blocks;
+       struct open_buckets     parity;
+
+       struct keylist          keys;
+       u64                     inline_keys[BKEY_U64s * 8];
+
+       struct ec_stripe_buf    stripe;
+};
+
+struct ec_stripe_head {
+       struct list_head        list;
+       struct mutex            lock;
+
+       struct list_head        stripes;
+
+       unsigned                target;
+       unsigned                algo;
+       unsigned                redundancy;
+
+       struct bch_devs_mask    devs;
+       unsigned                nr_active_devs;
+
+       unsigned                blocksize;
+
+       struct dev_stripe_state block_stripe;
+       struct dev_stripe_state parity_stripe;
+
+       struct open_buckets     blocks;
+       struct open_buckets     parity;
+
+       struct ec_stripe_new    *s;
+};
+
+int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *);
+
+void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *);
+void bch2_ec_add_backpointer(struct bch_fs *, struct write_point *,
+                            struct bpos, unsigned);
+
+void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *);
+void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
+
+int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
+
+void bch2_ec_stripe_head_put(struct ec_stripe_head *);
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned,
+                                              unsigned, unsigned);
+
+void bch2_stripes_heap_update(struct bch_fs *, struct ec_stripe *, size_t);
+void bch2_stripes_heap_del(struct bch_fs *, struct ec_stripe *, size_t);
+void bch2_stripes_heap_insert(struct bch_fs *, struct ec_stripe *, size_t);
+
+void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *);
+
+void bch2_ec_flush_new_stripes(struct bch_fs *);
+
+int bch2_fs_ec_start(struct bch_fs *);
+
+void bch2_fs_ec_exit(struct bch_fs *);
+int bch2_fs_ec_init(struct bch_fs *);
+
+#endif /* _BCACHEFS_EC_H */
diff --git a/libbcachefs/ec_types.h b/libbcachefs/ec_types.h
new file mode 100644 (file)
index 0000000..feb3601
--- /dev/null
@@ -0,0 +1,30 @@
+#ifndef _BCACHEFS_EC_TYPES_H
+#define _BCACHEFS_EC_TYPES_H
+
+#include <linux/llist.h>
+
+#define EC_STRIPE_MAX  16
+
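+/* In-memory per-stripe state, kept in c->ec_stripes: */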
+struct ec_stripe {
+       size_t                  heap_idx;
+
+       u16                     sectors;
+       u8                      algorithm;
+
+       u8                      nr_blocks;
+       u8                      nr_redundant;
+
+       u8                      alive;
+       atomic_t                pin;
+       atomic_t                blocks_nonempty;
+       atomic_t                block_sectors[EC_STRIPE_MAX];
+};
+
+struct ec_stripe_heap_entry {
+       size_t                  idx;
+       unsigned                blocks_nonempty;
+};
+
+typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap;
+
+#endif /* _BCACHEFS_EC_TYPES_H */
index a3ec1cc9a5d20dfb42859de88c6b92fee3f5ca24..ebaf390fd651f5962190b4072bd623cf70214509 100644 (file)
@@ -193,29 +193,41 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
        return nr_ptrs;
 }
 
-unsigned bch2_extent_ptr_durability(struct bch_fs *c,
-                                   const struct bch_extent_ptr *ptr)
+static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
+                                          struct extent_ptr_decoded p)
 {
+       unsigned i, durability = 0;
        struct bch_dev *ca;
 
-       if (ptr->cached)
+       if (p.ptr.cached)
                return 0;
 
-       ca = bch_dev_bkey_exists(c, ptr->dev);
+       ca = bch_dev_bkey_exists(c, p.ptr.dev);
 
-       if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
-               return 0;
+       if (ca->mi.state != BCH_MEMBER_STATE_FAILED)
+               durability = max_t(unsigned, durability, ca->mi.durability);
+
+       for (i = 0; i < p.ec_nr; i++) {
+               struct ec_stripe *s =
+                       genradix_ptr(&c->ec_stripes, p.idx);
 
-       return ca->mi.durability;
+               if (WARN_ON(!s))
+                       continue;
+
+               durability = max_t(unsigned, durability, s->nr_redundant);
+       }
+
+       return durability;
 }
 
 unsigned bch2_extent_durability(struct bch_fs *c, struct bkey_s_c_extent e)
 {
-       const struct bch_extent_ptr *ptr;
+       const union bch_extent_entry *entry;
+       struct extent_ptr_decoded p;
        unsigned durability = 0;
 
-       extent_for_each_ptr(e, ptr)
-               durability += bch2_extent_ptr_durability(c, ptr);
+       extent_for_each_ptr_decode(e, p, entry)
+               durability += bch2_extent_ptr_durability(c, p);
 
        return durability;
 }
@@ -258,30 +270,46 @@ bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e,
        return false;
 }
 
+static union bch_extent_entry *extent_entry_prev(struct bkey_s_extent e,
+                                         union bch_extent_entry *entry)
+{
+       union bch_extent_entry *i = e.v->start;
+
+       if (i == entry)
+               return NULL;
+
+       while (extent_entry_next(i) != entry)
+               i = extent_entry_next(i);
+       return i;
+}
+
 union bch_extent_entry *bch2_extent_drop_ptr(struct bkey_s_extent e,
                                             struct bch_extent_ptr *ptr)
 {
-       union bch_extent_entry *dst;
-       union bch_extent_entry *src;
+       union bch_extent_entry *dst, *src, *prev;
+       bool drop_crc = true;
 
        EBUG_ON(ptr < &e.v->start->ptr ||
                ptr >= &extent_entry_last(e)->ptr);
        EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
 
-       src = to_entry(ptr + 1);
-
+       src = extent_entry_next(to_entry(ptr));
        if (src != extent_entry_last(e) &&
-           extent_entry_type(src) == BCH_EXTENT_ENTRY_ptr) {
-               dst = to_entry(ptr);
-       } else {
-               extent_for_each_entry(e, dst) {
-                       if (dst == to_entry(ptr))
-                               break;
+           !extent_entry_is_crc(src))
+               drop_crc = false;
 
-                       if (extent_entry_next(dst) == to_entry(ptr) &&
-                           extent_entry_is_crc(dst))
-                               break;
+       dst = to_entry(ptr);
+       while ((prev = extent_entry_prev(e, dst))) {
+               if (extent_entry_is_ptr(prev))
+                       break;
+
+               if (extent_entry_is_crc(prev)) {
+                       if (drop_crc)
+                               dst = prev;
+                       break;
                }
+
+               dst = prev;
        }
 
        memmove_u64s_down(dst, src,
@@ -423,6 +451,8 @@ void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
                                entry->crc128.csum.lo = (__force __le64)
                                        swab64((__force u64) entry->crc128.csum.lo);
                                break;
+                       case BCH_EXTENT_ENTRY_stripe_ptr:
+                               break;
                        }
                }
                break;
@@ -470,6 +500,7 @@ static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c,
        const union bch_extent_entry *entry;
        struct bch_extent_crc_unpacked crc;
        const struct bch_extent_ptr *ptr;
+       const struct bch_extent_stripe_ptr *ec;
        struct bch_dev *ca;
        bool first = true;
 
@@ -478,6 +509,18 @@ static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c,
                        pr_buf(out, " ");
 
                switch (__extent_entry_type(entry)) {
+               case BCH_EXTENT_ENTRY_ptr:
+                       ptr = entry_to_ptr(entry);
+                       ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+                               ? bch_dev_bkey_exists(c, ptr->dev)
+                               : NULL;
+
+                       pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev,
+                              (u64) ptr->offset, ptr->gen,
+                              ptr->cached ? " cached" : "",
+                              ca && ptr_stale(ca, ptr)
+                              ? " stale" : "");
+                       break;
                case BCH_EXTENT_ENTRY_crc32:
                case BCH_EXTENT_ENTRY_crc64:
                case BCH_EXTENT_ENTRY_crc128:
@@ -490,17 +533,11 @@ static void extent_print_ptrs(struct printbuf *out, struct bch_fs *c,
                               crc.csum_type,
                               crc.compression_type);
                        break;
-               case BCH_EXTENT_ENTRY_ptr:
-                       ptr = entry_to_ptr(entry);
-                       ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
-                               ? bch_dev_bkey_exists(c, ptr->dev)
-                               : NULL;
+               case BCH_EXTENT_ENTRY_stripe_ptr:
+                       ec = &entry->stripe_ptr;
 
-                       pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev,
-                              (u64) ptr->offset, ptr->gen,
-                              ptr->cached ? " cached" : "",
-                              ca && ptr_stale(ca, ptr)
-                              ? " stale" : "");
+                       pr_buf(out, "ec: idx %llu block %u",
+                              (u64) ec->idx, ec->block);
                        break;
                default:
                        pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
@@ -536,6 +573,11 @@ void bch2_mark_io_failure(struct bch_io_failures *failed,
 
                f = &failed->devs[failed->nr++];
                f->dev          = p->ptr.dev;
+               f->idx          = p->idx;
+               f->nr_failed    = 1;
+               f->nr_retries   = 0;
+       } else if (p->idx != f->idx) {
+               f->idx          = p->idx;
                f->nr_failed    = 1;
                f->nr_retries   = 0;
        } else {
@@ -550,15 +592,22 @@ static inline bool ptr_better(struct bch_fs *c,
                              const struct extent_ptr_decoded p1,
                              const struct extent_ptr_decoded p2)
 {
-       struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev);
-       struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev);
+       if (likely(!p1.idx && !p2.idx)) {
+               struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev);
+               struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev);
+
+               u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
+               u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
 
-       u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
-       u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
+               /* Pick at random, biased in favor of the faster device: */
+
+               return bch2_rand_range(l1 + l2) > l1;
+       }
 
-       /* Pick at random, biased in favor of the faster device: */
+       if (force_reconstruct_read(c))
+               return p1.idx > p2.idx;
 
-       return bch2_rand_range(l1 + l2) > l1;
+       return p1.idx < p2.idx;
 }
 
 static int extent_pick_read_device(struct bch_fs *c,
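The biased pick above amounts to choosing each pointer with probability proportional to the other device's current read latency. A minimal standalone sketch of that idea, assuming only that bch2_rand_range() returns a uniform value in [0, max); the rand_range() helper below is a stand-in, not the in-tree implementation:

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* Stand-in for bch2_rand_range(): uniform value in [0, max). */
static uint64_t rand_range(uint64_t max)
{
        return max ? (uint64_t) rand() % max : 0;
}

/*
 * Model of ptr_better()'s fast path: returns true when the pointer whose
 * device currently shows read latency l1 should be used.  That happens
 * with probability roughly l2 / (l1 + l2), so the faster device is
 * favored while the slower one still sees occasional reads.
 */
static bool prefer_first(uint64_t l1, uint64_t l2)
{
        return rand_range(l1 + l2) > l1;
}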
@@ -579,7 +628,20 @@ static int extent_pick_read_device(struct bch_fs *c,
                        continue;
 
                f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL;
-               if (f && f->nr_failed >= f->nr_retries)
+               if (f)
+                       p.idx = f->nr_failed < f->nr_retries
+                               ? f->idx
+                               : f->idx + 1;
+
+               if (!p.idx &&
+                   !bch2_dev_is_readable(ca))
+                       p.idx++;
+
+               if (force_reconstruct_read(c) &&
+                   !p.idx && p.ec_nr)
+                       p.idx++;
+
+               if (p.idx >= p.ec_nr + 1)
                        continue;
 
                if (ret && !ptr_better(c, p, *pick))
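For context on the new idx tests: a decoded pointer's idx appears to select the read strategy, 0 being a plain read from the device and 1..ec_nr a reconstruct read via the corresponding stripe pointer. A small model of how a recorded failure advances that choice (the helper name is illustrative, not from the tree):

/*
 * While retries remain for the strategy that failed, stay on the same
 * idx; once they're exhausted, move to the next one.  The pointer is
 * skipped entirely when idx walks past ec_nr, mirroring the
 * "p.idx >= p.ec_nr + 1" test above.
 */
static unsigned next_read_idx(unsigned failed_idx,
                              unsigned nr_failed, unsigned nr_retries)
{
        return nr_failed < nr_retries ? failed_idx : failed_idx + 1;
}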
@@ -616,8 +678,8 @@ const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
                        if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
                                return "invalid extent entry type";
 
-                       if (extent_entry_is_crc(entry))
-                               return "has crc field";
+                       if (!extent_entry_is_ptr(entry))
+                               return "has non ptr field";
                }
 
                extent_for_each_ptr(e, ptr) {
@@ -754,6 +816,8 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
                        case BCH_EXTENT_ENTRY_crc128:
                                entry->crc128.offset += e.k->size - len;
                                break;
+                       case BCH_EXTENT_ENTRY_stripe_ptr:
+                               break;
                        }
 
                        if (extent_entry_is_crc(entry))
@@ -1512,7 +1576,18 @@ const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
                        if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
                                return "invalid extent entry type";
 
-                       if (extent_entry_is_crc(entry)) {
+                       switch (extent_entry_type(entry)) {
+                       case BCH_EXTENT_ENTRY_ptr:
+                               ptr = entry_to_ptr(entry);
+
+                               reason = extent_ptr_invalid(c, e, &entry->ptr,
+                                                           size_ondisk, false);
+                               if (reason)
+                                       return reason;
+                               break;
+                       case BCH_EXTENT_ENTRY_crc32:
+                       case BCH_EXTENT_ENTRY_crc64:
+                       case BCH_EXTENT_ENTRY_crc128:
                                crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
 
                                if (crc.offset + e.k->size >
@@ -1533,13 +1608,9 @@ const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
                                        else if (nonce != crc.offset + crc.nonce)
                                                return "incorrect nonce";
                                }
-                       } else {
-                               ptr = entry_to_ptr(entry);
-
-                               reason = extent_ptr_invalid(c, e, &entry->ptr,
-                                                           size_ondisk, false);
-                               if (reason)
-                                       return reason;
+                               break;
+                       case BCH_EXTENT_ENTRY_stripe_ptr:
+                               break;
                        }
                }
 
@@ -1744,6 +1815,7 @@ void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e,
 {
        struct bch_extent_crc_unpacked crc;
        union bch_extent_entry *pos;
+       unsigned i;
 
        extent_for_each_crc(extent_i_to_s(e), crc, pos)
                if (!bch2_crc_unpacked_cmp(crc, p->crc))
@@ -1754,6 +1826,11 @@ void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e,
 found:
        p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
        __extent_entry_insert(e, pos, to_entry(&p->ptr));
+
+       for (i = 0; i < p->ec_nr; i++) {
+               p->ec[i].type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
+               __extent_entry_insert(e, pos, to_entry(&p->ec[i]));
+       }
 }
 
 /*
@@ -1808,26 +1885,27 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c,
                                      unsigned target,
                                      unsigned nr_desired_replicas)
 {
-       struct bch_extent_ptr *ptr;
+       union bch_extent_entry *entry;
+       struct extent_ptr_decoded p;
        int extra = bch2_extent_durability(c, e.c) - nr_desired_replicas;
 
        if (target && extra > 0)
-               extent_for_each_ptr(e, ptr) {
-                       int n = bch2_extent_ptr_durability(c, ptr);
+               extent_for_each_ptr_decode(e, p, entry) {
+                       int n = bch2_extent_ptr_durability(c, p);
 
                        if (n && n <= extra &&
-                           !bch2_dev_in_target(c, ptr->dev, target)) {
-                               ptr->cached = true;
+                           !bch2_dev_in_target(c, p.ptr.dev, target)) {
+                               entry->ptr.cached = true;
                                extra -= n;
                        }
                }
 
        if (extra > 0)
-               extent_for_each_ptr(e, ptr) {
-                       int n = bch2_extent_ptr_durability(c, ptr);
+               extent_for_each_ptr_decode(e, p, entry) {
+                       int n = bch2_extent_ptr_durability(c, p);
 
                        if (n && n <= extra) {
-                               ptr->cached = true;
+                               entry->ptr.cached = true;
                                extra -= n;
                        }
                }
@@ -1903,7 +1981,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *c, struct btree *b,
 
                        if ((extent_entry_type(en_l) !=
                             extent_entry_type(en_r)) ||
-                           extent_entry_is_crc(en_l))
+                           !extent_entry_is_ptr(en_l))
                                return BCH_MERGE_NOMERGE;
 
                        lp = &en_l->ptr;
index 5b786cb29fcdbefc5feb464faec398c259f5d236..307abd26f3c3ded933a0da93296363c430db03bb 100644 (file)
@@ -95,8 +95,6 @@ unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent);
 unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c);
 unsigned bch2_extent_is_compressed(struct bkey_s_c);
 
-unsigned bch2_extent_ptr_durability(struct bch_fs *,
-                                   const struct bch_extent_ptr *);
 unsigned bch2_extent_durability(struct bch_fs *, struct bkey_s_c_extent);
 
 bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent,
@@ -361,20 +359,13 @@ bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc)
 
 /* Iterate over pointers, with crcs: */
 
-static inline struct extent_ptr_decoded
-__extent_ptr_decoded_init(const struct bkey *k)
-{
-       return (struct extent_ptr_decoded) {
-               .crc            = bch2_extent_crc_unpack(k, NULL),
-       };
-}
-
-#define EXTENT_ITERATE_EC              (1 << 0)
-
 #define __extent_ptr_next_decode(_e, _ptr, _entry)                     \
 ({                                                                     \
        __label__ out;                                                  \
                                                                        \
+       (_ptr).idx      = 0;                                            \
+       (_ptr).ec_nr    = 0;                                            \
+                                                                       \
        extent_for_each_entry_from(_e, _entry, _entry)                  \
                switch (extent_entry_type(_entry)) {                    \
                case BCH_EXTENT_ENTRY_ptr:                              \
@@ -386,14 +377,16 @@ __extent_ptr_decoded_init(const struct bkey *k)
                        (_ptr).crc = bch2_extent_crc_unpack((_e).k,     \
                                        entry_to_crc(_entry));          \
                        break;                                          \
+               case BCH_EXTENT_ENTRY_stripe_ptr:                       \
+                       (_ptr).ec[(_ptr).ec_nr++] = _entry->stripe_ptr; \
+                       break;                                          \
                }                                                       \
-                                                                       \
 out:                                                                   \
        _entry < extent_entry_last(_e);                                 \
 })
 
 #define extent_for_each_ptr_decode(_e, _ptr, _entry)                   \
-       for ((_ptr) = __extent_ptr_decoded_init((_e).k),                \
+       for ((_ptr).crc = bch2_extent_crc_unpack((_e).k, NULL),         \
             (_entry) = (_e).v->start;                                  \
             __extent_ptr_next_decode(_e, _ptr, _entry);                \
             (_entry) = extent_entry_next(_entry))
index 02c625672ad2011d3830cff1d1833ad1f6b9cab9..efd72e26259daafb0a03b106cddf21d0167e0fd5 100644 (file)
@@ -19,14 +19,18 @@ struct bch_extent_crc_unpacked {
 };
 
 struct extent_ptr_decoded {
+       unsigned                        idx;
+       unsigned                        ec_nr;
        struct bch_extent_crc_unpacked  crc;
        struct bch_extent_ptr           ptr;
+       struct bch_extent_stripe_ptr    ec[4];
 };
 
 struct bch_io_failures {
        u8                      nr;
        struct bch_dev_io_failures {
                u8              dev;
+               u8              idx;
                u8              nr_failed;
                u8              nr_retries;
        }                       devs[BCH_REPLICAS_MAX];
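A hedged usage fragment (kernel-style, not standalone; assumes e is a struct bkey_s_c_extent already known to be valid) showing what the new fields buy: extent_for_each_ptr_decode() now hands back each pointer together with the stripe pointers that follow it, so a failed read can fall back to reconstruction without re-walking the key.

        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
        unsigned i;

        extent_for_each_ptr_decode(e, p, entry) {
                /* p.ptr is the device pointer, p.crc the preceding crc entry */
                for (i = 0; i < p.ec_nr; i++)
                        pr_debug("dev %u: stripe idx %llu block %u",
                                 p.ptr.dev, (u64) p.ec[i].idx, p.ec[i].block);
        }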
index 986bb7d28b0fc6c426a580815b205949a8a0af69..34cfd5d6224ee0eb2d9d30f09318652e08e53468 100644 (file)
@@ -454,12 +454,12 @@ struct bch_page_state {
 union { struct {
        /* existing data: */
        unsigned                sectors:PAGE_SECTOR_SHIFT + 1;
+
+       /* Uncompressed, fully allocated replicas: */
        unsigned                nr_replicas:4;
-       unsigned                compressed:1;
 
-       /* Owns PAGE_SECTORS sized reservation: */
-       unsigned                reserved:1;
-       unsigned                reservation_replicas:4;
+       /* Owns PAGE_SECTORS * replicas_reserved sized reservation: */
+       unsigned                replicas_reserved:4;
 
        /* Owns PAGE_SECTORS sized quota reservation: */
        unsigned                quota_reserved:1;
@@ -506,7 +506,7 @@ static inline struct bch_page_state *page_state(struct page *page)
 static inline unsigned page_res_sectors(struct bch_page_state s)
 {
 
-       return s.reserved ? s.reservation_replicas * PAGE_SECTORS : 0;
+       return s.replicas_reserved * PAGE_SECTORS;
 }
 
 static void __bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *inode,
@@ -524,8 +524,10 @@ static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *i
 {
        struct bch_page_state s;
 
+       EBUG_ON(!PageLocked(page));
+
        s = page_state_cmpxchg(page_state(page), s, {
-               s.reserved              = 0;
+               s.replicas_reserved     = 0;
                s.quota_reserved        = 0;
        });
 
@@ -535,62 +537,46 @@ static void bch2_put_page_reservation(struct bch_fs *c, struct bch_inode_info *i
 static int bch2_get_page_reservation(struct bch_fs *c, struct bch_inode_info *inode,
                                     struct page *page, bool check_enospc)
 {
-       struct bch_page_state *s = page_state(page), new, old;
+       struct bch_page_state *s = page_state(page), new;
 
        /* XXX: this should not be open coded */
        unsigned nr_replicas = inode->ei_inode.bi_data_replicas
                ? inode->ei_inode.bi_data_replicas - 1
                : c->opts.data_replicas;
-
-       struct disk_reservation disk_res = bch2_disk_reservation_init(c,
-                                               nr_replicas);
+       struct disk_reservation disk_res;
        struct quota_res quota_res = { 0 };
-       int ret = 0;
+       int ret;
 
-       /*
-        * XXX: this could likely be quite a bit simpler, page reservations
-        * _should_ only be manipulated with page locked:
-        */
+       EBUG_ON(!PageLocked(page));
 
-       old = page_state_cmpxchg(s, new, {
-               if (new.reserved
-                   ? (new.reservation_replicas < disk_res.nr_replicas)
-                   : (new.sectors < PAGE_SECTORS ||
-                      new.nr_replicas < disk_res.nr_replicas ||
-                      new.compressed)) {
-                       int sectors = (disk_res.nr_replicas * PAGE_SECTORS -
-                                      page_res_sectors(new) -
-                                      disk_res.sectors);
-
-                       if (sectors > 0) {
-                               ret = bch2_disk_reservation_add(c, &disk_res, sectors,
-                                               !check_enospc
-                                               ? BCH_DISK_RESERVATION_NOFAIL : 0);
-                               if (unlikely(ret))
-                                       goto err;
-                       }
+       if (s->replicas_reserved < nr_replicas) {
+               ret = bch2_disk_reservation_get(c, &disk_res, PAGE_SECTORS,
+                               nr_replicas - s->replicas_reserved,
+                               !check_enospc ? BCH_DISK_RESERVATION_NOFAIL : 0);
+               if (unlikely(ret))
+                       return ret;
 
-                       new.reserved = 1;
-                       new.reservation_replicas = disk_res.nr_replicas;
-               }
+               page_state_cmpxchg(s, new, ({
+                       BUG_ON(new.replicas_reserved +
+                              disk_res.nr_replicas != nr_replicas);
+                       new.replicas_reserved += disk_res.nr_replicas;
+               }));
+       }
 
-               if (!new.quota_reserved &&
-                   new.sectors + new.dirty_sectors < PAGE_SECTORS) {
-                       ret = bch2_quota_reservation_add(c, inode, &quota_res,
-                                               PAGE_SECTORS - quota_res.sectors,
-                                               check_enospc);
-                       if (unlikely(ret))
-                               goto err;
+       if (!s->quota_reserved &&
+           s->sectors + s->dirty_sectors < PAGE_SECTORS) {
+               ret = bch2_quota_reservation_add(c, inode, &quota_res,
+                                                PAGE_SECTORS,
+                                                check_enospc);
+               if (unlikely(ret))
+                       return ret;
 
+               page_state_cmpxchg(s, new, ({
+                       BUG_ON(new.quota_reserved);
                        new.quota_reserved = 1;
-               }
-       });
+               }));
+       }
 
-       quota_res.sectors -= (new.quota_reserved - old.quota_reserved) * PAGE_SECTORS;
-       disk_res.sectors -= page_res_sectors(new) - page_res_sectors(old);
-err:
-       bch2_quota_reservation_put(c, inode, &quota_res);
-       bch2_disk_reservation_put(c, &disk_res);
        return ret;
 }
 
@@ -600,6 +586,8 @@ static void bch2_clear_page_bits(struct page *page)
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_page_state s;
 
+       EBUG_ON(!PageLocked(page));
+
        if (!PagePrivate(page))
                return;
 
@@ -710,6 +698,9 @@ int bch2_migrate_page(struct address_space *mapping, struct page *newpage,
 {
        int ret;
 
+       EBUG_ON(!PageLocked(page));
+       EBUG_ON(!PageLocked(newpage));
+
        ret = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
        if (ret != MIGRATEPAGE_SUCCESS)
                return ret;
@@ -856,10 +847,13 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
 {
        struct bvec_iter iter;
        struct bio_vec bv;
-       bool compressed = bch2_extent_is_compressed(k);
-       unsigned nr_ptrs = bch2_extent_nr_dirty_ptrs(k);
+       unsigned nr_ptrs = !bch2_extent_is_compressed(k)
+               ? bch2_extent_nr_dirty_ptrs(k)
+               : 0;
 
        bio_for_each_segment(bv, bio, iter) {
+               /* brand new pages, don't need to be locked: */
+
                struct bch_page_state *s = page_state(bv.bv_page);
 
                /* sectors in @k from the start of this page: */
@@ -867,14 +861,11 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
 
                unsigned page_sectors = min(bv.bv_len >> 9, k_sectors);
 
-               s->nr_replicas = !s->sectors
-                       ? nr_ptrs
-                       : min_t(unsigned, s->nr_replicas, nr_ptrs);
+               s->nr_replicas = page_sectors == PAGE_SECTORS
+                       ? nr_ptrs : 0;
 
                BUG_ON(s->sectors + page_sectors > PAGE_SECTORS);
                s->sectors += page_sectors;
-
-               s->compressed |= compressed;
        }
 }
 
@@ -1214,7 +1205,7 @@ static int __bch2_writepage(struct page *page,
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct bch_writepage_state *w = data;
        struct bch_page_state new, old;
-       unsigned offset;
+       unsigned offset, nr_replicas_this_write;
        loff_t i_size = i_size_read(&inode->v);
        pgoff_t end_index = i_size >> PAGE_SHIFT;
 
@@ -1240,19 +1231,31 @@ static int __bch2_writepage(struct page *page,
         */
        zero_user_segment(page, offset, PAGE_SIZE);
 do_io:
+       EBUG_ON(!PageLocked(page));
+
        /* Before unlocking the page, transfer reservation to w->io: */
        old = page_state_cmpxchg(page_state(page), new, {
-               EBUG_ON(!new.reserved &&
-                       (new.sectors != PAGE_SECTORS ||
-                       new.compressed));
+               /*
+                * If we didn't get a reservation, we can only write out the
+                * number of (fully allocated) replicas that currently exist,
+                * and only if the entire page has been written:
+                */
+               nr_replicas_this_write =
+                       max_t(unsigned,
+                             new.replicas_reserved,
+                             (new.sectors == PAGE_SECTORS
+                              ? new.nr_replicas : 0));
+
+               BUG_ON(!nr_replicas_this_write);
 
-               if (new.reserved)
-                       new.nr_replicas = new.reservation_replicas;
-               new.reserved = 0;
+               new.nr_replicas = w->opts.compression
+                       ? 0
+                       : nr_replicas_this_write;
 
-               new.compressed |= w->opts.compression != 0;
+               new.replicas_reserved = 0;
 
                new.sectors += new.dirty_sectors;
+               BUG_ON(new.sectors != PAGE_SECTORS);
                new.dirty_sectors = 0;
        });
 
@@ -1261,21 +1264,20 @@ do_io:
        unlock_page(page);
 
        if (w->io &&
-           (w->io->op.op.res.nr_replicas != new.nr_replicas ||
+           (w->io->op.op.res.nr_replicas != nr_replicas_this_write ||
             !bio_can_add_page_contig(&w->io->op.op.wbio.bio, page)))
                bch2_writepage_do_io(w);
 
        if (!w->io)
-               bch2_writepage_io_alloc(c, w, inode, page, new.nr_replicas);
+               bch2_writepage_io_alloc(c, w, inode, page,
+                                       nr_replicas_this_write);
 
        w->io->new_sectors += new.sectors - old.sectors;
 
        BUG_ON(inode != w->io->op.inode);
        BUG_ON(bio_add_page_contig(&w->io->op.op.wbio.bio, page));
 
-       if (old.reserved)
-               w->io->op.op.res.sectors += old.reservation_replicas * PAGE_SECTORS;
-
+       w->io->op.op.res.sectors += old.replicas_reserved * PAGE_SECTORS;
        w->io->op.new_i_size = i_size;
 
        if (wbc->sync_mode == WB_SYNC_ALL)
@@ -2547,10 +2549,9 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode,
                                &disk_res, &quota_res,
                                iter, &reservation.k_i,
                                0, true, true, NULL);
-
+btree_iter_err:
                bch2_quota_reservation_put(c, inode, &quota_res);
                bch2_disk_reservation_put(c, &disk_res);
-btree_iter_err:
                if (ret == -EINTR)
                        ret = 0;
                if (ret)
@@ -2612,6 +2613,8 @@ long bch2_fallocate_dispatch(struct file *file, int mode,
 
 static bool page_is_data(struct page *page)
 {
+       EBUG_ON(!PageLocked(page));
+
        /* XXX: should only have to check PageDirty */
        return PagePrivate(page) &&
                (page_state(page)->sectors ||
index 34cab253c86d49a4519e15d1e33331cb32689420..12d77ec6de232573267e87c06307d1a058856ee7 100644 (file)
@@ -15,6 +15,7 @@
 #include "clock.h"
 #include "debug.h"
 #include "disk_groups.h"
+#include "ec.h"
 #include "error.h"
 #include "extents.h"
 #include "io.h"
@@ -302,6 +303,7 @@ static void __bch2_write_index(struct bch_write_op *op)
        struct bkey_s_extent e;
        struct bch_extent_ptr *ptr;
        struct bkey_i *src, *dst = keys->keys, *n, *k;
+       unsigned dev;
        int ret;
 
        for (src = keys->keys; src != keys->top; src = n) {
@@ -345,6 +347,10 @@ static void __bch2_write_index(struct bch_write_op *op)
                }
        }
 out:
+       /* If a bucket wasn't written, we can't erasure code it: */
+       for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX)
+               bch2_open_bucket_write_error(c, &op->open_buckets, dev);
+
        bch2_open_buckets_put(c, &op->open_buckets);
        return;
 err:
@@ -421,7 +427,8 @@ static void init_append_extent(struct bch_write_op *op,
 static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
                                        struct write_point *wp,
                                        struct bio *src,
-                                       bool *page_alloc_failed)
+                                       bool *page_alloc_failed,
+                                       void *buf)
 {
        struct bch_write_bio *wbio;
        struct bio *bio;
@@ -431,11 +438,18 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
 
        bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
        wbio                    = wbio_init(bio);
-       wbio->bounce            = true;
        wbio->put_bio           = true;
        /* copy WRITE_SYNC flag */
        wbio->bio.bi_opf        = src->bi_opf;
 
+       if (buf) {
+               bio->bi_iter.bi_size = output_available;
+               bch2_bio_map(bio, buf);
+               return bio;
+       }
+
+       wbio->bounce            = true;
+
        /*
         * We can't use mempool for more than c->sb.encoded_extent_max
         * worth of pages, but we'd like to allocate more if we can:
@@ -600,14 +614,18 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
        struct bio *src = &op->wbio.bio, *dst = src;
        struct bvec_iter saved_iter;
        struct bkey_i *key_to_write;
+       void *ec_buf;
        unsigned key_to_write_offset = op->insert_keys.top_p -
                op->insert_keys.keys_p;
-       unsigned total_output = 0;
-       bool bounce = false, page_alloc_failed = false;
+       unsigned total_output = 0, total_input = 0;
+       bool bounce = false;
+       bool page_alloc_failed = false;
        int ret, more = 0;
 
        BUG_ON(!bio_sectors(src));
 
+       ec_buf = bch2_writepoint_ec_buf(c, wp);
+
        switch (bch2_write_prep_encoded_data(op, wp)) {
        case PREP_ENCODED_OK:
                break;
@@ -617,16 +635,26 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
        case PREP_ENCODED_CHECKSUM_ERR:
                goto csum_err;
        case PREP_ENCODED_DO_WRITE:
+               if (ec_buf) {
+                       dst = bch2_write_bio_alloc(c, wp, src,
+                                                  &page_alloc_failed,
+                                                  ec_buf);
+                       bio_copy_data(dst, src);
+                       bounce = true;
+               }
                init_append_extent(op, wp, op->version, op->crc);
                goto do_write;
        }
 
-       if (op->compression_type ||
+       if (ec_buf ||
+           op->compression_type ||
            (op->csum_type &&
             !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
            (bch2_csum_type_is_encryption(op->csum_type) &&
             !(op->flags & BCH_WRITE_PAGES_OWNED))) {
-               dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed);
+               dst = bch2_write_bio_alloc(c, wp, src,
+                                          &page_alloc_failed,
+                                          ec_buf);
                bounce = true;
        }
 
@@ -729,7 +757,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
                if (dst != src)
                        bio_advance(dst, dst_len);
                bio_advance(src, src_len);
-               total_output += dst_len;
+               total_output    += dst_len;
+               total_input     += src_len;
        } while (dst->bi_iter.bi_size &&
                 src->bi_iter.bi_size &&
                 wp->sectors_free &&
@@ -742,16 +771,20 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
 
        dst->bi_iter = saved_iter;
 
-       if (!bounce && more) {
-               dst = bio_split(src, total_output >> 9,
+       if (dst == src && more) {
+               BUG_ON(total_output != total_input);
+
+               dst = bio_split(src, total_input >> 9,
                                GFP_NOIO, &c->bio_write);
-               wbio_init(dst)->put_bio = true;
+               wbio_init(dst)->put_bio = true;
+               /* copy WRITE_SYNC flag */
+               dst->bi_opf             = src->bi_opf;
        }
 
        dst->bi_iter.bi_size = total_output;
 
        /* Free unneeded pages after compressing: */
-       if (bounce)
+       if (to_wbio(dst)->bounce)
                while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE))
                        mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page,
                                     &c->bio_bounce_pages);
@@ -760,6 +793,10 @@ do_write:
 
        key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
 
+       bch2_ec_add_backpointer(c, wp,
+                               bkey_start_pos(&key_to_write->k),
+                               total_input >> 9);
+
        dst->bi_end_io  = bch2_write_endio;
        dst->bi_private = &op->cl;
        bio_set_op_attrs(dst, REQ_OP_WRITE, 0);
@@ -774,10 +811,10 @@ csum_err:
                "rewriting existing data (memory corruption?)");
        ret = -EIO;
 err:
-       if (bounce) {
+       if (to_wbio(dst)->bounce)
                bch2_bio_free_pages_pool(c, dst);
+       if (to_wbio(dst)->put_bio)
                bio_put(dst);
-       }
 
        return ret;
 }
@@ -789,6 +826,8 @@ static void __bch2_write(struct closure *cl)
        struct write_point *wp;
        int ret;
 again:
+       memset(&op->failed, 0, sizeof(op->failed));
+
        do {
                /* +1 for possible cache device: */
                if (op->open_buckets.nr + op->nr_replicas + 1 >
@@ -803,6 +842,7 @@ again:
 
                wp = bch2_alloc_sectors_start(c,
                        op->target,
+                       op->opts.erasure_code,
                        op->write_point,
                        &op->devs_have,
                        op->nr_replicas,
@@ -882,8 +922,6 @@ void bch2_write(struct closure *cl)
 
        op->start_time = local_clock();
 
-       memset(&op->failed, 0, sizeof(op->failed));
-
        bch2_keylist_init(&op->insert_keys, op->inline_keys);
        wbio_init(&op->wbio.bio)->put_bio = false;
 
@@ -1557,8 +1595,10 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
        if (!pick_ret)
                goto hole;
 
-       if (pick_ret < 0)
-               goto no_device;
+       if (pick_ret < 0) {
+               __bcache_io_error(c, "no device to read from");
+               goto err;
+       }
 
        if (pick_ret > 0)
                ca = bch_dev_bkey_exists(c, pick.ptr.dev);
@@ -1683,31 +1723,46 @@ noclone:
 
        bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
 
-       if (!rbio->have_ioref)
-               goto no_device_postclone;
-
        percpu_down_read_preempt_disable(&c->usage_lock);
        bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
        percpu_up_read_preempt_enable(&c->usage_lock);
 
-       this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
-                    bio_sectors(&rbio->bio));
+       if (likely(!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT)))) {
+               bio_inc_remaining(&orig->bio);
+               trace_read_split(&orig->bio);
+       }
+
+       if (!rbio->pick.idx) {
+               if (!rbio->have_ioref) {
+                       __bcache_io_error(c, "no device to read from");
+                       bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+                       goto out;
+               }
 
-       bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
+               this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
+                            bio_sectors(&rbio->bio));
+               bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
 
-       if (likely(!(flags & BCH_READ_IN_RETRY))) {
-               if (!(flags & BCH_READ_LAST_FRAGMENT)) {
-                       bio_inc_remaining(&orig->bio);
-                       trace_read_split(&orig->bio);
+               if (likely(!(flags & BCH_READ_IN_RETRY)))
+                       submit_bio(&rbio->bio);
+               else
+                       submit_bio_wait(&rbio->bio);
+       } else {
+               /* Attempting reconstruct read: */
+               if (bch2_ec_read_extent(c, rbio)) {
+                       bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
+                       goto out;
                }
 
-               submit_bio(&rbio->bio);
+               if (likely(!(flags & BCH_READ_IN_RETRY)))
+                       bio_endio(&rbio->bio);
+       }
+out:
+       if (likely(!(flags & BCH_READ_IN_RETRY))) {
                return 0;
        } else {
                int ret;
 
-               submit_bio_wait(&rbio->bio);
-
                rbio->context = RBIO_CONTEXT_UNBOUND;
                bch2_read_endio(&rbio->bio);
 
@@ -1722,22 +1777,12 @@ noclone:
                return ret;
        }
 
-no_device_postclone:
-       if (!rbio->split)
-               rbio->bio.bi_end_io = rbio->end_io;
-       bch2_rbio_free(rbio);
-no_device:
-       __bcache_io_error(c, "no device to read from");
-
-       if (likely(!(flags & BCH_READ_IN_RETRY))) {
-               orig->bio.bi_status = BLK_STS_IOERR;
-
-               if (flags & BCH_READ_LAST_FRAGMENT)
-                       bch2_rbio_done(orig);
-               return 0;
-       } else {
+err:
+       if (flags & BCH_READ_IN_RETRY)
                return READ_ERR;
-       }
+
+       orig->bio.bi_status = BLK_STS_IOERR;
+       goto out_read_done;
 
 hole:
        /*
@@ -1749,7 +1794,7 @@ hole:
                orig->hole = true;
 
        zero_fill_bio_iter(&orig->bio, iter);
-
+out_read_done:
        if (flags & BCH_READ_LAST_FRAGMENT)
                bch2_rbio_done(orig);
        return 0;
index 26c7ae7d7563650bbd4d28b65757856a160c0f86..ac1219fc6b66b78f75a89da867c157a249a67fe8 100644 (file)
@@ -134,6 +134,8 @@ static enum {
                c->opts.block_size;
        BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
 
+       bkey_extent_init(&buf->key);
+
        /*
         * We have to set last_seq here, _before_ opening a new journal entry:
         *
@@ -334,15 +336,14 @@ u64 bch2_inode_journal_seq(struct journal *j, u64 inode)
 }
 
 static int __journal_res_get(struct journal *j, struct journal_res *res,
-                             unsigned u64s_min, unsigned u64s_max)
+                            unsigned flags)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_buf *buf;
        int ret;
 retry:
-       ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
-       if (ret)
-               return ret;
+       if (journal_res_get_fast(j, res))
+               return 0;
 
        spin_lock(&j->lock);
        /*
@@ -350,10 +351,9 @@ retry:
         * that just did journal_entry_open() and call journal_entry_close()
         * unnecessarily
         */
-       ret = journal_res_get_fast(j, res, u64s_min, u64s_max);
-       if (ret) {
+       if (journal_res_get_fast(j, res)) {
                spin_unlock(&j->lock);
-               return 1;
+               return 0;
        }
 
        /*
@@ -376,7 +376,12 @@ retry:
                spin_unlock(&j->lock);
                return -EROFS;
        case JOURNAL_ENTRY_INUSE:
-               /* haven't finished writing out the previous one: */
+               /*
+                * Haven't finished writing out the previous entry, so we can't
+                * start another yet; signal to the caller which sequence
+                * number we're trying to open:
+                */
+               res->seq = journal_cur_seq(j) + 1;
                spin_unlock(&j->lock);
                trace_journal_entry_full(c);
                goto blocked;
@@ -388,6 +393,8 @@ retry:
 
        /* We now have a new, closed journal buf - see if we can open it: */
        ret = journal_entry_open(j);
+       if (!ret)
+               res->seq = journal_cur_seq(j);
        spin_unlock(&j->lock);
 
        if (ret < 0)
@@ -407,7 +414,7 @@ retry:
 blocked:
        if (!j->res_get_blocked_start)
                j->res_get_blocked_start = local_clock() ?: 1;
-       return 0;
+       return -EAGAIN;
 }
 
 /*
@@ -421,14 +428,14 @@ blocked:
  * btree node write locks.
  */
 int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
-                                unsigned u64s_min, unsigned u64s_max)
+                                 unsigned flags)
 {
        int ret;
 
        wait_event(j->wait,
-                  (ret = __journal_res_get(j, res, u64s_min,
-                                           u64s_max)));
-       return ret < 0 ? ret : 0;
+                  (ret = __journal_res_get(j, res, flags)) != -EAGAIN ||
+                  (flags & JOURNAL_RES_GET_NONBLOCK));
+       return ret;
 }
 
 u64 bch2_journal_last_unwritten_seq(struct journal *j)
@@ -452,28 +459,55 @@ u64 bch2_journal_last_unwritten_seq(struct journal *j)
  * btree root - every journal entry contains the roots of all the btrees, so it
  * doesn't need to bother with getting a journal reservation
  */
-int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *parent)
+int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl)
 {
-       int ret;
-
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       bool need_reclaim = false;
+retry:
        spin_lock(&j->lock);
-       BUG_ON(seq > journal_cur_seq(j));
 
        if (seq < journal_cur_seq(j) ||
            journal_entry_is_open(j)) {
                spin_unlock(&j->lock);
-               return 1;
+               return 0;
+       }
+
+       if (journal_cur_seq(j) < seq) {
+               switch (journal_buf_switch(j, false)) {
+               case JOURNAL_ENTRY_ERROR:
+                       spin_unlock(&j->lock);
+                       return -EROFS;
+               case JOURNAL_ENTRY_INUSE:
+                       /* haven't finished writing out the previous one: */
+                       trace_journal_entry_full(c);
+                       goto blocked;
+               case JOURNAL_ENTRY_CLOSED:
+                       break;
+               case JOURNAL_UNLOCKED:
+                       goto retry;
+               }
+       }
+
+       BUG_ON(journal_cur_seq(j) < seq);
+
+       if (!journal_entry_open(j)) {
+               need_reclaim = true;
+               goto blocked;
        }
 
-       ret = journal_entry_open(j);
-       if (!ret)
-               closure_wait(&j->async_wait, parent);
        spin_unlock(&j->lock);
 
-       if (!ret)
-               bch2_journal_reclaim_work(&j->reclaim_work.work);
+       return 0;
+blocked:
+       if (!j->res_get_blocked_start)
+               j->res_get_blocked_start = local_clock() ?: 1;
 
-       return ret;
+       closure_wait(&j->async_wait, cl);
+       spin_unlock(&j->lock);
+
+       if (need_reclaim)
+               bch2_journal_reclaim_work(&j->reclaim_work.work);
+       return -EAGAIN;
 }
 
 static int journal_seq_error(struct journal *j, u64 seq)
@@ -593,11 +627,10 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
 void bch2_journal_meta_async(struct journal *j, struct closure *parent)
 {
        struct journal_res res;
-       unsigned u64s = jset_u64s(0);
 
        memset(&res, 0, sizeof(res));
 
-       bch2_journal_res_get(j, &res, u64s, u64s);
+       bch2_journal_res_get(j, &res, jset_u64s(0), 0);
        bch2_journal_res_put(j, &res);
 
        bch2_journal_flush_seq_async(j, res.seq, parent);
@@ -606,12 +639,11 @@ void bch2_journal_meta_async(struct journal *j, struct closure *parent)
 int bch2_journal_meta(struct journal *j)
 {
        struct journal_res res;
-       unsigned u64s = jset_u64s(0);
        int ret;
 
        memset(&res, 0, sizeof(res));
 
-       ret = bch2_journal_res_get(j, &res, u64s, u64s);
+       ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0);
        if (ret)
                return ret;
 
@@ -751,9 +783,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
                                ca->mi.bucket_size,
                                gc_phase(GC_PHASE_SB),
-                               new_fs
-                               ? BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE
-                               : 0);
+                               0);
 
                if (c) {
                        spin_unlock(&c->journal.lock);
@@ -861,10 +891,6 @@ static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
 
 void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
 {
-       spin_lock(&j->lock);
-       bch2_extent_drop_device(bkey_i_to_s_extent(&j->key), ca->dev_idx);
-       spin_unlock(&j->lock);
-
        wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx));
 }
 
@@ -1000,8 +1026,6 @@ int bch2_fs_journal_init(struct journal *j)
        j->write_delay_ms       = 1000;
        j->reclaim_delay_ms     = 100;
 
-       bkey_extent_init(&j->key);
-
        atomic64_set(&j->reservations.counter,
                ((union journal_res_state)
                 { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
index 5870392e383eda54508e0c2ad6a03443140c4190..0595597f08e918a4ded73810eea35dbc29df920b 100644 (file)
@@ -269,12 +269,10 @@ static inline void bch2_journal_res_put(struct journal *j,
 }
 
 int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
-                                unsigned, unsigned);
+                                 unsigned);
 
 static inline int journal_res_get_fast(struct journal *j,
-                                      struct journal_res *res,
-                                      unsigned u64s_min,
-                                      unsigned u64s_max)
+                                      struct journal_res *res)
 {
        union journal_res_state old, new;
        u64 v = atomic64_read(&j->reservations.counter);
@@ -286,37 +284,37 @@ static inline int journal_res_get_fast(struct journal *j,
                 * Check if there is still room in the current journal
                 * entry:
                 */
-               if (old.cur_entry_offset + u64s_min > j->cur_entry_u64s)
+               if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
                        return 0;
 
-               res->offset     = old.cur_entry_offset;
-               res->u64s       = min(u64s_max, j->cur_entry_u64s -
-                                     old.cur_entry_offset);
-
-               journal_state_inc(&new);
                new.cur_entry_offset += res->u64s;
+               journal_state_inc(&new);
        } while ((v = atomic64_cmpxchg(&j->reservations.counter,
                                       old.v, new.v)) != old.v);
 
-       res->ref = true;
-       res->idx = new.idx;
-       res->seq = le64_to_cpu(j->buf[res->idx].data->seq);
+       res->ref        = true;
+       res->idx        = old.idx;
+       res->offset     = old.cur_entry_offset;
+       res->seq        = le64_to_cpu(j->buf[old.idx].data->seq);
        return 1;
 }
 
+#define JOURNAL_RES_GET_NONBLOCK       (1 << 0)
+
 static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res,
-                                     unsigned u64s_min, unsigned u64s_max)
+                                      unsigned u64s, unsigned flags)
 {
        int ret;
 
        EBUG_ON(res->ref);
-       EBUG_ON(u64s_max < u64s_min);
        EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags));
 
-       if (journal_res_get_fast(j, res, u64s_min, u64s_max))
+       res->u64s = u64s;
+
+       if (journal_res_get_fast(j, res))
                goto out;
 
-       ret = bch2_journal_res_get_slowpath(j, res, u64s_min, u64s_max);
+       ret = bch2_journal_res_get_slowpath(j, res, flags);
        if (ret)
                return ret;
 out:
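A brief caller-side sketch of the reworked reservation API (a hypothetical caller, not part of this commit): the caller now asks for an exact number of u64s and may pass JOURNAL_RES_GET_NONBLOCK to get -EAGAIN back instead of sleeping in the slowpath.

static int reserve_exact(struct journal *j, unsigned u64s)
{
        struct journal_res res;
        int ret;

        memset(&res, 0, sizeof(res));

        /* Try without blocking first, then fall back to the sleeping path: */
        ret = bch2_journal_res_get(j, &res, u64s, JOURNAL_RES_GET_NONBLOCK);
        if (ret == -EAGAIN)
                ret = bch2_journal_res_get(j, &res, u64s, 0);
        if (ret)
                return ret;

        /* ... emit up to u64s worth of keys at res.offset ... */

        bch2_journal_res_put(j, &res);
        return 0;
}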
index c83e8eb8e111b59348bd0f300f77dbc7276aeefa..3840764a642967901fa337e14d405cace5bb1982 100644 (file)
@@ -426,7 +426,7 @@ static int journal_read_buf_realloc(struct journal_read_buf *b,
 static int journal_read_bucket(struct bch_dev *ca,
                               struct journal_read_buf *buf,
                               struct journal_list *jlist,
-                              unsigned bucket, u64 *seq, bool *entries_found)
+                              unsigned bucket)
 {
        struct bch_fs *c = ca->fs;
        struct journal_device *ja = &ca->journal;
@@ -511,7 +511,6 @@ reread:
 
                switch (ret) {
                case JOURNAL_ENTRY_ADD_OK:
-                       *entries_found = true;
                        break;
                case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
                        break;
@@ -519,9 +518,6 @@ reread:
                        return ret;
                }
 
-               if (le64_to_cpu(j->seq) > *seq)
-                       *seq = le64_to_cpu(j->seq);
-
                sectors = vstruct_sectors(j, c->block_bits);
 next_block:
                pr_debug("next");
@@ -535,120 +531,51 @@ next_block:
 
 static void bch2_journal_read_device(struct closure *cl)
 {
-#define read_bucket(b)                                                 \
-       ({                                                              \
-               bool entries_found = false;                             \
-               ret = journal_read_bucket(ca, &buf, jlist, b, &seq,     \
-                                         &entries_found);              \
-               if (ret)                                                \
-                       goto err;                                       \
-               __set_bit(b, bitmap);                                   \
-               entries_found;                                          \
-        })
-
        struct journal_device *ja =
                container_of(cl, struct journal_device, read);
        struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
        struct journal_list *jlist =
                container_of(cl->parent, struct journal_list, cl);
-       struct request_queue *q = bdev_get_queue(ca->disk_sb.bdev);
        struct journal_read_buf buf = { NULL, 0 };
-
-       DECLARE_BITMAP(bitmap, ja->nr);
-       unsigned i, l, r;
-       u64 seq = 0;
+       u64 min_seq = U64_MAX;
+       unsigned i;
        int ret;
 
        if (!ja->nr)
                goto out;
 
-       bitmap_zero(bitmap, ja->nr);
        ret = journal_read_buf_realloc(&buf, PAGE_SIZE);
        if (ret)
                goto err;
 
        pr_debug("%u journal buckets", ja->nr);
 
-       /*
-        * If the device supports discard but not secure discard, we can't do
-        * the fancy fibonacci hash/binary search because the live journal
-        * entries might not form a contiguous range:
-        */
-       for (i = 0; i < ja->nr; i++)
-               read_bucket(i);
-       goto search_done;
-
-       if (!blk_queue_nonrot(q))
-               goto linear_scan;
-
-       /*
-        * Read journal buckets ordered by golden ratio hash to quickly
-        * find a sequence of buckets with valid journal entries
-        */
        for (i = 0; i < ja->nr; i++) {
-               l = (i * 2654435769U) % ja->nr;
-
-               if (test_bit(l, bitmap))
-                       break;
-
-               if (read_bucket(l))
-                       goto bsearch;
+               ret = journal_read_bucket(ca, &buf, jlist, i);
+               if (ret)
+                       goto err;
        }
 
-       /*
-        * If that fails, check all the buckets we haven't checked
-        * already
-        */
-       pr_debug("falling back to linear search");
-linear_scan:
-       for (l = find_first_zero_bit(bitmap, ja->nr);
-            l < ja->nr;
-            l = find_next_zero_bit(bitmap, ja->nr, l + 1))
-               if (read_bucket(l))
-                       goto bsearch;
-
-       /* no journal entries on this device? */
-       if (l == ja->nr)
-               goto out;
-bsearch:
-       /* Binary search */
-       r = find_next_bit(bitmap, ja->nr, l + 1);
-       pr_debug("starting binary search, l %u r %u", l, r);
-
-       while (l + 1 < r) {
-               unsigned m = (l + r) >> 1;
-               u64 cur_seq = seq;
-
-               read_bucket(m);
+       /* Find the journal bucket with the highest sequence number: */
+       for (i = 0; i < ja->nr; i++) {
+               if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx])
+                       ja->cur_idx = i;
 
-               if (cur_seq != seq)
-                       l = m;
-               else
-                       r = m;
+               min_seq = min(ja->bucket_seq[i], min_seq);
        }
 
-search_done:
        /*
-        * Find the journal bucket with the highest sequence number:
-        *
          * If there are duplicate journal entries in multiple buckets (which
         * definitely isn't supposed to happen, but...) - make sure to start
         * cur_idx at the last of those buckets, so we don't deadlock trying to
         * allocate
         */
-       seq = 0;
-
-       for (i = 0; i < ja->nr; i++)
-               if (ja->bucket_seq[i] >= seq &&
-                   ja->bucket_seq[i] != ja->bucket_seq[(i + 1) % ja->nr]) {
-                       /*
-                        * When journal_next_bucket() goes to allocate for
-                        * the first time, it'll use the bucket after
-                        * ja->cur_idx
-                        */
-                       ja->cur_idx = i;
-                       seq = ja->bucket_seq[i];
-               }
+       while (ja->bucket_seq[ja->cur_idx] > min_seq &&
+              ja->bucket_seq[ja->cur_idx] >
+              ja->bucket_seq[(ja->cur_idx + 1) % ja->nr])
+               ja->cur_idx++;
+
+       ja->sectors_free = 0;
 
        /*
         * Set last_idx to indicate the entire journal is full and needs to be
@@ -656,17 +583,6 @@ search_done:
         * pinned when it first runs:
         */
        ja->last_idx = (ja->cur_idx + 1) % ja->nr;
-
-       /*
-        * Read buckets in reverse order until we stop finding more journal
-        * entries:
-        */
-       for (i = (ja->cur_idx + ja->nr - 1) % ja->nr;
-            i != ja->cur_idx;
-            i = (i + ja->nr - 1) % ja->nr)
-               if (!test_bit(i, bitmap) &&
-                   !read_bucket(i))
-                       break;
 out:
        kvpfree(buf.data, buf.size);
        percpu_ref_put(&ca->io_ref);
@@ -677,7 +593,6 @@ err:
        jlist->ret = ret;
        mutex_unlock(&jlist->lock);
        goto out;
-#undef read_bucket
 }
 
 void bch2_journal_entries_free(struct list_head *list)
@@ -865,7 +780,6 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
        int ret = 0;
 
        list_for_each_entry_safe(i, n, list, list) {
-
                j->replay_journal_seq = le64_to_cpu(i->j.seq);
 
                for_each_jset_key(k, _n, entry, &i->j) {
@@ -875,7 +789,7 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
                                 * allocation code handles replay for
                                 * BTREE_ID_ALLOC keys:
                                 */
-                               ret = bch2_alloc_replay_key(c, k->k.p);
+                               ret = bch2_alloc_replay_key(c, k);
                        } else {
                                /*
                                 * We might cause compressed extents to be
@@ -886,9 +800,9 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
                                        bch2_disk_reservation_init(c, 0);
 
                                ret = bch2_btree_insert(c, entry->btree_id, k,
-                                                       &disk_res, NULL,
-                                                       BTREE_INSERT_NOFAIL|
-                                                       BTREE_INSERT_JOURNAL_REPLAY);
+                                               &disk_res, NULL,
+                                               BTREE_INSERT_NOFAIL|
+                                               BTREE_INSERT_JOURNAL_REPLAY);
                        }
 
                        if (ret) {
@@ -932,32 +846,18 @@ static void bch2_journal_add_btree_root(struct journal_buf *buf,
 }
 
 static unsigned journal_dev_buckets_available(struct journal *j,
-                                             struct bch_dev *ca)
+                                             struct journal_device *ja)
 {
-       struct journal_device *ja = &ca->journal;
        unsigned next = (ja->cur_idx + 1) % ja->nr;
        unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
 
-       /*
-        * Hack to avoid a deadlock during journal replay:
-        * journal replay might require setting a new btree
-        * root, which requires writing another journal entry -
-        * thus, if the journal is full (and this happens when
-        * replaying the first journal bucket's entries) we're
-        * screwed.
-        *
-        * So don't let the journal fill up unless we're in
-        * replay:
-        */
-       if (test_bit(JOURNAL_REPLAY_DONE, &j->flags))
-               available = max((int) available - 2, 0);
-
        /*
         * Don't use the last bucket unless writing the new last_seq
         * will make another bucket available:
         */
-       if (ja->bucket_seq[ja->last_idx] >= journal_last_seq(j))
-               available = max((int) available - 1, 0);
+       if (available &&
+           journal_last_seq(j) <= ja->bucket_seq[ja->last_idx])
+               --available;
 
        return available;
 }
@@ -967,7 +867,6 @@ int bch2_journal_entry_sectors(struct journal *j)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
-       struct bkey_s_extent e = bkey_i_to_s_extent(&j->key);
        unsigned sectors_available = UINT_MAX;
        unsigned i, nr_online = 0, nr_devs = 0;
 
@@ -977,38 +876,39 @@ int bch2_journal_entry_sectors(struct journal *j)
        for_each_member_device_rcu(ca, c, i,
                                   &c->rw_devs[BCH_DATA_JOURNAL]) {
                struct journal_device *ja = &ca->journal;
-               unsigned buckets_required = 0;
+               unsigned buckets_this_device, sectors_this_device;
 
                if (!ja->nr)
                        continue;
 
-               sectors_available = min_t(unsigned, sectors_available,
-                                         ca->mi.bucket_size);
+               buckets_this_device = journal_dev_buckets_available(j, ja);
+               sectors_this_device = ja->sectors_free;
+
+               nr_online++;
 
                /*
-                * Note that we don't allocate the space for a journal entry
-                * until we write it out - thus, if we haven't started the write
-                * for the previous entry we have to make sure we have space for
-                * it too:
+                * Note that we don't allocate the space for a journal entry
+                * until we write it out - thus, account for it here:
                 */
-               if (bch2_extent_has_device(e.c, ca->dev_idx)) {
-                       if (j->prev_buf_sectors > ja->sectors_free)
-                               buckets_required++;
-
-                       if (j->prev_buf_sectors + sectors_available >
-                           ja->sectors_free)
-                               buckets_required++;
-               } else {
-                       if (j->prev_buf_sectors + sectors_available >
-                           ca->mi.bucket_size)
-                               buckets_required++;
-
-                       buckets_required++;
+               if (j->prev_buf_sectors >= sectors_this_device) {
+                       if (!buckets_this_device)
+                               continue;
+
+                       buckets_this_device--;
+                       sectors_this_device = ca->mi.bucket_size;
                }
 
-               if (journal_dev_buckets_available(j, ca) >= buckets_required)
-                       nr_devs++;
-               nr_online++;
+               sectors_this_device -= j->prev_buf_sectors;
+
+               if (buckets_this_device)
+                       sectors_this_device = ca->mi.bucket_size;
+
+               if (!sectors_this_device)
+                       continue;
+
+               sectors_available = min(sectors_available,
+                                       sectors_this_device);
+               nr_devs++;
        }
        rcu_read_unlock();
 
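The per-device arithmetic above can be read as: space for the previous, still-unwritten entry is carved out of the device's current bucket first (possibly consuming one of its free buckets), and if a whole bucket is still free the next entry may take a full bucket. A standalone model of that calculation (function and parameter names are illustrative):

static unsigned dev_sectors_available(unsigned buckets_free,
                                      unsigned sectors_free,
                                      unsigned bucket_size,
                                      unsigned prev_buf_sectors)
{
        /* The previous entry hasn't been written yet; account for it: */
        if (prev_buf_sectors >= sectors_free) {
                if (!buckets_free)
                        return 0;
                buckets_free--;
                sectors_free = bucket_size;
        }

        sectors_free -= prev_buf_sectors;

        /* With a spare bucket left, the next entry can fill a whole bucket: */
        if (buckets_free)
                sectors_free = bucket_size;

        return sectors_free;
}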
@@ -1021,107 +921,111 @@ int bch2_journal_entry_sectors(struct journal *j)
        return sectors_available;
 }
 
-/**
- * journal_next_bucket - move on to the next journal bucket if possible
- */
-static int journal_write_alloc(struct journal *j, struct journal_buf *w,
-                              unsigned sectors)
+static void __journal_write_alloc(struct journal *j,
+                                 struct journal_buf *w,
+                                 struct dev_alloc_list *devs_sorted,
+                                 unsigned sectors,
+                                 unsigned *replicas,
+                                 unsigned replicas_want)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct bkey_s_extent e;
-       struct bch_extent_ptr *ptr;
+       struct bkey_i_extent *e = bkey_i_to_extent(&w->key);
        struct journal_device *ja;
        struct bch_dev *ca;
-       struct dev_alloc_list devs_sorted;
-       unsigned i, replicas, replicas_want =
-               READ_ONCE(c->opts.metadata_replicas);
-
-       spin_lock(&j->lock);
-       e = bkey_i_to_s_extent(&j->key);
-
-       /*
-        * Drop any pointers to devices that have been removed, are no longer
-        * empty, or filled up their current journal bucket:
-        *
-        * Note that a device may have had a small amount of free space (perhaps
-        * one sector) that wasn't enough for the smallest possible journal
-        * entry - that's why we drop pointers to devices <= current free space,
-        * i.e. whichever device was limiting the current journal entry size.
-        */
-       bch2_extent_drop_ptrs(e, ptr, ({
-               ca = bch_dev_bkey_exists(c, ptr->dev);
-
-               ca->mi.state != BCH_MEMBER_STATE_RW ||
-               ca->journal.sectors_free <= sectors;
-       }));
-
-       extent_for_each_ptr(e, ptr) {
-               ca = bch_dev_bkey_exists(c, ptr->dev);
+       unsigned i;
 
-               BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW ||
-                      ca->journal.sectors_free <= sectors);
-               ca->journal.sectors_free -= sectors;
-       }
-
-       replicas = bch2_extent_nr_ptrs(e.c);
-
-       rcu_read_lock();
-       devs_sorted = bch2_wp_alloc_list(c, &j->wp,
-                                        &c->rw_devs[BCH_DATA_JOURNAL]);
+       if (*replicas >= replicas_want)
+               return;
 
-       for (i = 0; i < devs_sorted.nr; i++) {
-               ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
+       for (i = 0; i < devs_sorted->nr; i++) {
+               ca = rcu_dereference(c->devs[devs_sorted->devs[i]]);
                if (!ca)
                        continue;
 
-               if (!ca->mi.durability)
-                       continue;
-
                ja = &ca->journal;
-               if (!ja->nr)
-                       continue;
-
-               if (replicas >= replicas_want)
-                       break;
 
                /*
                 * Check that we can use this device, and aren't already using
                 * it:
                 */
-               if (bch2_extent_has_device(e.c, ca->dev_idx) ||
-                   !journal_dev_buckets_available(j, ca) ||
-                   sectors > ca->mi.bucket_size)
+               if (!ca->mi.durability ||
+                   ca->mi.state != BCH_MEMBER_STATE_RW ||
+                   !ja->nr ||
+                   bch2_extent_has_device(extent_i_to_s_c(e), ca->dev_idx) ||
+                   sectors > ja->sectors_free)
                        continue;
 
-               j->wp.next_alloc[ca->dev_idx] += U32_MAX;
-               bch2_wp_rescale(c, ca, &j->wp);
-
-               ja->sectors_free = ca->mi.bucket_size - sectors;
-               ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
-               ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
+               bch2_dev_stripe_increment(c, ca, &j->wp.stripe);
 
-               extent_ptr_append(bkey_i_to_extent(&j->key),
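+               /*
+                * The new pointer starts at the first unused sector of the
+                * current journal bucket:
+                */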
+               extent_ptr_append(e,
                        (struct bch_extent_ptr) {
                                  .offset = bucket_to_sector(ca,
-                                       ja->buckets[ja->cur_idx]),
+                                       ja->buckets[ja->cur_idx]) +
+                                       ca->mi.bucket_size -
+                                       ja->sectors_free,
                                  .dev = ca->dev_idx,
                });
 
-               replicas += ca->mi.durability;
+               ja->sectors_free -= sectors;
+               ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
+
+               *replicas += ca->mi.durability;
+
+               if (*replicas >= replicas_want)
+                       break;
        }
-       rcu_read_unlock();
+}
 
-       j->prev_buf_sectors = 0;
+/**
+ * journal_write_alloc - decide where to write the next journal entry
+ */
+static int journal_write_alloc(struct journal *j, struct journal_buf *w,
+                              unsigned sectors)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct journal_device *ja;
+       struct bch_dev *ca;
+       struct dev_alloc_list devs_sorted;
+       unsigned i, replicas = 0, replicas_want =
+               READ_ONCE(c->opts.metadata_replicas);
 
-       bkey_copy(&w->key, &j->key);
-       spin_unlock(&j->lock);
+       rcu_read_lock();
 
-       if (replicas < c->opts.metadata_replicas_required)
-               return -EROFS;
+       devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
+                                         &c->rw_devs[BCH_DATA_JOURNAL]);
 
-       BUG_ON(!replicas);
+       spin_lock(&j->lock);
+       __journal_write_alloc(j, w, &devs_sorted,
+                             sectors, &replicas, replicas_want);
 
-       return 0;
+       if (replicas >= replicas_want)
+               goto done;
+
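+       /*
+        * Not enough replicas yet: advance to the next (empty) journal bucket
+        * on devices where this write fits in a fresh bucket, then retry the
+        * allocation:
+        */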
+       for (i = 0; i < devs_sorted.nr; i++) {
+               ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
+               if (!ca)
+                       continue;
+
+               ja = &ca->journal;
+
+               if (sectors > ja->sectors_free &&
+                   sectors <= ca->mi.bucket_size &&
+                   journal_dev_buckets_available(j, ja)) {
+                       ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
+                       ja->sectors_free = ca->mi.bucket_size;
+               }
+       }
+
+       __journal_write_alloc(j, w, &devs_sorted,
+                             sectors, &replicas, replicas_want);
+done:
+       if (replicas >= replicas_want)
+               j->prev_buf_sectors = 0;
+
+       spin_unlock(&j->lock);
+       rcu_read_unlock();
+
+       return replicas >= replicas_want ? 0 : -EROFS;
 }
 
 static void journal_write_compact(struct jset *jset)
@@ -1376,9 +1280,6 @@ void bch2_journal_write(struct closure *cl)
                }
 
 no_io:
-       extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr)
-               ptr->offset += sectors;
-
        bch2_bucket_seq_cleanup(c);
 
        continue_at(cl, journal_write_done, system_highpri_wq);
index 978aba7207903a4c39622c7640b763c764e648b6..9ac65d03935ebc9a341ef68f0144e71401885bce 100644 (file)
@@ -125,7 +125,8 @@ void bch2_journal_reclaim_fast(struct journal *j)
         * Unpin journal entries whose reference counts reached zero, meaning
         * all btree nodes got written out
         */
-       while (!atomic_read(&fifo_peek_front(&j->pin).count)) {
+       while (!fifo_empty(&j->pin) &&
+              !atomic_read(&fifo_peek_front(&j->pin).count)) {
                BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
                BUG_ON(!fifo_pop(&j->pin, temp));
                popped = true;
index 26702482b85a6f88613b3de803cee2ef80339706..a593368921c6f3f9ef69191070c440c94dd8a6aa 100644 (file)
@@ -184,7 +184,6 @@ struct journal {
        struct list_head        seq_blacklist;
        struct journal_seq_blacklist *new_blacklist;
 
-       BKEY_PADDED(key);
        struct write_point      wp;
        spinlock_t              err_lock;
 
index 775d6a667752664e8f1dc79b2ed31604dfea4fef..449cd5bfcfc7317adc615e72a5f4ad385b6cc811 100644 (file)
@@ -278,11 +278,37 @@ int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v)
        case Opt_background_compression:
                ret = bch2_check_set_has_compressed_data(c, v);
                break;
+       case Opt_erasure_code:
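+               /*
+                * Enabling erasure coding requires the EC feature bit to be
+                * set in the superblock:
+                */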
+               if (v &&
+                   !(c->sb.features & (1ULL << BCH_FEATURE_EC))) {
+                       mutex_lock(&c->sb_lock);
+                       c->disk_sb.sb->features[0] |=
+                               cpu_to_le64(1ULL << BCH_FEATURE_EC);
+
+                       bch2_write_super(c);
+                       mutex_unlock(&c->sb_lock);
+               }
+               break;
        }
 
        return ret;
 }
 
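+/* Run the per-option check hook for every option with its current value: */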
+int bch2_opts_check_may_set(struct bch_fs *c)
+{
+       unsigned i;
+       int ret;
+
+       for (i = 0; i < bch2_opts_nr; i++) {
+               ret = bch2_opt_check_may_set(c, i,
+                               bch2_opt_get_by_id(&c->opts, i));
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
 int bch2_parse_mount_opts(struct bch_opts *opts, char *options)
 {
        char *opt, *name, *val;
index bdf1e4fb606e713f1b19f585a47169bbf9061e1e..8ffae3d9e6b69e7df0a1716c88a916db16aeef49 100644 (file)
@@ -110,6 +110,9 @@ enum opt_type {
        BCH_OPT(promote_target,         u16,    OPT_RUNTIME,            \
                OPT_FN(bch2_opt_target),                                \
                BCH_SB_PROMOTE_TARGET,  0)                              \
+       BCH_OPT(erasure_code,           u16,    OPT_RUNTIME,            \
+               OPT_BOOL(),                                             \
+               BCH_SB_ERASURE_CODE,            false)                  \
        BCH_OPT(inodes_32bit,           u8,     OPT_RUNTIME,            \
                OPT_BOOL(),                                             \
                BCH_SB_INODE_32BIT,             false)                  \
@@ -266,6 +269,7 @@ void bch2_opt_to_text(struct printbuf *, struct bch_fs *,
                      const struct bch_option *, u64, unsigned);
 
 int bch2_opt_check_may_set(struct bch_fs *, int, u64);
+int bch2_opts_check_may_set(struct bch_fs *);
 int bch2_parse_mount_opts(struct bch_opts *, char *);
 
 /* inode opts: */
@@ -277,7 +281,8 @@ int bch2_parse_mount_opts(struct bch_opts *, char *);
        BCH_INODE_OPT(data_replicas,                    8)      \
        BCH_INODE_OPT(promote_target,                   16)     \
        BCH_INODE_OPT(foreground_target,                16)     \
-       BCH_INODE_OPT(background_target,                16)
+       BCH_INODE_OPT(background_target,                16)     \
+       BCH_INODE_OPT(erasure_code,                     16)
 
 struct bch_io_opts {
 #define BCH_INODE_OPT(_name, _bits)    unsigned _name##_defined:1;
index c5d9dc4ee85ee3022d79a16000627d74d27fde7e..0e3c321a148594f694c94a1cc5e5032724950ca3 100644 (file)
@@ -6,6 +6,7 @@
 #include "btree_update_interior.h"
 #include "btree_io.h"
 #include "dirent.h"
+#include "ec.h"
 #include "error.h"
 #include "fsck.h"
 #include "journal_io.h"
@@ -212,6 +213,11 @@ int bch2_fs_recovery(struct bch_fs *c)
 
        set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
 
+       err = "cannot allocate memory";
+       ret = bch2_fs_ec_start(c);
+       if (ret)
+               goto err;
+
        bch_verbose(c, "starting mark and sweep:");
        err = "error in recovery";
        ret = bch2_initial_gc(c, &journal);
index a7a4e280840b90c2ce755f34758b615b0c64cfe0..0ba5ce5cb2ec17a14aeaf4a782bdb6b68d7e538a 100644 (file)
@@ -79,9 +79,33 @@ static void extent_to_replicas(struct bkey_s_c k,
 
                r->nr_required  = 1;
 
-               extent_for_each_ptr_decode(e, p, entry)
-                       if (!p.ptr.cached)
-                               r->devs[r->nr_devs++] = p.ptr.dev;
+               extent_for_each_ptr_decode(e, p, entry) {
+                       if (p.ptr.cached)
+                               continue;
+
+                       if (p.ec_nr) {
+                               r->nr_devs = 0;
+                               break;
+                       }
+
+                       r->devs[r->nr_devs++] = p.ptr.dev;
+               }
+       }
+}
+
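+/*
+ * A stripe's replicas entry lists every device the stripe has a block on;
+ * only the data blocks (nr_blocks - nr_redundant) are required:
+ */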
+static void stripe_to_replicas(struct bkey_s_c k,
+                              struct bch_replicas_entry *r)
+{
+       if (k.k->type == BCH_STRIPE) {
+               struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+               const struct bch_extent_ptr *ptr;
+
+               r->nr_required  = s.v->nr_blocks - s.v->nr_redundant;
+
+               for (ptr = s.v->ptrs;
+                    ptr < s.v->ptrs + s.v->nr_blocks;
+                    ptr++)
+                       r->devs[r->nr_devs++] = ptr->dev;
        }
 }
 
@@ -100,6 +124,10 @@ static void bkey_to_replicas(enum bkey_type type,
                e->data_type = BCH_DATA_USER;
                extent_to_replicas(k, e);
                break;
+       case BKEY_TYPE_EC:
+               e->data_type = BCH_DATA_USER;
+               stripe_to_replicas(k, e);
+               break;
        default:
                break;
        }
index 83523572881a9890ea1bc7a8ac8158e64ca66552..71920079ac3c4e1f882d31ede88a48579f492b3c 100644 (file)
@@ -2,6 +2,7 @@
 #include "bcachefs.h"
 #include "checksum.h"
 #include "disk_groups.h"
+#include "ec.h"
 #include "error.h"
 #include "io.h"
 #include "journal.h"
index 5b27ead03cd424302eb571344ec0f09f84f79774..0eb6b7e7531b5a4196c68dbf23f11450e5ee7575 100644 (file)
@@ -19,6 +19,7 @@
 #include "compress.h"
 #include "debug.h"
 #include "disk_groups.h"
+#include "ec.h"
 #include "error.h"
 #include "fs.h"
 #include "fs-io.h"
@@ -395,6 +396,7 @@ static void bch2_fs_free(struct bch_fs *c)
 
        bch2_fs_quota_exit(c);
        bch2_fs_fsio_exit(c);
+       bch2_fs_ec_exit(c);
        bch2_fs_encryption_exit(c);
        bch2_fs_io_exit(c);
        bch2_fs_btree_cache_exit(c);
@@ -403,7 +405,7 @@ static void bch2_fs_free(struct bch_fs *c)
        bch2_io_clock_exit(&c->io_clock[READ]);
        bch2_fs_compress_exit(c);
        percpu_free_rwsem(&c->usage_lock);
-       free_percpu(c->usage_percpu);
+       free_percpu(c->usage[0]);
        mempool_exit(&c->btree_iters_pool);
        mempool_exit(&c->btree_bounce_pool);
        bioset_exit(&c->btree_bio);
@@ -576,6 +578,11 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
        INIT_LIST_HEAD(&c->fsck_errors);
        mutex_init(&c->fsck_error_lock);
 
+       INIT_LIST_HEAD(&c->ec_new_stripe_list);
+       mutex_init(&c->ec_new_stripe_lock);
+       mutex_init(&c->ec_stripes_lock);
+       spin_lock_init(&c->ec_stripes_heap_lock);
+
        seqcount_init(&c->gc_pos_lock);
 
        c->copy_gc_enabled              = 1;
@@ -631,7 +638,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                        max(offsetof(struct btree_read_bio, bio),
                            offsetof(struct btree_write_bio, wbio.bio)),
                        BIOSET_NEED_BVECS) ||
-           !(c->usage_percpu = alloc_percpu(struct bch_fs_usage)) ||
+           !(c->usage[0] = alloc_percpu(struct bch_fs_usage)) ||
            percpu_init_rwsem(&c->usage_lock) ||
            mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
                                        btree_bytes(c)) ||
@@ -644,6 +651,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
            bch2_fs_io_init(c) ||
            bch2_fs_encryption_init(c) ||
            bch2_fs_compress_init(c) ||
+           bch2_fs_ec_init(c) ||
            bch2_fs_fsio_init(c))
                goto err;
 
@@ -715,6 +723,10 @@ const char *bch2_fs_start(struct bch_fs *c)
        if (ret)
                goto err;
 
+       ret = bch2_opts_check_may_set(c);
+       if (ret)
+               goto err;
+
        err = "dynamic fault";
        if (bch2_fs_init_fault("fs_start"))
                goto err;
@@ -1054,8 +1066,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
                return ret;
 
        mutex_lock(&c->sb_lock);
-       bch2_mark_dev_superblock(ca->fs, ca,
-                       BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+       bch2_mark_dev_superblock(ca->fs, ca, 0);
        mutex_unlock(&c->sb_lock);
 
        bch2_dev_sysfs_online(c, ca);
@@ -1340,7 +1351,7 @@ static void dev_usage_clear(struct bch_dev *ca)
 
        for_each_possible_cpu(cpu) {
                struct bch_dev_usage *p =
-                       per_cpu_ptr(ca->usage_percpu, cpu);
+                       per_cpu_ptr(ca->usage[0], cpu);
                memset(p, 0, sizeof(*p));
        }
 
@@ -1401,8 +1412,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
         * allocate the journal, reset all the marks, then remark after we
         * attach...
         */
-       bch2_mark_dev_superblock(ca->fs, ca,
-                       BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+       bch2_mark_dev_superblock(ca->fs, ca, 0);
 
        err = "journal alloc failed";
        ret = bch2_dev_journal_alloc(ca);
@@ -1461,8 +1471,7 @@ have_slot:
        ca->disk_sb.sb->dev_idx = dev_idx;
        bch2_dev_attach(c, ca, dev_idx);
 
-       bch2_mark_dev_superblock(c, ca,
-                       BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE);
+       bch2_mark_dev_superblock(c, ca, 0);
 
        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
index f793cfbad605b65e95ac2ddd9b85c65f8ab84cdd..0c3bdcd1f86d9098d58af370d3ef9585fcd97882 100644 (file)
@@ -18,6 +18,7 @@
 #include "btree_gc.h"
 #include "buckets.h"
 #include "disk_groups.h"
+#include "ec.h"
 #include "inode.h"
 #include "journal.h"
 #include "keylist.h"
@@ -187,6 +188,8 @@ sysfs_pd_controller_attribute(rebalance);
 read_attribute(rebalance_work);
 rw_attribute(promote_whole_extents);
 
+read_attribute(new_stripes);
+
 rw_attribute(pd_controllers_update_seconds);
 
 read_attribute(meta_replicas_have);
@@ -241,6 +244,8 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
                        pr_buf(&out, "\t%s:\t\t%llu\n",
                               bch2_data_types[type],
                               stats.replicas[replicas].data[type]);
+               pr_buf(&out, "\terasure coded:\t%llu\n",
+                      stats.replicas[replicas].ec_data);
                pr_buf(&out, "\treserved:\t%llu\n",
                       stats.replicas[replicas].persistent_reserved);
        }
@@ -309,6 +314,41 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
                        compressed_sectors_uncompressed << 9);
 }
 
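+/* Show pending and in-flight erasure coded stripes: */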
+static ssize_t bch2_new_stripes(struct bch_fs *c, char *buf)
+{
+       char *out = buf, *end = buf + PAGE_SIZE;
+       struct ec_stripe_head *h;
+       struct ec_stripe_new *s;
+
+       mutex_lock(&c->ec_new_stripe_lock);
+       list_for_each_entry(h, &c->ec_new_stripe_list, list) {
+               out += scnprintf(out, end - out,
+                                "target %u algo %u redundancy %u:\n",
+                                h->target, h->algo, h->redundancy);
+
+               if (h->s)
+                       out += scnprintf(out, end - out,
+                                        "\tpending: blocks %u allocated %u\n",
+                                        h->s->blocks.nr,
+                                        bitmap_weight(h->s->blocks_allocated,
+                                                      h->s->blocks.nr));
+
+               mutex_lock(&h->lock);
+               list_for_each_entry(s, &h->stripes, list)
+                       out += scnprintf(out, end - out,
+                                        "\tin flight: blocks %u allocated %u pin %u\n",
+                                        s->blocks.nr,
+                                        bitmap_weight(s->blocks_allocated,
+                                                      s->blocks.nr),
+                                        atomic_read(&s->pin));
+               mutex_unlock(&h->lock);
+       }
+       mutex_unlock(&c->ec_new_stripe_lock);
+
+       return out - buf;
+}
+
 SHOW(bch2_fs)
 {
        struct bch_fs *c = container_of(kobj, struct bch_fs, kobj);
@@ -368,6 +408,9 @@ SHOW(bch2_fs)
        if (attr == &sysfs_compression_stats)
                return bch2_compression_stats(c, buf);
 
+       if (attr == &sysfs_new_stripes)
+               return bch2_new_stripes(c, buf);
+
 #define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name);
        BCH_DEBUG_PARAMS()
 #undef BCH_DEBUG_PARAM
@@ -434,7 +477,7 @@ STORE(__bch2_fs)
                bch2_coalesce(c);
 
        if (attr == &sysfs_trigger_gc)
-               bch2_gc(c);
+               bch2_gc(c, NULL, false);
 
        if (attr == &sysfs_prune_cache) {
                struct shrink_control sc;
@@ -536,6 +579,8 @@ struct attribute *bch2_fs_internal_files[] = {
        &sysfs_rebalance_work,
        sysfs_pd_controller_files(rebalance),
 
+       &sysfs_new_stripes,
+
        &sysfs_internal_uuid,
 
 #define BCH_DEBUG_PARAM(name, description) &sysfs_##name,
@@ -764,6 +809,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
                "    meta:               %llu\n"
                "    user:               %llu\n"
                "    cached:             %llu\n"
+               "    erasure coded:      %llu\n"
                "    available:          %lli\n"
                "sectors:\n"
                "    sb:                 %llu\n"
@@ -787,6 +833,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
                stats.buckets[BCH_DATA_BTREE],
                stats.buckets[BCH_DATA_USER],
                stats.buckets[BCH_DATA_CACHED],
+               stats.buckets_ec,
                ca->mi.nbuckets - ca->mi.first_bucket - stats.buckets_unavailable,
                stats.sectors[BCH_DATA_SB],
                stats.sectors[BCH_DATA_JOURNAL],