git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to 26409a8f75 bcachefs: Journal updates to dev usage
author Kent Overstreet <kent.overstreet@gmail.com>
Tue, 2 Feb 2021 19:26:28 +0000 (14:26 -0500)
committer Kent Overstreet <kent.overstreet@gmail.com>
Tue, 2 Feb 2021 21:07:59 +0000 (16:07 -0500)
31 files changed:
.bcachefs_revision
cmd_debug.c
libbcachefs.c
libbcachefs/alloc_background.c
libbcachefs/alloc_background.h
libbcachefs/alloc_types.h
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bkey.h
libbcachefs/btree_gc.c
libbcachefs/btree_update_interior.c
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/buckets_types.h
libbcachefs/clock.c
libbcachefs/clock_types.h
libbcachefs/ec.c
libbcachefs/extents.c
libbcachefs/journal.c
libbcachefs/journal_io.c
libbcachefs/movinggc.c
libbcachefs/opts.h
libbcachefs/rebalance.c
libbcachefs/rebalance_types.h
libbcachefs/recovery.c
libbcachefs/replicas.c
libbcachefs/replicas.h
libbcachefs/super-io.c
libbcachefs/super-io.h
libbcachefs/super.c
libbcachefs/sysfs.c

index 953107c28e819391957597e81f61db8a980830c6..7b4e00b1439eb75c18927548a133b6316a5b49e0 100644 (file)
@@ -1 +1 @@
-ea3414eed52e5d90c248453e84b2dcd91c960306
+26409a8f755b8faa620a49796d7935566204daaf
index befd41f4e2cbf88af5079a999f4f9491765bc1e3..c4dd24ba1de8ec74c66530f5c5bf09bcad44b453 100644 (file)
@@ -572,14 +572,10 @@ int cmd_list_journal(int argc, char *argv[])
                printf("journal entry   %8llu\n"
                       "    version     %8u\n"
                       "    last seq    %8llu\n"
-                      "    read clock  %8u\n"
-                      "    write clock %8u\n"
                       ,
                       le64_to_cpu(p->j.seq),
                       le32_to_cpu(p->j.version),
-                      le64_to_cpu(p->j.last_seq),
-                      le16_to_cpu(p->j.read_clock),
-                      le16_to_cpu(p->j.write_clock));
+                      le64_to_cpu(p->j.last_seq));
 
                for_each_jset_key(k, _n, entry, &p->j) {
                        char buf[200];
index e7c1ca23224ca721217e4ae14a7ec456c4df9ac3..e359d48b758d3a105848a00d9265d1ca849aa265 100644 (file)
@@ -623,8 +623,6 @@ static void bch2_sb_print_clean(struct bch_sb *sb, struct bch_sb_field *f,
 
 
        printf("  flags:       %x", le32_to_cpu(clean->flags));
-       printf("  read clock:  %x", le16_to_cpu(clean->read_clock));
-       printf("  write clock: %x", le16_to_cpu(clean->write_clock));
        printf("  journal seq: %llx", le64_to_cpu(clean->journal_seq));
 }
 
index 896ec02328269020f4daa15d7951ff6d80fa5c0d..a91caf04fc9a732182ca512135c794e234548ef1 100644 (file)
@@ -14,6 +14,7 @@
 #include "ec.h"
 #include "error.h"
 #include "recovery.h"
+#include "varint.h"
 
 #include <linux/kthread.h>
 #include <linux/math64.h>
 #include <linux/sort.h>
 #include <trace/events/bcachefs.h>
 
-static const char * const bch2_alloc_field_names[] = {
-#define x(name, bytes) #name,
-       BCH_ALLOC_FIELDS()
+static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
+#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
+       BCH_ALLOC_FIELDS_V1()
 #undef x
-       NULL
 };
 
-static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
-
 /* Ratelimiting/PD controllers */
 
 static void pd_controllers_update(struct work_struct *work)
@@ -67,10 +65,10 @@ static void pd_controllers_update(struct work_struct *work)
 
 /* Persistent alloc info: */
 
-static inline u64 get_alloc_field(const struct bch_alloc *a,
-                                 const void **p, unsigned field)
+static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
+                                    const void **p, unsigned field)
 {
-       unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];
+       unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
        u64 v;
 
        if (!(a->fields & (1 << field)))
@@ -97,10 +95,10 @@ static inline u64 get_alloc_field(const struct bch_alloc *a,
        return v;
 }
 
-static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
-                                  unsigned field, u64 v)
+static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p,
+                                     unsigned field, u64 v)
 {
-       unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];
+       unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
 
        if (!v)
                return;
@@ -127,55 +125,149 @@ static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
        *p += bytes;
 }
 
-struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
+static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
+                                struct bkey_s_c k)
 {
-       struct bkey_alloc_unpacked ret = { .gen = 0 };
+       const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
+       const void *d = in->data;
+       unsigned idx = 0;
 
-       if (k.k->type == KEY_TYPE_alloc) {
-               const struct bch_alloc *a = bkey_s_c_to_alloc(k).v;
-               const void *d = a->data;
-               unsigned idx = 0;
+       out->gen = in->gen;
 
-               ret.gen = a->gen;
+#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
+       BCH_ALLOC_FIELDS_V1()
+#undef  x
+}
 
-#define x(_name, _bits)        ret._name = get_alloc_field(a, &d, idx++);
-               BCH_ALLOC_FIELDS()
+static void bch2_alloc_pack_v1(struct bkey_alloc_buf *dst,
+                              const struct bkey_alloc_unpacked src)
+{
+       struct bkey_i_alloc *a = bkey_alloc_init(&dst->k);
+       void *d = a->v.data;
+       unsigned bytes, idx = 0;
+
+       a->k.p          = POS(src.dev, src.bucket);
+       a->v.fields     = 0;
+       a->v.gen        = src.gen;
+
+#define x(_name, _bits)        alloc_field_v1_put(a, &d, idx++, src._name);
+       BCH_ALLOC_FIELDS_V1()
 #undef  x
-       }
-       return ret;
+       bytes = (void *) d - (void *) &a->v;
+       set_bkey_val_bytes(&a->k, bytes);
+       memset_u64s_tail(&a->v, 0, bytes);
 }
 
-void bch2_alloc_pack(struct bkey_i_alloc *dst,
-                    const struct bkey_alloc_unpacked src)
+static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
+                               struct bkey_s_c k)
 {
-       unsigned idx = 0;
-       void *d = dst->v.data;
+       struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
+       const u8 *in = a.v->data;
+       const u8 *end = bkey_val_end(a);
+       unsigned fieldnr = 0;
+       int ret;
+       u64 v;
+
+       out->gen        = a.v->gen;
+       out->oldest_gen = a.v->oldest_gen;
+       out->data_type  = a.v->data_type;
+
+#define x(_name, _bits)                                                        \
+       if (fieldnr < a.v->nr_fields) {                                 \
+               ret = bch2_varint_decode(in, end, &v);                  \
+               if (ret < 0)                                            \
+                       return ret;                                     \
+               in += ret;                                              \
+       } else {                                                        \
+               v = 0;                                                  \
+       }                                                               \
+       out->_name = v;                                                 \
+       if (v != out->_name)                                            \
+               return -1;                                              \
+       fieldnr++;
+
+       BCH_ALLOC_FIELDS_V2()
+#undef  x
+       return 0;
+}
+
+static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst,
+                              const struct bkey_alloc_unpacked src)
+{
+       struct bkey_i_alloc_v2 *a = bkey_alloc_v2_init(&dst->k);
+       unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
+       u8 *out = a->v.data;
+       u8 *end = (void *) &dst[1];
+       u8 *last_nonzero_field = out;
        unsigned bytes;
 
-       dst->v.fields   = 0;
-       dst->v.gen      = src.gen;
+       a->k.p          = POS(src.dev, src.bucket);
+       a->v.gen        = src.gen;
+       a->v.oldest_gen = src.oldest_gen;
+       a->v.data_type  = src.data_type;
+
+#define x(_name, _bits)                                                        \
+       nr_fields++;                                                    \
+                                                                       \
+       if (src._name) {                                                \
+               out += bch2_varint_encode(out, src._name);              \
+                                                                       \
+               last_nonzero_field = out;                               \
+               last_nonzero_fieldnr = nr_fields;                       \
+       } else {                                                        \
+               *out++ = 0;                                             \
+       }
 
-#define x(_name, _bits)        put_alloc_field(dst, &d, idx++, src._name);
-       BCH_ALLOC_FIELDS()
+       BCH_ALLOC_FIELDS_V2()
 #undef  x
+       BUG_ON(out > end);
+
+       out = last_nonzero_field;
+       a->v.nr_fields = last_nonzero_fieldnr;
 
-       bytes = (void *) d - (void *) &dst->v;
-       set_bkey_val_bytes(&dst->k, bytes);
-       memset_u64s_tail(&dst->v, 0, bytes);
+       bytes = (u8 *) out - (u8 *) &a->v;
+       set_bkey_val_bytes(&a->k, bytes);
+       memset_u64s_tail(&a->v, 0, bytes);
+}
+
+struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
+{
+       struct bkey_alloc_unpacked ret = {
+               .dev    = k.k->p.inode,
+               .bucket = k.k->p.offset,
+               .gen    = 0,
+       };
+
+       if (k.k->type == KEY_TYPE_alloc_v2)
+               bch2_alloc_unpack_v2(&ret, k);
+       else if (k.k->type == KEY_TYPE_alloc)
+               bch2_alloc_unpack_v1(&ret, k);
+
+       return ret;
+}
+
+void bch2_alloc_pack(struct bch_fs *c,
+                    struct bkey_alloc_buf *dst,
+                    const struct bkey_alloc_unpacked src)
+{
+       if (c->sb.features & (1ULL << BCH_FEATURE_alloc_v2))
+               bch2_alloc_pack_v2(dst, src);
+       else
+               bch2_alloc_pack_v1(dst, src);
 }
 
 static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
 {
        unsigned i, bytes = offsetof(struct bch_alloc, data);
 
-       for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++)
+       for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
                if (a->fields & (1 << i))
-                       bytes += BCH_ALLOC_FIELD_BYTES[i];
+                       bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
 
        return DIV_ROUND_UP(bytes, sizeof(u64));
 }
 
-const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
+const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
        struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
 
@@ -190,20 +282,30 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
        return NULL;
 }
 
-void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
-                       struct bkey_s_c k)
+const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
-       struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
-       const void *d = a.v->data;
-       unsigned i;
+       struct bkey_alloc_unpacked u;
 
-       pr_buf(out, "gen %u", a.v->gen);
+       if (k.k->p.inode >= c->sb.nr_devices ||
+           !c->devs[k.k->p.inode])
+               return "invalid device";
+
+       if (bch2_alloc_unpack_v2(&u, k))
+               return "unpack error";
 
-       for (i = 0; i < BCH_ALLOC_FIELD_NR; i++)
-               if (a.v->fields & (1 << i))
-                       pr_buf(out, " %s %llu",
-                              bch2_alloc_field_names[i],
-                              get_alloc_field(a.v, &d, i));
+       return NULL;
+}
+
+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
+                          struct bkey_s_c k)
+{
+       struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
+
+       pr_buf(out, "gen %u oldest_gen %u data_type %u",
+              u.gen, u.oldest_gen, u.data_type);
+#define x(_name, ...)  pr_buf(out, #_name " %llu ", (u64) u._name);
+       BCH_ALLOC_FIELDS_V2()
+#undef  x
 }
 
 static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
@@ -213,7 +315,9 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
        struct bucket *g;
        struct bkey_alloc_unpacked u;
 
-       if (level || k.k->type != KEY_TYPE_alloc)
+       if (level ||
+           (k.k->type != KEY_TYPE_alloc &&
+            k.k->type != KEY_TYPE_alloc_v2))
                return 0;
 
        ca = bch_dev_bkey_exists(c, k.k->p.inode);
@@ -234,9 +338,7 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
 
 int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
 {
-       struct bch_dev *ca;
-       unsigned i;
-       int ret = 0;
+       int ret;
 
        down_read(&c->gc_lock);
        ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC,
@@ -248,26 +350,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
                return ret;
        }
 
-       percpu_down_write(&c->mark_lock);
-       bch2_dev_usage_from_buckets(c);
-       percpu_up_write(&c->mark_lock);
-
-       mutex_lock(&c->bucket_clock[READ].lock);
-       for_each_member_device(ca, c, i) {
-               down_read(&ca->bucket_lock);
-               bch2_recalc_oldest_io(c, ca, READ);
-               up_read(&ca->bucket_lock);
-       }
-       mutex_unlock(&c->bucket_clock[READ].lock);
-
-       mutex_lock(&c->bucket_clock[WRITE].lock);
-       for_each_member_device(ca, c, i) {
-               down_read(&ca->bucket_lock);
-               bch2_recalc_oldest_io(c, ca, WRITE);
-               up_read(&ca->bucket_lock);
-       }
-       mutex_unlock(&c->bucket_clock[WRITE].lock);
-
        return 0;
 }
 
@@ -281,8 +363,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
        struct bucket *g;
        struct bucket_mark m;
        struct bkey_alloc_unpacked old_u, new_u;
-       __BKEY_PADDED(k, 8) alloc_key; /* hack: */
-       struct bkey_i_alloc *a;
+       struct bkey_alloc_buf a;
        int ret;
 retry:
        bch2_trans_begin(trans);
@@ -303,17 +384,14 @@ retry:
        ca      = bch_dev_bkey_exists(c, iter->pos.inode);
        g       = bucket(ca, iter->pos.offset);
        m       = READ_ONCE(g->mark);
-       new_u   = alloc_mem_to_key(g, m);
+       new_u   = alloc_mem_to_key(iter, g, m);
        percpu_up_read(&c->mark_lock);
 
        if (!bkey_alloc_unpacked_cmp(old_u, new_u))
                return 0;
 
-       a = bkey_alloc_init(&alloc_key.k);
-       a->k.p = iter->pos;
-       bch2_alloc_pack(a, new_u);
-
-       bch2_trans_update(trans, iter, &a->k_i,
+       bch2_alloc_pack(c, &a, new_u);
+       bch2_trans_update(trans, iter, &a.k,
                          BTREE_TRIGGER_NORUN);
        ret = bch2_trans_commit(trans, NULL, NULL,
                                BTREE_INSERT_NOFAIL|flags);
@@ -358,114 +436,6 @@ err:
 
 /* Bucket IO clocks: */
 
-static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
-{
-       struct bucket_clock *clock = &c->bucket_clock[rw];
-       struct bucket_array *buckets = bucket_array(ca);
-       struct bucket *g;
-       u16 max_last_io = 0;
-       unsigned i;
-
-       lockdep_assert_held(&c->bucket_clock[rw].lock);
-
-       /* Recalculate max_last_io for this device: */
-       for_each_bucket(g, buckets)
-               max_last_io = max(max_last_io, bucket_last_io(c, g, rw));
-
-       ca->max_last_bucket_io[rw] = max_last_io;
-
-       /* Recalculate global max_last_io: */
-       max_last_io = 0;
-
-       for_each_member_device(ca, c, i)
-               max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]);
-
-       clock->max_last_io = max_last_io;
-}
-
-static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
-{
-       struct bucket_clock *clock = &c->bucket_clock[rw];
-       struct bucket_array *buckets;
-       struct bch_dev *ca;
-       struct bucket *g;
-       unsigned i;
-
-       trace_rescale_prios(c);
-
-       for_each_member_device(ca, c, i) {
-               down_read(&ca->bucket_lock);
-               buckets = bucket_array(ca);
-
-               for_each_bucket(g, buckets)
-                       g->io_time[rw] = clock->hand -
-                       bucket_last_io(c, g, rw) / 2;
-
-               bch2_recalc_oldest_io(c, ca, rw);
-
-               up_read(&ca->bucket_lock);
-       }
-}
-
-static inline u64 bucket_clock_freq(u64 capacity)
-{
-       return max(capacity >> 10, 2028ULL);
-}
-
-static void bch2_inc_clock_hand(struct io_timer *timer)
-{
-       struct bucket_clock *clock = container_of(timer,
-                                               struct bucket_clock, rescale);
-       struct bch_fs *c = container_of(clock,
-                                       struct bch_fs, bucket_clock[clock->rw]);
-       struct bch_dev *ca;
-       u64 capacity;
-       unsigned i;
-
-       mutex_lock(&clock->lock);
-
-       /* if clock cannot be advanced more, rescale prio */
-       if (clock->max_last_io >= U16_MAX - 2)
-               bch2_rescale_bucket_io_times(c, clock->rw);
-
-       BUG_ON(clock->max_last_io >= U16_MAX - 2);
-
-       for_each_member_device(ca, c, i)
-               ca->max_last_bucket_io[clock->rw]++;
-       clock->max_last_io++;
-       clock->hand++;
-
-       mutex_unlock(&clock->lock);
-
-       capacity = READ_ONCE(c->capacity);
-
-       if (!capacity)
-               return;
-
-       /*
-        * we only increment when 0.1% of the filesystem capacity has been read
-        * or written too, this determines if it's time
-        *
-        * XXX: we shouldn't really be going off of the capacity of devices in
-        * RW mode (that will be 0 when we're RO, yet we can still service
-        * reads)
-        */
-       timer->expire += bucket_clock_freq(capacity);
-
-       bch2_io_timer_add(&c->io_clock[clock->rw], timer);
-}
-
-static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
-{
-       struct bucket_clock *clock = &c->bucket_clock[rw];
-
-       clock->hand             = 1;
-       clock->rw               = rw;
-       clock->rescale.fn       = bch2_inc_clock_hand;
-       clock->rescale.expire   = bucket_clock_freq(c->capacity);
-       mutex_init(&clock->lock);
-}
-
 int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
                              size_t bucket_nr, int rw)
 {
@@ -473,9 +443,9 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
        struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
        struct btree_iter *iter;
        struct bucket *g;
-       struct bkey_i_alloc *a;
+       struct bkey_alloc_buf *a;
        struct bkey_alloc_unpacked u;
-       u16 *time;
+       u64 *time, now;
        int ret = 0;
 
        iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr),
@@ -486,28 +456,25 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
        if (ret)
                goto out;
 
-       a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
+       a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
        ret = PTR_ERR_OR_ZERO(a);
        if (ret)
                goto out;
 
        percpu_down_read(&c->mark_lock);
        g = bucket(ca, bucket_nr);
-       u = alloc_mem_to_key(g, READ_ONCE(g->mark));
+       u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
        percpu_up_read(&c->mark_lock);
 
-       bkey_alloc_init(&a->k_i);
-       a->k.p = iter->pos;
-
        time = rw == READ ? &u.read_time : &u.write_time;
-       if (*time == c->bucket_clock[rw].hand)
+       now = atomic64_read(&c->io_clock[rw].now);
+       if (*time == now)
                goto out;
 
-       *time = c->bucket_clock[rw].hand;
-
-       bch2_alloc_pack(a, u);
+       *time = now;
 
-       ret   = bch2_trans_update(trans, iter, &a->k_i, 0) ?:
+       bch2_alloc_pack(c, a, u);
+       ret   = bch2_trans_update(trans, iter, &a->k, 0) ?:
                bch2_trans_commit(trans, NULL, NULL, 0);
 out:
        bch2_trans_iter_put(trans, iter);
@@ -576,23 +543,22 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
        return ret;
 }
 
-static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
-                                      size_t bucket,
-                                      struct bucket_mark mark)
+static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
+                                      struct bucket_mark m)
 {
        u8 gc_gen;
 
-       if (!is_available_bucket(mark))
+       if (!is_available_bucket(m))
                return false;
 
-       if (mark.owned_by_allocator)
+       if (m.owned_by_allocator)
                return false;
 
        if (ca->buckets_nouse &&
-           test_bit(bucket, ca->buckets_nouse))
+           test_bit(b, ca->buckets_nouse))
                return false;
 
-       gc_gen = bucket_gc_gen(ca, bucket);
+       gc_gen = bucket_gc_gen(bucket(ca, b));
 
        if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
                ca->inc_gen_needs_gc++;
@@ -606,43 +572,33 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
 /*
  * Determines what order we're going to reuse buckets, smallest bucket_key()
  * first.
- *
- *
- * - We take into account the read prio of the bucket, which gives us an
- *   indication of how hot the data is -- we scale the prio so that the prio
- *   farthest from the clock is worth 1/8th of the closest.
- *
- * - The number of sectors of cached data in the bucket, which gives us an
- *   indication of the cost in cache misses this eviction will cause.
- *
- * - If hotness * sectors used compares equal, we pick the bucket with the
- *   smallest bucket_gc_gen() - since incrementing the same bucket's generation
- *   number repeatedly forces us to run mark and sweep gc to avoid generation
- *   number wraparound.
  */
 
-static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
-                                    size_t b, struct bucket_mark m)
+static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
+                               u64 now, u64 last_seq_ondisk)
 {
-       unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
-       unsigned max_last_io = ca->max_last_bucket_io[READ];
+       unsigned used = bucket_sectors_used(m);
 
-       /*
-        * Time since last read, scaled to [0, 8) where larger value indicates
-        * more recently read data:
-        */
-       unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io;
-
-       /* How much we want to keep the data in this bucket: */
-       unsigned long data_wantness =
-               (hotness + 1) * bucket_sectors_used(m);
-
-       unsigned long needs_journal_commit =
-               bucket_needs_journal_commit(m, c->journal.last_seq_ondisk);
+       if (used) {
+               /*
+                * Prefer to keep buckets that have been read more recently, and
+                * buckets that have more data in them:
+                */
+               u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
+               u32 last_read_scaled = min_t(u64, U32_MAX, div_u64(last_read, used));
 
-       return  (data_wantness << 9) |
-               (needs_journal_commit << 8) |
-               (bucket_gc_gen(ca, b) / 16);
+               return -last_read_scaled;
+       } else {
+               /*
+                * Prefer to use buckets with smaller gc_gen so that we don't
+                * have to walk the btree and recalculate oldest_gen - but shift
+                * off the low bits so that buckets will still have equal sort
+                * keys when there's only a small difference, so that we can
+                * keep sequential buckets together:
+                */
+               return  (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)|
+                       (bucket_gc_gen(g) >> 4);
+       }
 }
 
 static inline int bucket_alloc_cmp(alloc_heap *h,
@@ -665,16 +621,15 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
 {
        struct bucket_array *buckets;
        struct alloc_heap_entry e = { 0 };
+       u64 now, last_seq_ondisk;
        size_t b, i, nr = 0;
 
-       ca->alloc_heap.used = 0;
-
-       mutex_lock(&c->bucket_clock[READ].lock);
        down_read(&ca->bucket_lock);
 
        buckets = bucket_array(ca);
-
-       bch2_recalc_oldest_io(c, ca, READ);
+       ca->alloc_heap.used = 0;
+       now = atomic64_read(&c->io_clock[READ].now);
+       last_seq_ondisk = c->journal.last_seq_ondisk;
 
        /*
         * Find buckets with lowest read priority, by building a maxheap sorted
@@ -682,8 +637,9 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
         * all buckets have been visited.
         */
        for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
-               struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
-               unsigned long key = bucket_sort_key(c, ca, b, m);
+               struct bucket *g = &buckets->b[b];
+               struct bucket_mark m = READ_ONCE(g->mark);
+               unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
 
                if (!bch2_can_invalidate_bucket(ca, b, m))
                        continue;
@@ -718,7 +674,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
        }
 
        up_read(&ca->bucket_lock);
-       mutex_unlock(&c->bucket_clock[READ].lock);
 }
 
 static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
@@ -863,14 +818,8 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
                                       struct btree_iter *iter,
                                       u64 *journal_seq, unsigned flags)
 {
-#if 0
-       __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key;
-#else
-       /* hack: */
-       __BKEY_PADDED(k, 8) alloc_key;
-#endif
        struct bch_fs *c = trans->c;
-       struct bkey_i_alloc *a;
+       struct bkey_alloc_buf a;
        struct bkey_alloc_unpacked u;
        struct bucket *g;
        struct bucket_mark m;
@@ -920,8 +869,6 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
                goto out;
        }
 
-       BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
-
        bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
 retry:
        ret = bch2_btree_iter_traverse(iter);
@@ -931,7 +878,7 @@ retry:
        percpu_down_read(&c->mark_lock);
        g = bucket(ca, iter->pos.offset);
        m = READ_ONCE(g->mark);
-       u = alloc_mem_to_key(g, m);
+       u = alloc_mem_to_key(iter, g, m);
 
        percpu_up_read(&c->mark_lock);
 
@@ -941,14 +888,11 @@ retry:
        u.data_type     = 0;
        u.dirty_sectors = 0;
        u.cached_sectors = 0;
-       u.read_time     = c->bucket_clock[READ].hand;
-       u.write_time    = c->bucket_clock[WRITE].hand;
-
-       a = bkey_alloc_init(&alloc_key.k);
-       a->k.p = iter->pos;
-       bch2_alloc_pack(a, u);
+       u.read_time     = atomic64_read(&c->io_clock[READ].now);
+       u.write_time    = atomic64_read(&c->io_clock[WRITE].now);
 
-       bch2_trans_update(trans, iter, &a->k_i,
+       bch2_alloc_pack(c, &a, u);
+       bch2_trans_update(trans, iter, &a.k,
                          BTREE_TRIGGER_BUCKET_INVALIDATE);
 
        /*
@@ -1455,8 +1399,6 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
 void bch2_fs_allocator_background_init(struct bch_fs *c)
 {
        spin_lock_init(&c->freelist_lock);
-       bch2_bucket_clock_init(c, READ);
-       bch2_bucket_clock_init(c, WRITE);
 
        c->pd_controllers_update_seconds = 5;
        INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);
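
[Editor's sketch] The new v2 pack path above stores only varint-encoded fields up to the last nonzero one, and the unpack path treats missing trailing fields as zero. A minimal standalone sketch of that strategy, using LEB128-style continuation-bit varints for brevity (the real bch2_varint_encode() uses its own prefix encoding, and the field list here is hypothetical):

    #include <stdint.h>
    #include <stddef.h>

    static size_t varint_encode(uint8_t *out, uint64_t v)
    {
            size_t n = 0;

            do {
                    uint8_t b = v & 0x7f;

                    v >>= 7;
                    out[n++] = b | (v ? 0x80 : 0);  /* high bit: more bytes follow */
            } while (v);

            return n;
    }

    static size_t pack_fields(uint8_t *out, const uint64_t *fields, size_t nr,
                              uint8_t *nr_fields)
    {
            uint8_t *p = out, *last_nonzero = out;
            uint8_t last_nonzero_fieldnr = 0;
            size_t i;

            for (i = 0; i < nr; i++) {
                    p += varint_encode(p, fields[i]);

                    if (fields[i]) {
                            last_nonzero = p;
                            last_nonzero_fieldnr = i + 1;
                    }
            }

            /* decoders must treat fields >= nr_fields as zero: */
            *nr_fields = last_nonzero_fieldnr;
            return last_nonzero - out;      /* trailing zeroes are not stored */
    }
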
index f60fcebff2cec9114b3984f96667ac67942335e2..6fededcd9f8686276beadd696b713c734eeffb5b 100644 (file)
@@ -7,12 +7,33 @@
 #include "debug.h"
 
 struct bkey_alloc_unpacked {
+       u64             bucket;
+       u8              dev;
        u8              gen;
+       u8              oldest_gen;
+       u8              data_type;
 #define x(_name, _bits)        u##_bits _name;
-       BCH_ALLOC_FIELDS()
+       BCH_ALLOC_FIELDS_V2()
 #undef  x
 };
 
+struct bkey_alloc_buf {
+       struct bkey_i   k;
+
+       union {
+       struct {
+#define x(_name,  _bits)               + _bits / 8
+       u8              _pad[8 + BCH_ALLOC_FIELDS_V1()];
+#undef  x
+       } _v1;
+       struct {
+#define x(_name,  _bits)               + 8 + _bits / 8
+       u8              _pad[8 + BCH_ALLOC_FIELDS_V2()];
+#undef  x
+       } _v2;
+       };
+} __attribute__((packed, aligned(8)));
+
 /* How out of date a pointer gen is allowed to be: */
 #define BUCKET_GC_GEN_MAX      96U
 
@@ -20,23 +41,28 @@ struct bkey_alloc_unpacked {
 static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l,
                                           struct bkey_alloc_unpacked r)
 {
-       return l.gen != r.gen
-#define x(_name, _bits)        || l._name != r._name
-       BCH_ALLOC_FIELDS()
+       return  l.gen != r.gen                  ||
+               l.oldest_gen != r.oldest_gen    ||
+               l.data_type != r.data_type
+#define x(_name, ...)  || l._name != r._name
+       BCH_ALLOC_FIELDS_V2()
 #undef  x
        ;
 }
 
 struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
-void bch2_alloc_pack(struct bkey_i_alloc *,
+void bch2_alloc_pack(struct bch_fs *, struct bkey_alloc_buf *,
                     const struct bkey_alloc_unpacked);
 
 int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
 
 static inline struct bkey_alloc_unpacked
-alloc_mem_to_key(struct bucket *g, struct bucket_mark m)
+alloc_mem_to_key(struct btree_iter *iter,
+                struct bucket *g, struct bucket_mark m)
 {
        return (struct bkey_alloc_unpacked) {
+               .dev            = iter->pos.inode,
+               .bucket         = iter->pos.offset,
                .gen            = m.gen,
                .oldest_gen     = g->oldest_gen,
                .data_type      = m.data_type,
@@ -49,11 +75,17 @@ alloc_mem_to_key(struct bucket *g, struct bucket_mark m)
 
 #define ALLOC_SCAN_BATCH(ca)           max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
 
-const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
+const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c);
+const char *bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c);
 void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 #define bch2_bkey_ops_alloc (struct bkey_ops) {                \
-       .key_invalid    = bch2_alloc_invalid,           \
+       .key_invalid    = bch2_alloc_v1_invalid,        \
+       .val_to_text    = bch2_alloc_to_text,           \
+}
+
+#define bch2_bkey_ops_alloc_v2 (struct bkey_ops) {     \
+       .key_invalid    = bch2_alloc_v2_invalid,        \
        .val_to_text    = bch2_alloc_to_text,           \
 }
 
index 1abfff5290bc52e5ef263558a3dfca7287751a42..be164d6108bbcdbb3f2ca48bf8e6dc0618d4b6a8 100644 (file)
 
 struct ec_bucket_buf;
 
-/* There's two of these clocks, one for reads and one for writes: */
-struct bucket_clock {
-       /*
-        * "now" in (read/write) IO time - incremented whenever we do X amount
-        * of reads or writes.
-        *
-        * Goes with the bucket read/write prios: when we read or write to a
-        * bucket we reset the bucket's prio to the current hand; thus hand -
-        * prio = time since bucket was last read/written.
-        *
-        * The units are some amount (bytes/sectors) of data read/written, and
-        * the units can change on the fly if we need to rescale to fit
-        * everything in a u16 - your only guarantee is that the units are
-        * consistent.
-        */
-       u16                     hand;
-       u16                     max_last_io;
-
-       int                     rw;
-
-       struct io_timer         rescale;
-       struct mutex            lock;
-};
-
 enum alloc_reserve {
        RESERVE_BTREE_MOVINGGC  = -2,
        RESERVE_BTREE           = -1,
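
[Editor's sketch] The removed bucket_clock needed its rescale machinery because the hands and last-IO times were u16s that could wrap; this commit points all users at the existing 64-bit io_clock instead. A two-line sketch of the replacement pattern, with c and g assumed in scope as in the diff above:

    /* 64-bit clocks don't wrap in practice, so bucket age is a plain
     * subtraction, clamped in case io_time is ahead of the clock: */
    u64 now = atomic64_read(&c->io_clock[READ].now);
    u64 age = max_t(s64, 0, now - g->io_time[READ]);
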
index 91b9375f7341533f81f20fa2435730cba641a36f..fa36e7641e323860896fe73ded85f2147e07b231 100644 (file)
@@ -429,7 +429,9 @@ struct bch_dev {
        unsigned long           *buckets_nouse;
        struct rw_semaphore     bucket_lock;
 
-       struct bch_dev_usage __percpu *usage[2];
+       struct bch_dev_usage            *usage_base;
+       struct bch_dev_usage __percpu   *usage[JOURNAL_BUF_NR];
+       struct bch_dev_usage __percpu   *usage_gc;
 
        /* Allocator: */
        struct task_struct __rcu *alloc_thread;
@@ -451,9 +453,6 @@ struct bch_dev {
 
        size_t                  fifo_last_bucket;
 
-       /* last calculated minimum prio */
-       u16                     max_last_bucket_io[2];
-
        size_t                  inc_gen_needs_gc;
        size_t                  inc_gen_really_needs_gc;
 
@@ -473,6 +472,7 @@ struct bch_dev {
        atomic64_t              rebalance_work;
 
        struct journal_device   journal;
+       u64                     prev_journal_sector;
 
        struct work_struct      io_error_work;
 
@@ -584,6 +584,8 @@ struct bch_fs {
 
        struct journal_entry_res replicas_journal_res;
 
+       struct journal_entry_res dev_usage_journal_res;
+
        struct bch_disk_groups_cpu __rcu *disk_groups;
 
        struct bch_opts         opts;
@@ -691,14 +693,6 @@ struct bch_fs {
        struct mutex            usage_scratch_lock;
        struct bch_fs_usage     *usage_scratch;
 
-       /*
-        * When we invalidate buckets, we use both the priority and the amount
-        * of good data to determine which buckets to reuse first - to weight
-        * those together consistently we keep track of the smallest nonzero
-        * priority of any bucket.
-        */
-       struct bucket_clock     bucket_clock[2];
-
        struct io_clock         io_clock[2];
 
        /* JOURNAL SEQ BLACKLIST */
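
[Editor's sketch] With the split above, ca->usage_base holds counters already committed to the journal, ca->usage[i] holds percpu deltas tied to journal buffer i, and ca->usage_gc is used only while gc runs. A sketch of how a reader combines them, mirroring bch2_dev_usage_read() in the buckets.c section below but omitting the seqcount retry loop the real reader uses:

    struct bch_dev_usage total = *ca->usage_base;   /* committed totals */
    unsigned i;

    for (i = 0; i < ARRAY_SIZE(ca->usage); i++)     /* pending per-buffer deltas */
            acc_u64s_percpu((u64 *) &total,
                            (u64 __percpu *) ca->usage[i],
                            sizeof(total) / sizeof(u64));
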
index 6dc150cbf2af48784915579afa0f9dd13fbf5232..30e77190d97a6a903938efe3f012e2a23e373755 100644 (file)
@@ -341,7 +341,8 @@ static inline void bkey_init(struct bkey *k)
        x(reflink_v,            16)                     \
        x(inline_data,          17)                     \
        x(btree_ptr_v2,         18)                     \
-       x(indirect_inline_data, 19)
+       x(indirect_inline_data, 19)                     \
+       x(alloc_v2,             20)
 
 enum bch_bkey_type {
 #define x(name, nr) KEY_TYPE_##name    = nr,
@@ -551,9 +552,11 @@ struct bch_extent_stripe_ptr {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
        __u64                   type:5,
                                block:8,
-                               idx:51;
+                               redundancy:4,
+                               idx:47;
 #elif defined (__BIG_ENDIAN_BITFIELD)
-       __u64                   idx:51,
+       __u64                   idx:47,
+                               redundancy:4,
                                block:8,
                                type:5;
 #endif
@@ -799,35 +802,40 @@ struct bch_alloc {
        __u8                    data[];
 } __attribute__((packed, aligned(8)));
 
-#define BCH_ALLOC_FIELDS()                     \
+#define BCH_ALLOC_FIELDS_V1()                  \
        x(read_time,            16)             \
        x(write_time,           16)             \
        x(data_type,            8)              \
        x(dirty_sectors,        16)             \
        x(cached_sectors,       16)             \
-       x(oldest_gen,           8)
+       x(oldest_gen,           8)              \
+       x(stripe,               32)             \
+       x(stripe_redundancy,    8)
+
+struct bch_alloc_v2 {
+       struct bch_val          v;
+       __u8                    nr_fields;
+       __u8                    gen;
+       __u8                    oldest_gen;
+       __u8                    data_type;
+       __u8                    data[];
+} __attribute__((packed, aligned(8)));
+
+#define BCH_ALLOC_FIELDS_V2()                  \
+       x(read_time,            64)             \
+       x(write_time,           64)             \
+       x(dirty_sectors,        16)             \
+       x(cached_sectors,       16)             \
+       x(stripe,               32)             \
+       x(stripe_redundancy,    8)
 
 enum {
-#define x(name, bytes) BCH_ALLOC_FIELD_##name,
-       BCH_ALLOC_FIELDS()
+#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
+       BCH_ALLOC_FIELDS_V1()
 #undef x
        BCH_ALLOC_FIELD_NR
 };
 
-static const unsigned BCH_ALLOC_FIELD_BYTES[] = {
-#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8,
-       BCH_ALLOC_FIELDS()
-#undef x
-};
-
-#define x(name, bits) + (bits / 8)
-static const unsigned BKEY_ALLOC_VAL_U64s_MAX =
-       DIV_ROUND_UP(offsetof(struct bch_alloc, data)
-                    BCH_ALLOC_FIELDS(), sizeof(u64));
-#undef x
-
-#define BKEY_ALLOC_U64s_MAX    (BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX)
-
 /* Quotas: */
 
 enum quota_types {
@@ -1131,8 +1139,8 @@ struct bch_sb_field_clean {
        struct bch_sb_field     field;
 
        __le32                  flags;
-       __le16                  read_clock;
-       __le16                  write_clock;
+       __le16                  _read_clock; /* no longer used */
+       __le16                  _write_clock;
        __le64                  journal_seq;
 
        union {
@@ -1305,6 +1313,7 @@ LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE,
 LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES,  struct bch_sb, flags[2],  4, 64);
 
 LE64_BITMASK(BCH_SB_ERASURE_CODE,      struct bch_sb, flags[3],  0, 16);
+LE64_BITMASK(BCH_SB_METADATA_TARGET,   struct bch_sb, flags[3], 16, 28);
 
 /*
  * Features:
@@ -1332,7 +1341,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3],  0, 16);
        x(btree_updates_journalled,     13)     \
        x(reflink_inline_data,          14)     \
        x(new_varint,                   15)     \
-       x(journal_no_flush,             16)
+       x(journal_no_flush,             16)     \
+       x(alloc_v2,                     17)
 
 #define BCH_SB_FEATURES_ALL                            \
        ((1ULL << BCH_FEATURE_new_siphash)|             \
@@ -1340,7 +1350,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3],  0, 16);
         (1ULL << BCH_FEATURE_btree_ptr_v2)|            \
         (1ULL << BCH_FEATURE_extents_above_btree_updates)|\
         (1ULL << BCH_FEATURE_new_varint)|              \
-        (1ULL << BCH_FEATURE_journal_no_flush))
+        (1ULL << BCH_FEATURE_journal_no_flush)|        \
+        (1ULL << BCH_FEATURE_alloc_v2))
 
 enum bch_sb_feature {
 #define x(f, n) BCH_FEATURE_##f,
@@ -1493,7 +1504,9 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
        x(blacklist,            3)              \
        x(blacklist_v2,         4)              \
        x(usage,                5)              \
-       x(data_usage,           6)
+       x(data_usage,           6)              \
+       x(clock,                7)              \
+       x(dev_usage,            8)
 
 enum {
 #define x(f, nr)       BCH_JSET_ENTRY_##f      = nr,
@@ -1541,6 +1554,30 @@ struct jset_entry_data_usage {
        struct bch_replicas_entry r;
 } __attribute__((packed));
 
+struct jset_entry_clock {
+       struct jset_entry       entry;
+       __u8                    rw;
+       __u8                    pad[7];
+       __le64                  time;
+} __attribute__((packed));
+
+struct jset_entry_dev_usage_type {
+       __le64                  buckets;
+       __le64                  sectors;
+       __le64                  fragmented;
+} __attribute__((packed));
+
+struct jset_entry_dev_usage {
+       struct jset_entry       entry;
+       __le32                  dev;
+       __u32                   pad;
+
+       __le64                  buckets_ec;
+       __le64                  buckets_unavailable;
+
+       struct jset_entry_dev_usage_type d[];
+} __attribute__((packed));
+
 /*
  * On disk format for a journal entry:
  * seq is monotonically increasing; every journal entry has its own unique
@@ -1563,8 +1600,8 @@ struct jset {
 
        __u8                    encrypted_start[0];
 
-       __le16                  read_clock;
-       __le16                  write_clock;
+       __le16                  _read_clock; /* no longer used */
+       __le16                  _write_clock;
 
        /* Sequence number of oldest dirty journal entry */
        __le64                  last_seq;
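
[Editor's sketch] jset_entry_dev_usage ends in a flexible array with one (buckets, sectors, fragmented) triple per data type, so the journal reservation added as dev_usage_journal_res in bcachefs.h has to be sized accordingly. A plausible per-entry size helper (hypothetical name; the actual reservation may also scale with the number of devices):

    static inline unsigned jset_entry_dev_usage_bytes(void)
    {
            /* header + one counter triple per data type */
            return sizeof(struct jset_entry_dev_usage) +
                    BCH_DATA_NR * sizeof(struct jset_entry_dev_usage_type);
    }
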
index 2c3b73a6fea35be5f1eb5ac31c1c4cfe824846d8..48821f6c09aa4b6644e9e4ffef77d76e2dfa3f99 100644 (file)
@@ -530,6 +530,7 @@ BKEY_VAL_ACCESSORS(reflink_v);
 BKEY_VAL_ACCESSORS(inline_data);
 BKEY_VAL_ACCESSORS(btree_ptr_v2);
 BKEY_VAL_ACCESSORS(indirect_inline_data);
+BKEY_VAL_ACCESSORS(alloc_v2);
 
 /* byte order helpers */
 
index bab5ebd37f04753f9e842ba8aa1b22d153917126..c2c8a34f735db3bb52588e5479c3eea489295746 100644 (file)
@@ -706,8 +706,8 @@ static void bch2_gc_free(struct bch_fs *c)
                        ca->mi.nbuckets * sizeof(struct bucket));
                ca->buckets[1] = NULL;
 
-               free_percpu(ca->usage[1]);
-               ca->usage[1] = NULL;
+               free_percpu(ca->usage_gc);
+               ca->usage_gc = NULL;
        }
 
        free_percpu(c->usage_gc);
@@ -720,7 +720,7 @@ static int bch2_gc_done(struct bch_fs *c,
        struct bch_dev *ca;
        bool verify = (!initial ||
                       (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)));
-       unsigned i;
+       unsigned i, dev;
        int ret = 0;
 
 #define copy_field(_f, _msg, ...)                                      \
@@ -786,7 +786,10 @@ static int bch2_gc_done(struct bch_fs *c,
                }
        }
 
-       for_each_member_device(ca, c, i) {
+       for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+               bch2_fs_usage_acc_to_base(c, i);
+
+       for_each_member_device(ca, c, dev) {
                struct bucket_array *dst = __bucket_array(ca, 0);
                struct bucket_array *src = __bucket_array(ca, 1);
                size_t b;
@@ -801,12 +804,23 @@ static int bch2_gc_done(struct bch_fs *c,
 
                        dst->b[b].oldest_gen = src->b[b].oldest_gen;
                }
-       };
 
-       for (i = 0; i < ARRAY_SIZE(c->usage); i++)
-               bch2_fs_usage_acc_to_base(c, i);
+               {
+                       struct bch_dev_usage *dst = ca->usage_base;
+                       struct bch_dev_usage *src = (void *)
+                               bch2_acc_percpu_u64s((void *) ca->usage_gc,
+                                                    dev_usage_u64s());
+
+                       copy_dev_field(buckets_ec,              "buckets_ec");
+                       copy_dev_field(buckets_unavailable,     "buckets_unavailable");
 
-       bch2_dev_usage_from_buckets(c);
+                       for (i = 0; i < BCH_DATA_NR; i++) {
+                               copy_dev_field(d[i].buckets,    "%s buckets", bch2_data_types[i]);
+                               copy_dev_field(d[i].sectors,    "%s sectors", bch2_data_types[i]);
+                               copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
+                       }
+               }
+       };
 
        {
                unsigned nr = fs_usage_u64s(c);
@@ -862,7 +876,7 @@ static int bch2_gc_start(struct bch_fs *c)
 
        for_each_member_device(ca, c, i) {
                BUG_ON(ca->buckets[1]);
-               BUG_ON(ca->usage[1]);
+               BUG_ON(ca->usage_gc);
 
                ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
                                ca->mi.nbuckets * sizeof(struct bucket),
@@ -873,9 +887,9 @@ static int bch2_gc_start(struct bch_fs *c)
                        return -ENOMEM;
                }
 
-               ca->usage[1] = alloc_percpu(struct bch_dev_usage);
-               if (!ca->usage[1]) {
-                       bch_err(c, "error allocating ca->usage[gc]");
+               ca->usage_gc = alloc_percpu(struct bch_dev_usage);
+               if (!ca->usage_gc) {
+                       bch_err(c, "error allocating ca->usage_gc");
                        percpu_ref_put(&ca->ref);
                        return -ENOMEM;
                }
@@ -1489,7 +1503,7 @@ static int bch2_gc_thread(void *arg)
 {
        struct bch_fs *c = arg;
        struct io_clock *clock = &c->io_clock[WRITE];
-       unsigned long last = atomic_long_read(&clock->now);
+       unsigned long last = atomic64_read(&clock->now);
        unsigned last_kick = atomic_read(&c->kick_gc);
        int ret;
 
@@ -1510,7 +1524,7 @@ static int bch2_gc_thread(void *arg)
                        if (c->btree_gc_periodic) {
                                unsigned long next = last + c->capacity / 16;
 
-                               if (atomic_long_read(&clock->now) >= next)
+                               if (atomic64_read(&clock->now) >= next)
                                        break;
 
                                bch2_io_clock_schedule_timeout(clock, next);
@@ -1522,7 +1536,7 @@ static int bch2_gc_thread(void *arg)
                }
                __set_current_state(TASK_RUNNING);
 
-               last = atomic_long_read(&clock->now);
+               last = atomic64_read(&clock->now);
                last_kick = atomic_read(&c->kick_gc);
 
                /*
index 8919ea628138b04b8ea93b16fa0d9e0d11e6a890..dd1b8f6ef9b0da30b1ae59ebaadfade9da261348 100644 (file)
@@ -222,7 +222,10 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
        mutex_unlock(&c->btree_reserve_cache_lock);
 
 retry:
-       wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0,
+       wp = bch2_alloc_sectors_start(c,
+                                     c->opts.metadata_target ?:
+                                     c->opts.foreground_target,
+                                     0,
                                      writepoint_ptr(&c->btree_write_point),
                                      &devs_have,
                                      res->nr_replicas,
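
[Editor's note] The ?: above is GNU C's binary conditional, which yields its left operand when that operand is nonzero, so btree nodes are allocated from metadata_target when it is configured and fall back to foreground_target otherwise:

    /* equivalent to m ? m : f, without evaluating m twice: */
    unsigned target = c->opts.metadata_target ?: c->opts.foreground_target;
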
index cb0f0e09a2c124647f05b2cb1d5110c95a344aac..ef79f5cac64d988e6584465754d74d7d034de7f3 100644 (file)
@@ -137,6 +137,7 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c)
 void bch2_fs_usage_initialize(struct bch_fs *c)
 {
        struct bch_fs_usage *usage;
+       struct bch_dev *ca;
        unsigned i;
 
        percpu_down_write(&c->mark_lock);
@@ -155,6 +156,14 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
                fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
        }
 
+       for_each_member_device(ca, c, i) {
+               struct bch_dev_usage dev = bch2_dev_usage_read(ca);
+
+               usage->hidden += (dev.d[BCH_DATA_sb].buckets +
+                                 dev.d[BCH_DATA_journal].buckets) *
+                       ca->mi.bucket_size;
+       }
+
        percpu_up_write(&c->mark_lock);
 }
 
@@ -189,14 +198,27 @@ out_pool:
        return ret;
 }
 
+static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca,
+                                                 unsigned journal_seq,
+                                                 bool gc)
+{
+       return this_cpu_ptr(gc
+                           ? ca->usage_gc
+                           : ca->usage[journal_seq & JOURNAL_BUF_MASK]);
+}
+
 struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
 {
+       struct bch_fs *c = ca->fs;
        struct bch_dev_usage ret;
+       unsigned seq, i, u64s = dev_usage_u64s();
 
-       memset(&ret, 0, sizeof(ret));
-       acc_u64s_percpu((u64 *) &ret,
-                       (u64 __percpu *) ca->usage[0],
-                       sizeof(ret) / sizeof(u64));
+       do {
+               seq = read_seqcount_begin(&c->usage_lock);
+               memcpy(&ret, ca->usage_base, u64s * sizeof(u64));
+               for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+                       acc_u64s_percpu((u64 *) &ret, (u64 __percpu *) ca->usage[i], u64s);
+       } while (read_seqcount_retry(&c->usage_lock, seq));
 
        return ret;
 }
@@ -261,7 +283,8 @@ retry:
 
 void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
 {
-       unsigned u64s = fs_usage_u64s(c);
+       struct bch_dev *ca;
+       unsigned i, u64s = fs_usage_u64s(c);
 
        BUG_ON(idx >= ARRAY_SIZE(c->usage));
 
@@ -272,6 +295,16 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
                        (u64 __percpu *) c->usage[idx], u64s);
        percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
 
+       rcu_read_lock();
+       for_each_member_device_rcu(ca, c, i, NULL) {
+               u64s = dev_usage_u64s();
+
+               acc_u64s_percpu((u64 *) ca->usage_base,
+                               (u64 __percpu *) ca->usage[idx], u64s);
+               percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64));
+       }
+       rcu_read_unlock();
+
        write_seqcount_end(&c->usage_lock);
        preempt_enable();
 }
@@ -454,14 +487,14 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage,
 static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
                                  struct bch_fs_usage *fs_usage,
                                  struct bucket_mark old, struct bucket_mark new,
-                                 bool gc)
+                                 u64 journal_seq, bool gc)
 {
        struct bch_dev_usage *u;
 
        percpu_rwsem_assert_held(&c->mark_lock);
 
        preempt_disable();
-       u = this_cpu_ptr(ca->usage[gc]);
+       u = dev_usage_ptr(ca, journal_seq, gc);
 
        if (bucket_type(old))
                account_bucket(fs_usage, u, bucket_type(old),
@@ -491,31 +524,6 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
                bch2_wake_allocator(ca);
 }
 
-__flatten
-void bch2_dev_usage_from_buckets(struct bch_fs *c)
-{
-       struct bch_dev *ca;
-       struct bucket_mark old = { .v.counter = 0 };
-       struct bucket_array *buckets;
-       struct bucket *g;
-       unsigned i;
-       int cpu;
-
-       c->usage_base->hidden = 0;
-
-       for_each_member_device(ca, c, i) {
-               for_each_possible_cpu(cpu)
-                       memset(per_cpu_ptr(ca->usage[0], cpu), 0,
-                              sizeof(*ca->usage[0]));
-
-               buckets = bucket_array(ca);
-
-               for_each_bucket(g, buckets)
-                       bch2_dev_usage_update(c, ca, c->usage_base,
-                                             old, g->mark, false);
-       }
-}
-
 static inline int update_replicas(struct bch_fs *c,
                                  struct bch_fs_usage *fs_usage,
                                  struct bch_replicas_entry *r,
@@ -653,7 +661,12 @@ static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
                new.owned_by_allocator  = owned_by_allocator;
        }));
 
-       bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+       /*
+        * XXX: this is wrong, this means we'll be doing updates to the percpu
+        * buckets_alloc counter that don't have an open journal buffer and
+        * we'll race with the machinery that accumulates that to ca->usage_base
+        */
+       bch2_dev_usage_update(c, ca, fs_usage, old, new, 0, gc);
 
        BUG_ON(!gc &&
               !owned_by_allocator && !old.owned_by_allocator);
@@ -685,7 +698,8 @@ static int bch2_mark_alloc(struct bch_fs *c,
        struct bucket_mark old_m, m;
 
        /* We don't do anything for deletions - do we?: */
-       if (new.k->type != KEY_TYPE_alloc)
+       if (new.k->type != KEY_TYPE_alloc &&
+           new.k->type != KEY_TYPE_alloc_v2)
                return 0;
 
        /*
@@ -708,6 +722,7 @@ static int bch2_mark_alloc(struct bch_fs *c,
                m.data_type             = u.data_type;
                m.dirty_sectors         = u.dirty_sectors;
                m.cached_sectors        = u.cached_sectors;
+               m.stripe                = u.stripe != 0;
 
                if (journal_seq) {
                        m.journal_seq_valid     = 1;
@@ -715,12 +730,14 @@ static int bch2_mark_alloc(struct bch_fs *c,
                }
        }));
 
-       bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc);
+       bch2_dev_usage_update(c, ca, fs_usage, old_m, m, journal_seq, gc);
 
        g->io_time[READ]        = u.read_time;
        g->io_time[WRITE]       = u.write_time;
        g->oldest_gen           = u.oldest_gen;
        g->gen_valid            = 1;
+       g->stripe               = u.stripe;
+       g->stripe_redundancy    = u.stripe_redundancy;
 
        /*
         * need to know if we're getting called from the invalidate path or
@@ -778,7 +795,7 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
 
        if (c)
                bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc),
-                                     old, new, gc);
+                                     old, new, 0, gc);
 
        return 0;
 }
@@ -915,11 +932,10 @@ static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k,
        return 0;
 }
 
-static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
+static int mark_stripe_bucket(struct bch_fs *c, struct bkey_s_c k,
                             unsigned ptr_idx,
                             struct bch_fs_usage *fs_usage,
-                            u64 journal_seq, unsigned flags,
-                            bool enabled)
+                            u64 journal_seq, unsigned flags)
 {
        const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
        unsigned nr_data = s->nr_blocks - s->nr_redundant;
@@ -932,8 +948,13 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
        char buf[200];
        int ret;
 
-       if (enabled)
-               g->ec_redundancy = s->nr_redundant;
+       if (g->stripe && g->stripe != k.k->p.offset) {
+               bch2_fs_inconsistent(c,
+                             "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
+                             ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
+                             (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
+               return -EINVAL;
+       }
 
        old = bucket_cmpxchg(g, new, ({
                ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type,
@@ -941,23 +962,9 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
                if (ret)
                        return ret;
 
-               if (new.stripe && enabled)
-                       bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
-                                     "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
-                                     ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
-                                     (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
-
-               if (!new.stripe && !enabled)
-                       bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
-                                     "bucket %u:%zu gen %u: deleting stripe but not marked\n%s",
-                                     ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen,
-                                     (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
-
-               new.stripe                      = enabled;
-
-               if ((flags & BTREE_TRIGGER_GC) && parity) {
-                       new.data_type = enabled ? BCH_DATA_parity : 0;
-                       new.dirty_sectors = enabled ? le16_to_cpu(s->sectors): 0;
+               if (parity) {
+                       new.data_type           = BCH_DATA_parity;
+                       new.dirty_sectors       = le16_to_cpu(s->sectors);
                }
 
                if (journal_seq) {
@@ -966,10 +973,10 @@ static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k,
                }
        }));
 
-       if (!enabled)
-               g->ec_redundancy = 0;
+       g->stripe               = k.k->p.offset;
+       g->stripe_redundancy    = s->nr_redundant;
 
-       bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+       bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
        return 0;
 }
 
@@ -1036,7 +1043,7 @@ static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k,
                              old.v.counter,
                              new.v.counter)) != old.v.counter);
 
-       bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
+       bch2_dev_usage_update(c, ca, fs_usage, old, new, journal_seq, gc);
 
        BUG_ON(!gc && bucket_became_unavailable(old, new));
 
@@ -1163,6 +1170,8 @@ static int bch2_mark_stripe(struct bch_fs *c,
        unsigned i;
        int ret;
 
+       BUG_ON(gc && old_s);
+
        if (!m || (old_s && !m->alive)) {
                bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
                                    idx);
@@ -1170,48 +1179,12 @@ static int bch2_mark_stripe(struct bch_fs *c,
        }
 
        if (!new_s) {
-               /* Deleting: */
-               for (i = 0; i < old_s->nr_blocks; i++) {
-                       ret = bucket_set_stripe(c, old, i, fs_usage,
-                                               journal_seq, flags, false);
-                       if (ret)
-                               return ret;
-               }
-
-               if (!gc && m->on_heap) {
-                       spin_lock(&c->ec_stripes_heap_lock);
-                       bch2_stripes_heap_del(c, m, idx);
-                       spin_unlock(&c->ec_stripes_heap_lock);
-               }
-
-               if (gc)
-                       update_replicas(c, fs_usage, &m->r.e,
-                                       -((s64) m->sectors * m->nr_redundant));
+               spin_lock(&c->ec_stripes_heap_lock);
+               bch2_stripes_heap_del(c, m, idx);
+               spin_unlock(&c->ec_stripes_heap_lock);
 
                memset(m, 0, sizeof(*m));
        } else {
-               BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks);
-               BUG_ON(old_s && new_s->nr_redundant != old_s->nr_redundant);
-
-               for (i = 0; i < new_s->nr_blocks; i++) {
-                       if (!old_s ||
-                           memcmp(new_s->ptrs + i,
-                                  old_s->ptrs + i,
-                                  sizeof(struct bch_extent_ptr))) {
-
-                               if (old_s) {
-                                       bucket_set_stripe(c, old, i, fs_usage,
-                                                         journal_seq, flags, false);
-                                       if (ret)
-                                               return ret;
-                               }
-                               ret = bucket_set_stripe(c, new, i, fs_usage,
-                                                       journal_seq, flags, true);
-                               if (ret)
-                                       return ret;
-                       }
-               }
-
                m->alive        = true;
                m->sectors      = le16_to_cpu(new_s->sectors);
                m->algorithm    = new_s->algorithm;
@@ -1220,27 +1193,13 @@ static int bch2_mark_stripe(struct bch_fs *c,
                m->blocks_nonempty = 0;
 
                for (i = 0; i < new_s->nr_blocks; i++) {
-                       unsigned s = stripe_blockcount_get(new_s, i);
-
-                       /*
-                        * gc recalculates this field from stripe ptr
-                        * references:
-                        */
-                       if (!gc)
-                               m->block_sectors[i] = s;
-                       m->blocks_nonempty += !!s;
+                       m->block_sectors[i] =
+                               stripe_blockcount_get(new_s, i);
+                       m->blocks_nonempty += !!m->block_sectors[i];
                }
 
-               if (gc && old_s)
-                       update_replicas(c, fs_usage, &m->r.e,
-                                       -((s64) m->sectors * m->nr_redundant));
-
                bch2_bkey_to_replicas(&m->r.e, new);
 
-               if (gc)
-                       update_replicas(c, fs_usage, &m->r.e,
-                                       ((s64) m->sectors * m->nr_redundant));
-
                if (!gc) {
                        spin_lock(&c->ec_stripes_heap_lock);
                        bch2_stripes_heap_update(c, m, idx);
@@ -1248,6 +1207,25 @@ static int bch2_mark_stripe(struct bch_fs *c,
                }
        }
 
+       if (gc) {
+               /*
+                * gc recalculates this field from stripe ptr
+                * references:
+                */
+               memset(m->block_sectors, 0, sizeof(m->block_sectors));
+               m->blocks_nonempty = 0;
+
+               for (i = 0; i < new_s->nr_blocks; i++) {
+                       ret = mark_stripe_bucket(c, new, i, fs_usage,
+                                                journal_seq, flags);
+                       if (ret)
+                               return ret;
+               }
+
+               update_replicas(c, fs_usage, &m->r.e,
+                               ((s64) m->sectors * m->nr_redundant));
+       }
+
        return 0;
 }
 
@@ -1271,6 +1249,7 @@ static int bch2_mark_key_locked(struct bch_fs *c,
 
        switch (k.k->type) {
        case KEY_TYPE_alloc:
+       case KEY_TYPE_alloc_v2:
                ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags);
                break;
        case KEY_TYPE_btree_ptr:
@@ -1539,9 +1518,10 @@ static int trans_get_key(struct btree_trans *trans,
        return ret;
 }
 
-static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
-                                        const struct bch_extent_ptr *ptr,
-                                        struct bkey_alloc_unpacked *u)
+static struct bkey_alloc_buf *
+bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter,
+                             const struct bch_extent_ptr *ptr,
+                             struct bkey_alloc_unpacked *u)
 {
        struct bch_fs *c = trans->c;
        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
@@ -1549,8 +1529,13 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree
        struct bucket *g;
        struct btree_iter *iter;
        struct bkey_s_c k;
+       struct bkey_alloc_buf *a;
        int ret;
 
+       a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
+       if (IS_ERR(a))
+               return a;
+
        iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k);
        if (iter) {
                *u = bch2_alloc_unpack(k);
@@ -1562,17 +1547,17 @@ static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree
                ret = bch2_btree_iter_traverse(iter);
                if (ret) {
                        bch2_trans_iter_put(trans, iter);
-                       return ret;
+                       return ERR_PTR(ret);
                }
 
                percpu_down_read(&c->mark_lock);
                g = bucket(ca, pos.offset);
-               *u = alloc_mem_to_key(g, READ_ONCE(g->mark));
+               *u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
                percpu_up_read(&c->mark_lock);
        }
 
        *_iter = iter;
-       return 0;
+       return a;
 }
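
[Editor's note: bch2_trans_start_alloc_update() now returns either a buffer or an encoded errno through a single pointer, using the kernel's ERR_PTR()/IS_ERR()/PTR_ERR() convention: small negative errnos live in the top 4095 values of the address space, where no valid allocation can land. A simplified userspace rendition of that convention:]

#include <errno.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static inline void *err_ptr(long err)      { return (void *) err; }
static inline long  ptr_err(const void *p) { return (long) p; }
static inline int   is_err(const void *p)
{
	return (unsigned long) p >= (unsigned long) -MAX_ERRNO;
}

static void *start_update(size_t size)
{
	void *buf = malloc(size);

	/* caller pattern: if (is_err(a)) return ptr_err(a); */
	return buf ? buf : err_ptr(-ENOMEM);
}
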
 
 static int bch2_trans_mark_pointer(struct btree_trans *trans,
@@ -1582,27 +1567,20 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        struct btree_iter *iter;
        struct bkey_alloc_unpacked u;
-       struct bkey_i_alloc *a;
+       struct bkey_alloc_buf *a;
        int ret;
 
-       ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
-       if (ret)
-               return ret;
+       a = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u);
+       if (IS_ERR(a))
+               return PTR_ERR(a);
 
        ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type,
                             &u.dirty_sectors, &u.cached_sectors);
        if (ret)
                goto out;
 
-       a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
-       ret = PTR_ERR_OR_ZERO(a);
-       if (ret)
-               goto out;
-
-       bkey_alloc_init(&a->k_i);
-       a->k.p = iter->pos;
-       bch2_alloc_pack(a, u);
-       bch2_trans_update(trans, iter, &a->k_i, 0);
+       bch2_alloc_pack(c, a, u);
+       bch2_trans_update(trans, iter, &a->k, 0);
 out:
        bch2_trans_iter_put(trans, iter);
        return ret;
@@ -1713,34 +1691,51 @@ static int bch2_trans_mark_extent(struct btree_trans *trans,
 }
 
 static int bch2_trans_mark_stripe_alloc_ref(struct btree_trans *trans,
-                                           const struct bch_extent_ptr *ptr,
-                                           s64 sectors, bool parity)
+                                           struct bkey_s_c_stripe s,
+                                           unsigned idx, bool deleting)
 {
-       struct bkey_i_alloc *a;
+       struct bch_fs *c = trans->c;
+       const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
+       struct bkey_alloc_buf *a;
        struct btree_iter *iter;
        struct bkey_alloc_unpacked u;
-       int ret;
+       bool parity = idx >= s.v->nr_blocks - s.v->nr_redundant;
+       int ret = 0;
 
-       ret = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
-       if (ret)
-               return ret;
+       a = bch2_trans_start_alloc_update(trans, &iter, ptr, &u);
+       if (IS_ERR(a))
+               return PTR_ERR(a);
 
        if (parity) {
+               s64 sectors = le16_to_cpu(s.v->sectors);
+
+               if (deleting)
+                       sectors = -sectors;
+
                u.dirty_sectors += sectors;
                u.data_type = u.dirty_sectors
                        ? BCH_DATA_parity
                        : 0;
        }
 
-       a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
-       ret = PTR_ERR_OR_ZERO(a);
-       if (ret)
-               goto err;
+       if (!deleting) {
+               if (bch2_fs_inconsistent_on(u.stripe && u.stripe != s.k->p.offset, c,
+                               "bucket %llu:%llu gen %u: multiple stripes using same bucket (%u, %llu)",
+                               iter->pos.inode, iter->pos.offset, u.gen,
+                               u.stripe, s.k->p.offset)) {
+                       ret = -EIO;
+                       goto err;
+               }
 
-       bkey_alloc_init(&a->k_i);
-       a->k.p = iter->pos;
-       bch2_alloc_pack(a, u);
-       bch2_trans_update(trans, iter, &a->k_i, 0);
+               u.stripe                = s.k->p.offset;
+               u.stripe_redundancy     = s.v->nr_redundant;
+       } else {
+               u.stripe                = 0;
+               u.stripe_redundancy     = 0;
+       }
+
+       bch2_alloc_pack(c, a, u);
+       bch2_trans_update(trans, iter, &a->k, 0);
 err:
        bch2_trans_iter_put(trans, iter);
        return ret;
@@ -1750,51 +1745,50 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans,
                                  struct bkey_s_c old, struct bkey_s_c new,
                                  unsigned flags)
 {
-       const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
-               ? bkey_s_c_to_stripe(old).v : NULL;
-       const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
-               ? bkey_s_c_to_stripe(new).v : NULL;
+       struct bkey_s_c_stripe old_s = { NULL };
+       struct bkey_s_c_stripe new_s = { NULL };
        struct bch_replicas_padded r;
        unsigned i;
        int ret = 0;
 
+       if (old.k->type == KEY_TYPE_stripe)
+               old_s = bkey_s_c_to_stripe(old);
+       if (new.k->type == KEY_TYPE_stripe)
+               new_s = bkey_s_c_to_stripe(new);
+
        /*
         * If the pointers aren't changing, we don't need to do anything:
         */
-       if (new_s && old_s &&
-           !memcmp(old_s->ptrs, new_s->ptrs,
-                   new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
+       if (new_s.k && old_s.k &&
+           new_s.v->nr_blocks          == old_s.v->nr_blocks &&
+           new_s.v->nr_redundant       == old_s.v->nr_redundant &&
+           !memcmp(old_s.v->ptrs, new_s.v->ptrs,
+                   new_s.v->nr_blocks * sizeof(struct bch_extent_ptr)))
                return 0;
 
-       if (new_s) {
-               unsigned nr_data = new_s->nr_blocks - new_s->nr_redundant;
-               s64 sectors = le16_to_cpu(new_s->sectors);
+       if (new_s.k) {
+               s64 sectors = le16_to_cpu(new_s.v->sectors);
 
                bch2_bkey_to_replicas(&r.e, new);
-               update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
-
-               for (i = 0; i < new_s->nr_blocks; i++) {
-                       bool parity = i >= nr_data;
+               update_replicas_list(trans, &r.e, sectors * new_s.v->nr_redundant);
 
-                       ret = bch2_trans_mark_stripe_alloc_ref(trans,
-                                       &new_s->ptrs[i], sectors, parity);
+               for (i = 0; i < new_s.v->nr_blocks; i++) {
+                       ret = bch2_trans_mark_stripe_alloc_ref(trans, new_s,
+                                                              i, false);
                        if (ret)
                                return ret;
                }
        }
 
-       if (old_s) {
-               unsigned nr_data = old_s->nr_blocks - old_s->nr_redundant;
-               s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
+       if (old_s.k) {
+               s64 sectors = -((s64) le16_to_cpu(old_s.v->sectors));
 
                bch2_bkey_to_replicas(&r.e, old);
-               update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
-
-               for (i = 0; i < old_s->nr_blocks; i++) {
-                       bool parity = i >= nr_data;
+               update_replicas_list(trans, &r.e, sectors * old_s.v->nr_redundant);
 
-                       ret = bch2_trans_mark_stripe_alloc_ref(trans,
-                                       &old_s->ptrs[i], sectors, parity);
+               for (i = 0; i < old_s.v->nr_blocks; i++) {
+                       ret = bch2_trans_mark_stripe_alloc_ref(trans, old_s,
+                                                              i, true);
                        if (ret)
                                return ret;
                }
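
[Editor's note: the "pointers aren't changing" test above now compares nr_blocks and nr_redundant before the memcmp(), since comparing variable-length ptr arrays of different lengths would overread the shorter key. The guard in isolation, with stand-in types:]

#include <stdbool.h>
#include <string.h>

struct extent_ptr { unsigned dev; unsigned long long offset; };

static bool stripe_ptrs_unchanged(const struct extent_ptr *old_ptrs, unsigned old_nr,
				  const struct extent_ptr *new_ptrs, unsigned new_nr)
{
	/* lengths must match before memcmp() is safe */
	return old_nr == new_nr &&
		!memcmp(old_ptrs, new_ptrs, new_nr * sizeof(*new_ptrs));
}
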
@@ -2065,21 +2059,16 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        struct btree_iter *iter;
        struct bkey_alloc_unpacked u;
-       struct bkey_i_alloc *a;
+       struct bkey_alloc_buf *a;
        struct bch_extent_ptr ptr = {
                .dev = ca->dev_idx,
                .offset = bucket_to_sector(ca, b),
        };
        int ret = 0;
 
-       a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
-       ret = PTR_ERR_OR_ZERO(a);
-       if (ret)
-               return ret;
-
-       ret = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
-       if (ret)
-               return ret;
+       a = bch2_trans_start_alloc_update(trans, &iter, &ptr, &u);
+       if (IS_ERR(a))
+               return PTR_ERR(a);
 
        if (u.data_type && u.data_type != type) {
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
@@ -2112,10 +2101,8 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
        u.data_type     = type;
        u.dirty_sectors = sectors;
 
-       bkey_alloc_init(&a->k_i);
-       a->k.p = iter->pos;
-       bch2_alloc_pack(a, u);
-       bch2_trans_update(trans, iter, &a->k_i, 0);
+       bch2_alloc_pack(c, a, u);
+       bch2_trans_update(trans, iter, &a->k, 0);
 out:
        bch2_trans_iter_put(trans, iter);
        return ret;
@@ -2422,13 +2409,24 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
                sizeof(struct bucket_array) +
                ca->mi.nbuckets * sizeof(struct bucket));
 
-       free_percpu(ca->usage[0]);
+       for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+               free_percpu(ca->usage[i]);
+       kfree(ca->usage_base);
 }
 
 int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
 {
-       if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
+       unsigned i;
+
+       ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
+       if (!ca->usage_base)
                return -ENOMEM;
 
+       for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
+               ca->usage[i] = alloc_percpu(struct bch_dev_usage);
+               if (!ca->usage[i])
+                       return -ENOMEM;
+       }
+
        return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
 }
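
[Editor's note: on a mid-loop allocation failure bch2_dev_buckets_alloc() returns -ENOMEM with only some usage[] slots populated, relying on the caller to unwind through bch2_dev_buckets_free(), which is safe on NULL slots. The pairing in miniature, with invented names:]

#include <stdlib.h>

#define NR_USAGE 2	/* stand-in for ARRAY_SIZE(ca->usage) */

struct dev_sketch { void *usage_base; void *usage[NR_USAGE]; };

static void dev_buckets_free_sketch(struct dev_sketch *d)
{
	for (int i = 0; i < NR_USAGE; i++)
		free(d->usage[i]);	/* free(NULL) is a no-op */
	free(d->usage_base);
}

static int dev_buckets_alloc_sketch(struct dev_sketch *d)
{
	if (!(d->usage_base = calloc(1, 64)))
		return -1;

	for (int i = 0; i < NR_USAGE; i++)
		if (!(d->usage[i] = calloc(1, 64)))
			return -1;	/* caller unwinds via the free path */
	return 0;
}
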
index 37346240cb7b025d87559bf5758d4be8f8f8a8e4..6d15c455e7cc302f88a7049dce960b86b1118f9c 100644 (file)
@@ -58,20 +58,13 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
        return __bucket(ca, b, false);
 }
 
-static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
-{
-       return c->bucket_clock[rw].hand - g->io_time[rw];
-}
-
 /*
  * bucket_gc_gen() returns the difference between the bucket's current gen and
  * the oldest gen of any pointer into that bucket in the btree.
  */
 
-static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b)
+static inline u8 bucket_gc_gen(struct bucket *g)
 {
-       struct bucket *g = bucket(ca, b);
-
        return g->mark.gen - g->oldest_gen;
 }
 
@@ -169,8 +162,6 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m,
 
 struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
 
-void bch2_dev_usage_from_buckets(struct bch_fs *);
-
 static inline u64 __dev_buckets_available(struct bch_dev *ca,
                                          struct bch_dev_usage stats)
 {
@@ -214,6 +205,11 @@ static inline unsigned fs_usage_u64s(struct bch_fs *c)
                READ_ONCE(c->replicas.nr);
 }
 
+static inline unsigned dev_usage_u64s(void)
+{
+       return sizeof(struct bch_dev_usage) / sizeof(u64);
+}
+
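
[Editor's note: dev_usage_u64s() assumes sizeof(struct bch_dev_usage) is a whole multiple of 8, which holds because the struct is all 64-bit counters. A sketch making that assumption explicit, with an invented struct:]

#include <stdint.h>

struct dev_usage_sketch { uint64_t buckets, sectors, fragmented; };

_Static_assert(sizeof(struct dev_usage_sketch) % sizeof(uint64_t) == 0,
	       "usage counters must pack into whole u64s");

static inline unsigned usage_u64s_sketch(void)
{
	return sizeof(struct dev_usage_sketch) / sizeof(uint64_t);
}
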
 void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *);
 struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *);
 
index 5fbe940a5f6fa2818b4764d3663abe1d2600e19c..404c89a7a264a2b55be7ff0cb515e74e2cbc47b3 100644 (file)
@@ -37,11 +37,12 @@ struct bucket {
                const struct bucket_mark mark;
        };
 
-       u16                             io_time[2];
+       u64                             io_time[2];
        u8                              oldest_gen;
        u8                              gc_gen;
        unsigned                        gen_valid:1;
-       u8                              ec_redundancy;
+       u8                              stripe_redundancy;
+       u32                             stripe;
 };
 
 struct bucket_array {
index 1d1590de55e85b9a484882104057ecfc7a7a6478..4324cfe7eed0de48ef2a26d97b024d3bb5d712b5 100644 (file)
@@ -19,7 +19,7 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer)
 
        spin_lock(&clock->timer_lock);
 
-       if (time_after_eq((unsigned long) atomic_long_read(&clock->now),
+       if (time_after_eq((unsigned long) atomic64_read(&clock->now),
                          timer->expire)) {
                spin_unlock(&clock->timer_lock);
                timer->fn(timer);
@@ -146,7 +146,7 @@ static struct io_timer *get_expired_timer(struct io_clock *clock,
 void __bch2_increment_clock(struct io_clock *clock, unsigned sectors)
 {
        struct io_timer *timer;
-       unsigned long now = atomic_long_add_return(sectors, &clock->now);
+       unsigned long now = atomic64_add_return(sectors, &clock->now);
 
        while ((timer = get_expired_timer(clock, now)))
                timer->fn(timer);
@@ -158,7 +158,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock)
        unsigned i;
 
        spin_lock(&clock->timer_lock);
-       now = atomic_long_read(&clock->now);
+       now = atomic64_read(&clock->now);
 
        for (i = 0; i < clock->timers.used; i++)
                pr_buf(out, "%ps:\t%li\n",
@@ -175,7 +175,7 @@ void bch2_io_clock_exit(struct io_clock *clock)
 
 int bch2_io_clock_init(struct io_clock *clock)
 {
-       atomic_long_set(&clock->now, 0);
+       atomic64_set(&clock->now, 0);
        spin_lock_init(&clock->timer_lock);
 
        clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus();
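
[Editor's note: atomic_long_t is only 32 bits on 32-bit hosts, so a sector-granularity IO clock could wrap after 2^32 sectors (2 TiB of IO); atomic64_t keeps it monotonic everywhere, which is also what lets bucket io_time[] grow from u16 to u64 in the buckets_types.h hunk above. The widened clock in userspace C11 form, with stand-in names:]

#include <stdatomic.h>
#include <stdint.h>

struct io_clock_sketch { _Atomic uint64_t now; };

static uint64_t increment_clock(struct io_clock_sketch *clock, unsigned sectors)
{
	/* like atomic64_add_return(): returns the post-add time */
	return atomic_fetch_add(&clock->now, sectors) + sectors;
}
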
index 92c740a475656da2093528b802320ad73eabbe46..5fae0012d808f7a1b5f4e5334804eee50c31d577 100644 (file)
@@ -26,7 +26,7 @@ struct io_timer {
 typedef HEAP(struct io_timer *)        io_timer_heap;
 
 struct io_clock {
-       atomic_long_t           now;
+       atomic64_t              now;
        u16 __percpu            *pcpu_buf;
        unsigned                max_slop;
 
index 086897c3bdc363d72ae582daea396867d786276b..10d55fc81bde764734998d4b12719fc9ff24ed4a 100644 (file)
@@ -105,6 +105,9 @@ const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
        const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
 
+       if (!bkey_cmp(k.k->p, POS_MIN))
+               return "stripe at pos 0";
+
        if (k.k->p.inode)
                return "invalid stripe key";
 
@@ -279,10 +282,14 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
                        struct bch_csum got = ec_block_checksum(buf, i, offset);
 
                        if (bch2_crc_cmp(want, got)) {
+                               char buf2[200];
+
+                               bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&buf->key.k_i));
+
                                bch_err_ratelimited(c,
-                                       "stripe checksum error at %u:%u: csum type %u, expected %llx got %llx",
-                                       i, j, v->csum_type,
-                                       want.lo, got.lo);
+                                       "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s",
+                                       (void *) _RET_IP_, i, j, v->csum_type,
+                                       want.lo, got.lo, buf2);
                                clear_bit(i, buf->valid);
                                break;
                        }
@@ -335,6 +342,8 @@ static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
 static void ec_block_endio(struct bio *bio)
 {
        struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio);
+       struct bch_stripe *v = &ec_bio->buf->key.v;
+       struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx];
        struct bch_dev *ca = ec_bio->ca;
        struct closure *cl = bio->bi_private;
 
@@ -343,6 +352,13 @@ static void ec_block_endio(struct bio *bio)
                               bch2_blk_status_to_str(bio->bi_status)))
                clear_bit(ec_bio->idx, ec_bio->buf->valid);
 
+       if (ptr_stale(ca, ptr)) {
+               bch_err_ratelimited(ca->fs,
+                                   "error %s stripe: stale pointer after io",
+                                   bio_data_dir(bio) == READ ? "reading from" : "writing to");
+               clear_bit(ec_bio->idx, ec_bio->buf->valid);
+       }
+
        bio_put(&ec_bio->bio);
        percpu_ref_put(&ca->io_ref);
        closure_put(cl);
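
[Editor's note: the new check above catches buckets that were reused while the stripe IO was in flight. ptr_stale() boils down to a wrapping generation comparison; a sketch, assuming the 8-bit gen arithmetic bcachefs uses elsewhere:]

#include <stdbool.h>
#include <stdint.h>

static bool gen_after(uint8_t a, uint8_t b)
{
	/* gens wrap at 256, so compare as a signed distance */
	return (int8_t) (a - b) > 0;
}

static bool ptr_stale_sketch(uint8_t bucket_gen, uint8_t ptr_gen)
{
	return gen_after(bucket_gen, ptr_gen);
}
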
@@ -652,7 +668,6 @@ void bch2_stripes_heap_update(struct bch_fs *c,
 
 static int ec_stripe_delete(struct bch_fs *c, size_t idx)
 {
-       //pr_info("deleting stripe %zu", idx);
        return bch2_btree_delete_range(c, BTREE_ID_EC,
                                       POS(0, idx),
                                       POS(0, idx + 1),
@@ -795,6 +810,7 @@ static void extent_stripe_ptr_add(struct bkey_s_extent e,
        *dst = (struct bch_extent_stripe_ptr) {
                .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr,
                .block          = block,
+               .redundancy     = s->key.v.nr_redundant,
                .idx            = s->key.k.p.offset,
        };
 }
@@ -1054,8 +1070,6 @@ void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
        if (!ob)
                return;
 
-       //pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset);
-
        ec = ob->ec;
        mutex_lock(&ec->lock);
 
@@ -1348,12 +1362,14 @@ static s64 get_existing_stripe(struct bch_fs *c,
        struct stripe *m;
        size_t heap_idx;
        u64 stripe_idx;
+       s64 ret = -1;
 
        if (may_create_new_stripe(c))
                return -1;
 
        spin_lock(&c->ec_stripes_heap_lock);
        for (heap_idx = 0; heap_idx < h->used; heap_idx++) {
+               /* No blocks worth reusing, stripe will just be deleted: */
                if (!h->data[heap_idx].blocks_nonempty)
                        continue;
 
@@ -1365,13 +1381,12 @@ static s64 get_existing_stripe(struct bch_fs *c,
                    m->sectors          == head->blocksize &&
                    m->blocks_nonempty  < m->nr_blocks - m->nr_redundant) {
                        bch2_stripes_heap_del(c, m, stripe_idx);
-                       spin_unlock(&c->ec_stripes_heap_lock);
-                       return stripe_idx;
+                       ret = stripe_idx;
+                       break;
                }
        }
-
        spin_unlock(&c->ec_stripes_heap_lock);
-       return -1;
+       return ret;
 }
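
[Editor's note: the refactor above replaces the mid-loop unlock-and-return with a break so the function has exactly one unlock site, the usual way to keep lock/unlock pairs obvious. The same shape as a stand-alone pthread example:]

#include <pthread.h>

static long first_nonempty(pthread_mutex_t *lock, const long *blocks, int n)
{
	long ret = -1;

	pthread_mutex_lock(lock);
	for (int i = 0; i < n; i++)
		if (blocks[i]) {
			ret = blocks[i];
			break;		/* every path reaches the one unlock */
		}
	pthread_mutex_unlock(lock);

	return ret;
}
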
 
 struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
index 67ba2c21627efd18a549c2df5efcf93f8d65bacb..4a3a3291a31b00b46d7a1f353a799e4c13bea742 100644 (file)
@@ -704,14 +704,8 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
                if (p.ptr.cached)
                        continue;
 
-               if (p.has_ec) {
-                       struct stripe *s =
-                               genradix_ptr(&c->stripes[0], p.ec.idx);
-
-                       WARN_ON(!s);
-                       if (s)
-                               replicas += s->nr_redundant;
-               }
+               if (p.has_ec)
+                       replicas += p.ec.redundancy;
 
                replicas++;
 
@@ -734,16 +728,9 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
        if (ca->mi.state != BCH_MEMBER_STATE_FAILED)
                durability = max_t(unsigned, durability, ca->mi.durability);
 
-       if (p.has_ec) {
-               struct stripe *s =
-                       genradix_ptr(&c->stripes[0], p.ec.idx);
-
-               if (WARN_ON(!s))
-                       goto out;
+       if (p.has_ec)
+               durability += p.ec.redundancy;
 
-               durability += s->nr_redundant;
-       }
-out:
        return durability;
 }
 
index a7c5f5fddedb40c0e150acf8ed18862075f856c0..e41f02773dd026f8bc50ebc93aaccf0cec6ca50d 100644 (file)
@@ -1121,6 +1121,9 @@ int bch2_fs_journal_init(struct journal *j)
        j->entry_u64s_reserved +=
                BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX);
 
+       j->entry_u64s_reserved +=
+               2 * (sizeof(struct jset_entry_clock) / sizeof(u64));
+
        atomic64_set(&j->reservations.counter,
                ((union journal_res_state)
                 { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
index eacc9b2c362fc476126871f19983a59f2e239037..2abca1644cdc2dc46b5369eba3c2203cbf408c7e 100644 (file)
@@ -5,6 +5,7 @@
 #include "btree_update_interior.h"
 #include "buckets.h"
 #include "checksum.h"
+#include "disk_groups.h"
 #include "error.h"
 #include "io.h"
 #include "journal.h"
@@ -426,6 +427,69 @@ fsck_err:
        return ret;
 }
 
+static int journal_entry_validate_clock(struct bch_fs *c,
+                                       struct jset *jset,
+                                       struct jset_entry *entry,
+                                       int write)
+{
+       struct jset_entry_clock *clock =
+               container_of(entry, struct jset_entry_clock, entry);
+       unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+       int ret = 0;
+
+       if (journal_entry_err_on(bytes != sizeof(*clock),
+                                c, "invalid journal entry clock: bad size")) {
+               journal_entry_null_range(entry, vstruct_next(entry));
+               return ret;
+       }
+
+       if (journal_entry_err_on(clock->rw > 1,
+                                c, "invalid journal entry clock: bad rw")) {
+               journal_entry_null_range(entry, vstruct_next(entry));
+               return ret;
+       }
+
+fsck_err:
+       return ret;
+}
+
+static int journal_entry_validate_dev_usage(struct bch_fs *c,
+                                           struct jset *jset,
+                                           struct jset_entry *entry,
+                                           int write)
+{
+       struct jset_entry_dev_usage *u =
+               container_of(entry, struct jset_entry_dev_usage, entry);
+       unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+       unsigned expected = sizeof(*u) + sizeof(u->d[0]) * 7; /* Current value of BCH_DATA_NR */
+       unsigned dev;
+       int ret = 0;
+
+       if (journal_entry_err_on(bytes < expected,
+                                c, "invalid journal entry dev usage: bad size (%u < %u)",
+                                bytes, expected)) {
+               journal_entry_null_range(entry, vstruct_next(entry));
+               return ret;
+       }
+
+       dev = le32_to_cpu(u->dev);
+
+       if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
+                                c, "invalid journal entry dev usage: bad dev")) {
+               journal_entry_null_range(entry, vstruct_next(entry));
+               return ret;
+       }
+
+       if (journal_entry_err_on(u->pad,
+                                c, "invalid journal entry dev usage: bad pad")) {
+               journal_entry_null_range(entry, vstruct_next(entry));
+               return ret;
+       }
+
+fsck_err:
+       return ret;
+}
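
[Editor's note: both validators size-check the entry before touching its fields because entry->u64s counts payload words only; jset_u64s() adds the header back in. The arithmetic with a simplified 8-byte header (the real struct jset_entry is larger):]

#include <stdint.h>

struct jset_entry_sketch { uint16_t u64s; uint8_t pad[6]; uint64_t data[]; };

static unsigned entry_bytes(const struct jset_entry_sketch *e)
{
	/* header word + payload words, all in 8-byte units */
	return (1 + e->u64s) * sizeof(uint64_t);
}
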
+
 struct jset_entry_ops {
        int (*validate)(struct bch_fs *, struct jset *,
                        struct jset_entry *, int);
@@ -937,6 +1001,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
                for (ptr = 0; ptr < i->nr_ptrs; ptr++)
                        replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
 
+               bch2_replicas_entry_sort(&replicas.e);
+
                /*
                 * If we're mounting in degraded mode - if we didn't read all
                 * the devices - this is wrong:
@@ -1032,16 +1098,20 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
                               unsigned sectors)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct bch_devs_mask devs;
        struct journal_device *ja;
        struct bch_dev *ca;
        struct dev_alloc_list devs_sorted;
+       unsigned target = c->opts.metadata_target ?:
+               c->opts.foreground_target;
        unsigned i, replicas = 0, replicas_want =
                READ_ONCE(c->opts.metadata_replicas);
 
        rcu_read_lock();
+retry:
+       devs = target_rw_devs(c, BCH_DATA_journal, target);
 
-       devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
-                                         &c->rw_devs[BCH_DATA_journal]);
+       devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
 
        __journal_write_alloc(j, w, &devs_sorted,
                              sectors, &replicas, replicas_want);
@@ -1073,6 +1143,12 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
 
        __journal_write_alloc(j, w, &devs_sorted,
                              sectors, &replicas, replicas_want);
+
+       if (replicas < replicas_want && target) {
+               /* Retry from all devices: */
+               target = 0;
+               goto retry;
+       }
 done:
        rcu_read_unlock();
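
[Editor's note: the retry above first restricts journal placement to metadata_target (falling back to foreground_target), and only widens to all rw devices if the target cannot supply enough replicas. The control flow in a toy model; place_replicas() is an invented stand-in for __journal_write_alloc():]

#include <errno.h>

struct alloc_ctx { unsigned metadata_target, foreground_target; };

static unsigned place_replicas(struct alloc_ctx *c, unsigned target, unsigned want)
{
	(void) c;
	/* toy model: a restricted target comes up one replica short */
	return target && want ? want - 1 : want;
}

static int journal_write_alloc_sketch(struct alloc_ctx *c, unsigned want)
{
	unsigned target = c->metadata_target ?: c->foreground_target;
	unsigned got;
retry:
	got = place_replicas(c, target, want);
	if (got < want && target) {
		target = 0;		/* second pass: any rw device */
		goto retry;
	}
	return got >= want ? 0 : -EROFS;
}
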
 
@@ -1278,6 +1354,9 @@ static void do_journal_write(struct closure *cl)
                bio->bi_private         = ca;
                bio->bi_opf             = REQ_OP_WRITE|REQ_SYNC|REQ_META;
 
+               BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
+               ca->prev_journal_sector = bio->bi_iter.bi_sector;
+
                if (!JSET_NO_FLUSH(w->data))
                        bio->bi_opf    |= REQ_FUA;
                if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
@@ -1348,8 +1427,8 @@ void bch2_journal_write(struct closure *cl)
 
        end     = bch2_btree_roots_to_journal_entries(c, jset->start, end);
 
-       end     = bch2_journal_super_entries_add_common(c, end,
-                                               le64_to_cpu(jset->seq));
+       bch2_journal_super_entries_add_common(c, &end,
+                               le64_to_cpu(jset->seq));
        u64s    = (u64 *) end - (u64 *) start;
        BUG_ON(u64s > j->entry_u64s_reserved);
 
@@ -1358,10 +1437,7 @@ void bch2_journal_write(struct closure *cl)
 
        journal_write_compact(jset);
 
-       jset->read_clock        = cpu_to_le16(c->bucket_clock[READ].hand);
-       jset->write_clock       = cpu_to_le16(c->bucket_clock[WRITE].hand);
        jset->magic             = cpu_to_le64(jset_magic(c));
-
        jset->version           = c->sb.version < bcachefs_metadata_version_new_versioning
                ? cpu_to_le32(BCH_JSET_VERSION_OLD)
                : cpu_to_le32(c->sb.version);
index d0acc1ee5cfeb5f06c7a85bbc5ad06992eb6bc19..f915b30ab6e042a2e180f57e21cec817276db632 100644 (file)
@@ -92,11 +92,8 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
                        data_opts->btree_insert_flags   = BTREE_INSERT_USE_RESERVE;
                        data_opts->rewrite_dev          = p.ptr.dev;
 
-                       if (p.has_ec) {
-                               struct stripe *m = genradix_ptr(&c->stripes[0], p.ec.idx);
-
-                               data_opts->nr_replicas += m->nr_redundant;
-                       }
+                       if (p.has_ec)
+                               data_opts->nr_replicas += p.ec.redundancy;
 
                        return DATA_REWRITE;
                }
@@ -179,12 +176,12 @@ static int bch2_copygc(struct bch_fs *c)
                            bucket_sectors_used(m) >= ca->mi.bucket_size)
                                continue;
 
-                       WARN_ON(m.stripe && !g->ec_redundancy);
+                       WARN_ON(m.stripe && !g->stripe_redundancy);
 
                        e = (struct copygc_heap_entry) {
                                .dev            = dev_idx,
                                .gen            = m.gen,
-                               .replicas       = 1 + g->ec_redundancy,
+                               .replicas       = 1 + g->stripe_redundancy,
                                .fragmentation  = bucket_sectors_used(m) * (1U << 15)
                                        / ca->mi.bucket_size,
                                .sectors        = bucket_sectors_used(m),
@@ -301,7 +298,7 @@ static int bch2_copygc_thread(void *arg)
 {
        struct bch_fs *c = arg;
        struct io_clock *clock = &c->io_clock[WRITE];
-       unsigned long last, wait;
+       u64 last, wait;
 
        set_freezable();
 
@@ -309,7 +306,7 @@ static int bch2_copygc_thread(void *arg)
                if (kthread_wait_freezable(c->copy_gc_enabled))
                        break;
 
-               last = atomic_long_read(&clock->now);
+               last = atomic64_read(&clock->now);
                wait = bch2_copygc_wait_amount(c);
 
                if (wait > clock->max_slop) {
index 710a7ee6703922e01af1cb8a5a4bb649c48631d7..d835a85338c65b887c21661ccc9dd07887b5610f 100644 (file)
@@ -136,6 +136,11 @@ enum opt_type {
          OPT_STR(bch2_str_hash_types),                                 \
          BCH_SB_STR_HASH_TYPE,         BCH_STR_HASH_OPT_SIPHASH,       \
          NULL,         "Hash function for directory entries and xattrs")\
+       x(metadata_target,              u16,                            \
+         OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,                   \
+         OPT_FN(bch2_opt_target),                                      \
+         BCH_SB_METADATA_TARGET,       0,                              \
+         "(target)",   "Device or disk group for metadata writes")     \
        x(foreground_target,            u16,                            \
          OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE,                   \
          OPT_FN(bch2_opt_target),                                      \
index c3373c48fa8136c611213529adaf2fd6d606bf83..d89920b848ee936ef3f05124919cfc17c480540c 100644 (file)
@@ -169,12 +169,12 @@ static int bch2_rebalance_thread(void *arg)
        unsigned long start, prev_start;
        unsigned long prev_run_time, prev_run_cputime;
        unsigned long cputime, prev_cputime;
-       unsigned long io_start;
+       u64 io_start;
        long throttle;
 
        set_freezable();
 
-       io_start        = atomic_long_read(&clock->now);
+       io_start        = atomic64_read(&clock->now);
        p               = rebalance_work(c);
        prev_start      = jiffies;
        prev_cputime    = curr_cputime();
@@ -210,7 +210,7 @@ static int bch2_rebalance_thread(void *arg)
                                        (20 - w.dev_most_full_percent),
                                        50);
 
-                       if (atomic_long_read(&clock->now) + clock->max_slop <
+                       if (atomic64_read(&clock->now) + clock->max_slop <
                            r->throttled_until_iotime) {
                                r->throttled_until_cputime = start + throttle;
                                r->state = REBALANCE_THROTTLED;
@@ -229,7 +229,7 @@ static int bch2_rebalance_thread(void *arg)
                              max(p.dev_most_full_percent, 1U) /
                              max(w.dev_most_full_percent, 1U));
 
-               io_start        = atomic_long_read(&clock->now);
+               io_start        = atomic64_read(&clock->now);
                p               = w;
                prev_start      = start;
                prev_cputime    = cputime;
@@ -274,7 +274,7 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c)
        case REBALANCE_THROTTLED:
                bch2_hprint(&PBUF(h1),
                            (r->throttled_until_iotime -
-                            atomic_long_read(&c->io_clock[WRITE].now)) << 9);
+                            atomic64_read(&c->io_clock[WRITE].now)) << 9);
                pr_buf(out, "throttled for %lu sec or %s io\n",
                       (r->throttled_until_cputime - jiffies) / HZ,
                       h1);
index 192c6be20cedd841311518fbee9028f07f09b23b..2f62a643c39fbb0c08f024fbf58a7f3325755875 100644 (file)
@@ -17,7 +17,7 @@ struct bch_fs_rebalance {
        atomic64_t              work_unknown_dev;
 
        enum rebalance_state    state;
-       unsigned long           throttled_until_iotime;
+       u64                     throttled_until_iotime;
        unsigned long           throttled_until_cputime;
        struct bch_move_stats   move_stats;
 
index f470e0e233ce949c46480cb57242fae6d34074d3..7ba098adcab9b20f2e70e8aeeaaf28a868828cb9 100644 (file)
@@ -825,10 +825,31 @@ static int journal_replay_entry_early(struct bch_fs *c,
        case BCH_JSET_ENTRY_data_usage: {
                struct jset_entry_data_usage *u =
                        container_of(entry, struct jset_entry_data_usage, entry);
+
                ret = bch2_replicas_set_usage(c, &u->r,
                                              le64_to_cpu(u->v));
                break;
        }
+       case BCH_JSET_ENTRY_dev_usage: {
+               struct jset_entry_dev_usage *u =
+                       container_of(entry, struct jset_entry_dev_usage, entry);
+               struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
+               unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+               unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) /
+                       sizeof(struct jset_entry_dev_usage_type);
+               unsigned i;
+
+               ca->usage_base->buckets_ec              = le64_to_cpu(u->buckets_ec);
+               ca->usage_base->buckets_unavailable     = le64_to_cpu(u->buckets_unavailable);
+
+               for (i = 0; i < nr_types; i++) {
+                       ca->usage_base->d[i].buckets    = le64_to_cpu(u->d[i].buckets);
+                       ca->usage_base->d[i].sectors    = le64_to_cpu(u->d[i].sectors);
+                       ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented);
+               }
+
+               break;
+       }
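
[Editor's note: nr_types is derived from the entry's size rather than from BCH_DATA_NR, so journal entries written with a different number of data types still replay within the bounds they actually carry. The layout math, with stand-in structs:]

#include <stdint.h>

struct usage_type_sketch { uint64_t buckets, sectors, fragmented; };

struct dev_usage_sketch {
	uint64_t hdr[2];		/* stand-in for struct jset_entry */
	uint32_t dev, pad;
	uint64_t buckets_ec, buckets_unavailable;
	struct usage_type_sketch d[];	/* count implied by entry size */
};

static unsigned nr_types_sketch(unsigned entry_bytes)
{
	return (entry_bytes - sizeof(struct dev_usage_sketch)) /
		sizeof(struct usage_type_sketch);
}
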
        case BCH_JSET_ENTRY_blacklist: {
                struct jset_entry_blacklist *bl_entry =
                        container_of(entry, struct jset_entry_blacklist, entry);
@@ -847,6 +868,12 @@ static int journal_replay_entry_early(struct bch_fs *c,
                                le64_to_cpu(bl_entry->end) + 1);
                break;
        }
+       case BCH_JSET_ENTRY_clock: {
+               struct jset_entry_clock *clock =
+                       container_of(entry, struct jset_entry_clock, entry);
+
+               atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
+       }
        }
 
        return ret;
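
[Editor's note: jset_entry_clock.time is a little-endian on-disk field, hence the le64_to_cpu() in the clock case above and the matching cpu_to_le64() on the write side in super-io.c. What the conversion amounts to, as a portable sketch:]

#include <stdint.h>

static inline uint64_t le64_to_cpu_sketch(uint64_t le)
{
#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	return __builtin_bswap64(le);	/* byte-swap on big-endian hosts */
#else
	return le;			/* no-op on little-endian hosts */
#endif
}
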
@@ -861,9 +888,6 @@ static int journal_replay_early(struct bch_fs *c,
        int ret;
 
        if (clean) {
-               c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
-               c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
-
                for (entry = clean->start;
                     entry != vstruct_end(&clean->field);
                     entry = vstruct_next(entry)) {
@@ -876,9 +900,6 @@ static int journal_replay_early(struct bch_fs *c,
                        if (i->ignore)
                                continue;
 
-                       c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock);
-                       c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock);
-
                        vstruct_for_each(&i->j, entry) {
                                ret = journal_replay_entry_early(c, entry);
                                if (ret)
@@ -942,13 +963,6 @@ static int verify_superblock_clean(struct bch_fs *c,
                return 0;
        }
 
-       mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
-                       "superblock read clock %u doesn't match journal %u after clean shutdown",
-                       clean->read_clock, j->read_clock);
-       mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
-                       "superblock write clock %u doesn't match journal %u after clean shutdown",
-                       clean->write_clock, j->write_clock);
-
        for (i = 0; i < BTREE_ID_NR; i++) {
                char buf1[200], buf2[200];
                struct bkey_i *k1, *k2;
index ce8b7355b349d1c110bda8afd1c2188158cd5d56..3970c442f19928038528e422f56141b5b3864159 100644 (file)
@@ -26,7 +26,7 @@ static void verify_replicas_entry(struct bch_replicas_entry *e)
 #endif
 }
 
-static void replicas_entry_sort(struct bch_replicas_entry *e)
+void bch2_replicas_entry_sort(struct bch_replicas_entry *e)
 {
        bubble_sort(e->devs, e->nr_devs, u8_cmp);
 }
@@ -122,7 +122,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
                break;
        }
 
-       replicas_entry_sort(e);
+       bch2_replicas_entry_sort(e);
 }
 
 void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
@@ -142,7 +142,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
        for (i = 0; i < devs.nr; i++)
                e->devs[e->nr_devs++] = devs.devs[i];
 
-       replicas_entry_sort(e);
+       bch2_replicas_entry_sort(e);
 }
 
 static struct bch_replicas_cpu
@@ -197,7 +197,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
 int bch2_replicas_entry_idx(struct bch_fs *c,
                            struct bch_replicas_entry *search)
 {
-       replicas_entry_sort(search);
+       bch2_replicas_entry_sort(search);
 
        return __replicas_entry_idx(&c->replicas, search);
 }
@@ -681,7 +681,7 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
        for_each_replicas_entry(sb_r, e) {
                dst = cpu_replicas_entry(cpu_r, idx++);
                memcpy(dst, e, replicas_entry_bytes(e));
-               replicas_entry_sort(dst);
+               bch2_replicas_entry_sort(dst);
        }
 
        return 0;
@@ -718,7 +718,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
                dst->nr_devs    = e->nr_devs;
                dst->nr_required = 1;
                memcpy(dst->devs, e->devs, e->nr_devs);
-               replicas_entry_sort(dst);
+               bch2_replicas_entry_sort(dst);
        }
 
        return 0;
index 8b95164fbb56636fbee194ae4d61a0e4f63178bc..a16ef23bde8af4fdf7d9113601c504a04c5b3853 100644 (file)
@@ -5,6 +5,7 @@
 #include "eytzinger.h"
 #include "replicas_types.h"
 
+void bch2_replicas_entry_sort(struct bch_replicas_entry *);
 void bch2_replicas_entry_to_text(struct printbuf *,
                                 struct bch_replicas_entry *);
 void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
index 751efd28b672bd3fceedfade1b4d5af95ab1392c..a510a25e2edbf28363d7b414912926119eaab99a 100644 (file)
@@ -963,31 +963,28 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
        return ret;
 }
 
-static void
-entry_init_u64s(struct jset_entry *entry, unsigned u64s)
+static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
 {
-       memset(entry, 0, u64s * sizeof(u64));
+       struct jset_entry *entry = *end;
+       unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
 
+       memset(entry, 0, u64s * sizeof(u64));
        /*
         * The u64s field counts from the start of data, ignoring the shared
         * fields.
         */
        entry->u64s = u64s - 1;
-}
 
-static void
-entry_init_size(struct jset_entry *entry, size_t size)
-{
-       unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
-       entry_init_u64s(entry, u64s);
+       *end = vstruct_next(*end);
+       return entry;
 }
 
-struct jset_entry *
-bch2_journal_super_entries_add_common(struct bch_fs *c,
-                                     struct jset_entry *entry,
-                                     u64 journal_seq)
+void bch2_journal_super_entries_add_common(struct bch_fs *c,
+                                          struct jset_entry **end,
+                                          u64 journal_seq)
 {
-       unsigned i;
+       struct bch_dev *ca;
+       unsigned i, dev;
 
        percpu_down_write(&c->mark_lock);
 
@@ -1000,58 +997,77 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
 
        {
                struct jset_entry_usage *u =
-                       container_of(entry, struct jset_entry_usage, entry);
+                       container_of(jset_entry_init(end, sizeof(*u)),
+                                    struct jset_entry_usage, entry);
 
-               entry_init_size(entry, sizeof(*u));
                u->entry.type   = BCH_JSET_ENTRY_usage;
                u->entry.btree_id = FS_USAGE_INODES;
                u->v            = cpu_to_le64(c->usage_base->nr_inodes);
-
-               entry = vstruct_next(entry);
        }
 
        {
                struct jset_entry_usage *u =
-                       container_of(entry, struct jset_entry_usage, entry);
+                       container_of(jset_entry_init(end, sizeof(*u)),
+                                    struct jset_entry_usage, entry);
 
-               entry_init_size(entry, sizeof(*u));
                u->entry.type   = BCH_JSET_ENTRY_usage;
                u->entry.btree_id = FS_USAGE_KEY_VERSION;
                u->v            = cpu_to_le64(atomic64_read(&c->key_version));
-
-               entry = vstruct_next(entry);
        }
 
        for (i = 0; i < BCH_REPLICAS_MAX; i++) {
                struct jset_entry_usage *u =
-                       container_of(entry, struct jset_entry_usage, entry);
+                       container_of(jset_entry_init(end, sizeof(*u)),
+                                    struct jset_entry_usage, entry);
 
-               entry_init_size(entry, sizeof(*u));
                u->entry.type   = BCH_JSET_ENTRY_usage;
                u->entry.btree_id = FS_USAGE_RESERVED;
                u->entry.level  = i;
                u->v            = cpu_to_le64(c->usage_base->persistent_reserved[i]);
-
-               entry = vstruct_next(entry);
        }
 
        for (i = 0; i < c->replicas.nr; i++) {
                struct bch_replicas_entry *e =
                        cpu_replicas_entry(&c->replicas, i);
                struct jset_entry_data_usage *u =
-                       container_of(entry, struct jset_entry_data_usage, entry);
+                       container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
+                                    struct jset_entry_data_usage, entry);
 
-               entry_init_size(entry, sizeof(*u) + e->nr_devs);
                u->entry.type   = BCH_JSET_ENTRY_data_usage;
                u->v            = cpu_to_le64(c->usage_base->replicas[i]);
                memcpy(&u->r, e, replicas_entry_bytes(e));
+       }
 
-               entry = vstruct_next(entry);
+       for_each_member_device(ca, c, dev) {
+               unsigned b = sizeof(struct jset_entry_dev_usage) +
+                       sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
+               struct jset_entry_dev_usage *u =
+                       container_of(jset_entry_init(end, b),
+                                    struct jset_entry_dev_usage, entry);
+
+               u->entry.type = BCH_JSET_ENTRY_dev_usage;
+               u->dev = cpu_to_le32(dev);
+               u->buckets_ec           = cpu_to_le64(ca->usage_base->buckets_ec);
+               u->buckets_unavailable  = cpu_to_le64(ca->usage_base->buckets_unavailable);
+
+               for (i = 0; i < BCH_DATA_NR; i++) {
+                       u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
+                       u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
+                       u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
+               }
        }
 
        percpu_up_write(&c->mark_lock);
 
-       return entry;
+       for (i = 0; i < 2; i++) {
+               struct jset_entry_clock *clock =
+                       container_of(jset_entry_init(end, sizeof(*clock)),
+                                    struct jset_entry_clock, entry);
+
+               clock->entry.type = BCH_JSET_ENTRY_clock;
+               clock->rw       = i;
+               clock->time     = cpu_to_le64(atomic64_read(&c->io_clock[i].now));
+       }
 }
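
[Editor's note: jset_entry_init() folds the old entry_init_u64s()/entry_init_size() pair and the repeated "entry = vstruct_next(entry)" advance into one bump-allocator step: carve a zeroed entry off the end pointer and move the end past it. A self-contained sketch with a simplified header layout:]

#include <stdint.h>
#include <string.h>

struct entry_sketch { uint16_t u64s; uint8_t type; uint8_t pad[5]; uint64_t data[]; };

static struct entry_sketch *entry_init_sketch(struct entry_sketch **end, size_t bytes)
{
	struct entry_sketch *e = *end;
	unsigned u64s = (bytes + 7) / 8;	/* DIV_ROUND_UP(bytes, 8) */

	memset(e, 0, u64s * 8);
	e->u64s = u64s - 1;			/* payload words; header excluded */

	*end = (struct entry_sketch *) ((uint64_t *) e + u64s);
	return e;
}
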
 
 void bch2_fs_mark_clean(struct bch_fs *c)
@@ -1080,15 +1096,13 @@ void bch2_fs_mark_clean(struct bch_fs *c)
        }
 
        sb_clean->flags         = 0;
-       sb_clean->read_clock    = cpu_to_le16(c->bucket_clock[READ].hand);
-       sb_clean->write_clock   = cpu_to_le16(c->bucket_clock[WRITE].hand);
        sb_clean->journal_seq   = cpu_to_le64(journal_cur_seq(&c->journal) - 1);
 
        /* Trying to catch outstanding bug: */
        BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX);
 
        entry = sb_clean->start;
-       entry = bch2_journal_super_entries_add_common(c, entry, 0);
+       bch2_journal_super_entries_add_common(c, &entry, 0);
        entry = bch2_btree_roots_to_journal_entries(c, entry, entry);
        BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
 
index 7a068158efcae906103b729488bc0ed83e2280a4..1a35124f5f47571f63afd3ef16ce35e184c7ef99 100644 (file)
@@ -122,9 +122,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
 
 /* BCH_SB_FIELD_clean: */
 
-struct jset_entry *
-bch2_journal_super_entries_add_common(struct bch_fs *,
-                                     struct jset_entry *, u64);
+void bch2_journal_super_entries_add_common(struct bch_fs *,
+                                          struct jset_entry **, u64);
 
 void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int);
 
index f3c12d89df58bf6a5739249a8e07cac575c3d2c7..ac277df8840798b647f08870e5fdddc7d9692afb 100644 (file)
@@ -148,6 +148,22 @@ struct bch_fs *bch2_uuid_to_fs(uuid_le uuid)
        return c;
 }
 
+static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       unsigned i, nr = 0, u64s =
+               (sizeof(struct jset_entry_dev_usage) +
+                sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR) /
+               sizeof(u64);
+
+       rcu_read_lock();
+       for_each_member_device_rcu(ca, c, i, NULL)
+               nr++;
+       rcu_read_unlock();
+
+       bch2_journal_entry_res_resize(&c->journal,
+                       &c->dev_usage_journal_res, u64s * nr);
+}
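
[Editor's note: the reservation scales with member count: every device contributes one dev_usage entry to each journal write, and bch2_journal_entry_res_resize() takes the size in u64s, not bytes. The math in isolation; hdr_bytes is an invented stand-in for sizeof(struct jset_entry_dev_usage):]

#include <stdint.h>

#define DATA_TYPES 7	/* stand-in for BCH_DATA_NR */

struct usage_type_sketch { uint64_t buckets, sectors, fragmented; };

static unsigned dev_usage_reserve_u64s(unsigned nr_devs, unsigned hdr_bytes)
{
	unsigned entry_bytes = hdr_bytes +
		sizeof(struct usage_type_sketch) * DATA_TYPES;

	return nr_devs * (entry_bytes / sizeof(uint64_t));
}
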
+
 /* Filesystem RO/RW: */
 
 /*
@@ -174,9 +190,6 @@ static void __bch2_fs_read_only(struct bch_fs *c)
        bch2_copygc_stop(c);
        bch2_gc_thread_stop(c);
 
-       bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
-       bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
-
        /*
         * Flush journal before stopping allocators, because flushing journal
         * blacklist entries involves allocating new btree nodes:
@@ -399,9 +412,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
                bch2_dev_allocator_add(c, ca);
        bch2_recalc_capacity(c);
 
-       bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale);
-       bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale);
-
        for_each_rw_member(ca, c, i) {
                ret = bch2_dev_allocator_start(ca);
                if (ret) {
@@ -779,6 +789,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
            bch2_fs_fsio_init(c))
                goto err;
 
+       bch2_dev_usage_journal_reserve(c);
+
        mi = bch2_sb_get_members(c->disk_sb.sb);
        for (i = 0; i < c->sb.nr_devices; i++)
                if (bch2_dev_exists(c->disk_sb.sb, mi, i) &&
@@ -1521,6 +1533,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
 
        mutex_unlock(&c->sb_lock);
        up_write(&c->state_lock);
+
+       bch2_dev_usage_journal_reserve(c);
        return 0;
 err:
        if (ca->mi.state == BCH_MEMBER_STATE_RW &&
@@ -1530,19 +1544,6 @@ err:
        return ret;
 }
 
-static void dev_usage_clear(struct bch_dev *ca)
-{
-       struct bucket_array *buckets;
-
-       percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0]));
-
-       down_read(&ca->bucket_lock);
-       buckets = bucket_array(ca);
-
-       memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets);
-       up_read(&ca->bucket_lock);
-}
-
 /* Add new device to running filesystem: */
 int bch2_dev_add(struct bch_fs *c, const char *path)
 {
@@ -1600,8 +1601,6 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
        if (ret)
                goto err;
 
-       dev_usage_clear(ca);
-
        down_write(&c->state_lock);
        mutex_lock(&c->sb_lock);
 
@@ -1655,6 +1654,8 @@ have_slot:
        bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
 
+       bch2_dev_usage_journal_reserve(c);
+
        err = "error marking superblock";
        ret = bch2_trans_mark_dev_sb(c, NULL, ca);
        if (ret)
index 80964bdf6237432af7fe3f91c000720d607f5950..f934f12bc677c7c6b9c92788bb9193bcc481e179 100644 (file)
@@ -705,7 +705,7 @@ static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca,
 {
        int rw = (private ? 1 : 0);
 
-       return bucket_last_io(c, bucket(ca, b), rw);
+       return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw];
 }
 
 static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
@@ -718,7 +718,7 @@ static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca,
 static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca,
                                     size_t b, void *private)
 {
-       return bucket_gc_gen(ca, b);
+       return bucket_gc_gen(bucket(ca, b));
 }
 
 static int unsigned_cmp(const void *_l, const void *_r)