New upstream snapshot

diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 97508de9f7214204f8f6297b0d0d6d6be901732e..48971fcf2d5bb393c818732e3b0f1be50c179ebc 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -14,6 +14,7 @@
 #include "ec.h"
 #include "error.h"
 #include "recovery.h"
+#include "varint.h"
 
 #include <linux/kthread.h>
 #include <linux/math64.h>
 #include <linux/sort.h>
 #include <trace/events/bcachefs.h>
 
-static const char * const bch2_alloc_field_names[] = {
-#define x(name, bytes) #name,
-       BCH_ALLOC_FIELDS()
+static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
+#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
+       BCH_ALLOC_FIELDS_V1()
 #undef x
-       NULL
 };
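The new byte-width table is built with the same x-macro expansion the old name table used: each x(name, bits) entry in BCH_ALLOC_FIELDS_V1() becomes one array element indexed by its BCH_ALLOC_FIELD_V1_* enum value. A minimal, self-contained sketch of that pattern (the field names and widths below are made up, not the real v1 field list):

/* Illustrative stand-in for BCH_ALLOC_FIELDS_V1(); fields are made up. */
#define EXAMPLE_ALLOC_FIELDS()		\
	x(foo,	8)			\
	x(bar,	16)			\
	x(baz,	32)

enum example_alloc_field {
#define x(name, bits)	EXAMPLE_ALLOC_FIELD_##name,
	EXAMPLE_ALLOC_FIELDS()
#undef x
	EXAMPLE_ALLOC_FIELD_NR,
};

/* bits / 8 turns each field's declared width into its on-disk byte count: */
static const unsigned example_alloc_field_bytes[] = {
#define x(name, bits)	[EXAMPLE_ALLOC_FIELD_##name] = bits / 8,
	EXAMPLE_ALLOC_FIELDS()
#undef x
};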
 
-static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int);
-
 /* Ratelimiting/PD controllers */
 
 static void pd_controllers_update(struct work_struct *work)
@@ -54,10 +52,10 @@ static void pd_controllers_update(struct work_struct *work)
                 * reclaimed by copy GC
                 */
                fragmented += max_t(s64, 0, (bucket_to_sector(ca,
-                                       stats.buckets[BCH_DATA_user] +
-                                       stats.buckets[BCH_DATA_cached]) -
-                                 (stats.sectors[BCH_DATA_user] +
-                                  stats.sectors[BCH_DATA_cached])) << 9);
+                                       stats.d[BCH_DATA_user].buckets +
+                                       stats.d[BCH_DATA_cached].buckets) -
+                                 (stats.d[BCH_DATA_user].sectors +
+                                  stats.d[BCH_DATA_cached].sectors)) << 9);
        }
 
        bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1);
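The loop above estimates, per device, how much space sits in partially-used user and cached buckets, i.e. what copygc could get back: bucket counts are converted to sectors, the sectors actually in use are subtracted, and the result is clamped at zero and shifted into bytes. A stand-alone restatement of that arithmetic, with illustrative helper and parameter names:

#include <linux/types.h>

/*
 * Illustrative helper: bytes of reclaimable fragmentation, computed as
 * (buckets worth of sectors) - (sectors actually used), clamped at zero
 * and converted from 512-byte sectors to bytes.
 */
static u64 example_fragmented_bytes(u64 bucket_size_sectors,
				    u64 nr_buckets, u64 sectors_used)
{
	s64 frag = (s64) (nr_buckets * bucket_size_sectors) - (s64) sectors_used;

	return frag > 0 ? (u64) frag << 9 : 0;
}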
@@ -67,10 +65,10 @@ static void pd_controllers_update(struct work_struct *work)
 
 /* Persistent alloc info: */
 
-static inline u64 get_alloc_field(const struct bch_alloc *a,
-                                 const void **p, unsigned field)
+static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
+                                    const void **p, unsigned field)
 {
-       unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];
+       unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
        u64 v;
 
        if (!(a->fields & (1 << field)))
@@ -97,10 +95,10 @@ static inline u64 get_alloc_field(const struct bch_alloc *a,
        return v;
 }
 
-static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
-                                  unsigned field, u64 v)
+static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p,
+                                     unsigned field, u64 v)
 {
-       unsigned bytes = BCH_ALLOC_FIELD_BYTES[field];
+       unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
 
        if (!v)
                return;
@@ -127,55 +125,127 @@ static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
        *p += bytes;
 }
 
-struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
+static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
+                                struct bkey_s_c k)
 {
-       struct bkey_alloc_unpacked ret = { .gen = 0 };
+       const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
+       const void *d = in->data;
+       unsigned idx = 0;
 
-       if (k.k->type == KEY_TYPE_alloc) {
-               const struct bch_alloc *a = bkey_s_c_to_alloc(k).v;
-               const void *d = a->data;
-               unsigned idx = 0;
+       out->gen = in->gen;
 
-               ret.gen = a->gen;
+#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
+       BCH_ALLOC_FIELDS_V1()
+#undef  x
+}
 
-#define x(_name, _bits)        ret._name = get_alloc_field(a, &d, idx++);
-               BCH_ALLOC_FIELDS()
+static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
+                               struct bkey_s_c k)
+{
+       struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
+       const u8 *in = a.v->data;
+       const u8 *end = bkey_val_end(a);
+       unsigned fieldnr = 0;
+       int ret;
+       u64 v;
+
+       out->gen        = a.v->gen;
+       out->oldest_gen = a.v->oldest_gen;
+       out->data_type  = a.v->data_type;
+
+#define x(_name, _bits)                                                        \
+       if (fieldnr < a.v->nr_fields) {                                 \
+               ret = bch2_varint_decode(in, end, &v);                  \
+               if (ret < 0)                                            \
+                       return ret;                                     \
+               in += ret;                                              \
+       } else {                                                        \
+               v = 0;                                                  \
+       }                                                               \
+       out->_name = v;                                                 \
+       if (v != out->_name)                                            \
+               return -1;                                              \
+       fieldnr++;
+
+       BCH_ALLOC_FIELDS_V2()
 #undef  x
-       }
-       return ret;
+       return 0;
 }
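bch2_alloc_unpack_v2() walks the variable-length field area with bch2_varint_decode(), which, as used above, returns the number of bytes consumed on success and a negative value on error; fields beyond nr_fields simply default to zero. The real encoding lives in varint.c; purely to illustrate the calling convention, a plain LEB128-style decoder (not bcachefs' actual on-disk format) could look like:

#include <linux/types.h>

/*
 * Illustrative only: same calling convention as bch2_varint_decode()
 * (bytes consumed on success, negative on error), but using LEB128
 * rather than the actual bcachefs varint encoding.
 */
static int example_varint_decode(const u8 *in, const u8 *end, u64 *out)
{
	const u8 *p = in;
	unsigned shift = 0;
	u64 v = 0;

	while (p < end && shift < 64) {
		u8 byte = *p++;

		v |= (u64) (byte & 0x7f) << shift;
		if (!(byte & 0x80)) {
			*out = v;
			return p - in;
		}
		shift += 7;
	}

	return -1;	/* truncated or overlong input */
}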
 
-void bch2_alloc_pack(struct bkey_i_alloc *dst,
-                    const struct bkey_alloc_unpacked src)
+static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst,
+                              const struct bkey_alloc_unpacked src)
 {
-       unsigned idx = 0;
-       void *d = dst->v.data;
+       struct bkey_i_alloc_v2 *a = bkey_alloc_v2_init(&dst->k);
+       unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
+       u8 *out = a->v.data;
+       u8 *end = (void *) &dst[1];
+       u8 *last_nonzero_field = out;
        unsigned bytes;
 
-       dst->v.fields   = 0;
-       dst->v.gen      = src.gen;
+       a->k.p          = POS(src.dev, src.bucket);
+       a->v.gen        = src.gen;
+       a->v.oldest_gen = src.oldest_gen;
+       a->v.data_type  = src.data_type;
+
+#define x(_name, _bits)                                                        \
+       nr_fields++;                                                    \
+                                                                       \
+       if (src._name) {                                                \
+               out += bch2_varint_encode(out, src._name);              \
+                                                                       \
+               last_nonzero_field = out;                               \
+               last_nonzero_fieldnr = nr_fields;                       \
+       } else {                                                        \
+               *out++ = 0;                                             \
+       }
 
-#define x(_name, _bits)        put_alloc_field(dst, &d, idx++, src._name);
-       BCH_ALLOC_FIELDS()
+       BCH_ALLOC_FIELDS_V2()
 #undef  x
+       BUG_ON(out > end);
+
+       out = last_nonzero_field;
+       a->v.nr_fields = last_nonzero_fieldnr;
+
+       bytes = (u8 *) out - (u8 *) &a->v;
+       set_bkey_val_bytes(&a->k, bytes);
+       memset_u64s_tail(&a->v, 0, bytes);
+}
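The packing side encodes every field in order but only commits the output up to the last nonzero one, so an all-zero tail costs no space on disk and nr_fields tells the reader how many fields were actually stored. A stripped-down sketch of that trimming, paired with a LEB128-style encoder matching the illustrative decoder above (again, not the real bcachefs varint format):

#include <linux/types.h>

/* Illustrative LEB128-style encoder; returns bytes written. */
static unsigned example_varint_encode(u8 *out, u64 v)
{
	unsigned bytes = 0;

	do {
		u8 byte = v & 0x7f;

		v >>= 7;
		if (v)
			byte |= 0x80;
		out[bytes++] = byte;
	} while (v);

	return bytes;
}

/*
 * Sketch of the trailing-zero trimming above: encode each field in order,
 * remember where the last nonzero field ended, and only keep that much.
 */
static unsigned example_pack_fields(u8 *out, const u64 *fields, unsigned nr,
				    unsigned *nr_fields_stored)
{
	u8 *p = out, *last_nonzero = out;
	unsigned i;

	*nr_fields_stored = 0;

	for (i = 0; i < nr; i++) {
		if (fields[i]) {
			p += example_varint_encode(p, fields[i]);
			last_nonzero = p;
			*nr_fields_stored = i + 1;
		} else {
			*p++ = 0;	/* placeholder, dropped if no later field is nonzero */
		}
	}

	return last_nonzero - out;	/* bytes to store on disk */
}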
+
+struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
+{
+       struct bkey_alloc_unpacked ret = {
+               .dev    = k.k->p.inode,
+               .bucket = k.k->p.offset,
+               .gen    = 0,
+       };
+
+       if (k.k->type == KEY_TYPE_alloc_v2)
+               bch2_alloc_unpack_v2(&ret, k);
+       else if (k.k->type == KEY_TYPE_alloc)
+               bch2_alloc_unpack_v1(&ret, k);
+
+       return ret;
+}
 
-       bytes = (void *) d - (void *) &dst->v;
-       set_bkey_val_bytes(&dst->k, bytes);
-       memset_u64s_tail(&dst->v, 0, bytes);
+void bch2_alloc_pack(struct bch_fs *c,
+                    struct bkey_alloc_buf *dst,
+                    const struct bkey_alloc_unpacked src)
+{
+       bch2_alloc_pack_v2(dst, src);
 }
 
 static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
 {
        unsigned i, bytes = offsetof(struct bch_alloc, data);
 
-       for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++)
+       for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
                if (a->fields & (1 << i))
-                       bytes += BCH_ALLOC_FIELD_BYTES[i];
+                       bytes += BCH_ALLOC_V1_FIELD_BYTES[i];
 
        return DIV_ROUND_UP(bytes, sizeof(u64));
 }
 
-const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
+const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
        struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
 
@@ -190,20 +260,30 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k)
        return NULL;
 }
 
-void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
-                       struct bkey_s_c k)
+const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
-       struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);
-       const void *d = a.v->data;
-       unsigned i;
+       struct bkey_alloc_unpacked u;
 
-       pr_buf(out, "gen %u", a.v->gen);
+       if (k.k->p.inode >= c->sb.nr_devices ||
+           !c->devs[k.k->p.inode])
+               return "invalid device";
+
+       if (bch2_alloc_unpack_v2(&u, k))
+               return "unpack error";
 
-       for (i = 0; i < BCH_ALLOC_FIELD_NR; i++)
-               if (a.v->fields & (1 << i))
-                       pr_buf(out, " %s %llu",
-                              bch2_alloc_field_names[i],
-                              get_alloc_field(a.v, &d, i));
+       return NULL;
+}
+
+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
+                          struct bkey_s_c k)
+{
+       struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
+
+       pr_buf(out, "gen %u oldest_gen %u data_type %u",
+              u.gen, u.oldest_gen, u.data_type);
+#define x(_name, ...)  pr_buf(out, #_name " %llu ", (u64) u._name);
+       BCH_ALLOC_FIELDS_V2()
+#undef  x
 }
 
 static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
@@ -213,11 +293,13 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
        struct bucket *g;
        struct bkey_alloc_unpacked u;
 
-       if (level || k.k->type != KEY_TYPE_alloc)
+       if (level ||
+           (k.k->type != KEY_TYPE_alloc &&
+            k.k->type != KEY_TYPE_alloc_v2))
                return 0;
 
        ca = bch_dev_bkey_exists(c, k.k->p.inode);
-       g = __bucket(ca, k.k->p.offset, 0);
+       g = bucket(ca, k.k->p.offset);
        u = bch2_alloc_unpack(k);
 
        g->_mark.gen            = u.gen;
@@ -234,12 +316,10 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id,
 
 int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
 {
-       struct bch_dev *ca;
-       unsigned i;
-       int ret = 0;
+       int ret;
 
        down_read(&c->gc_lock);
-       ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC,
+       ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_alloc,
                                          NULL, bch2_alloc_read_fn);
        up_read(&c->gc_lock);
 
@@ -248,26 +328,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys)
                return ret;
        }
 
-       percpu_down_write(&c->mark_lock);
-       bch2_dev_usage_from_buckets(c);
-       percpu_up_write(&c->mark_lock);
-
-       mutex_lock(&c->bucket_clock[READ].lock);
-       for_each_member_device(ca, c, i) {
-               down_read(&ca->bucket_lock);
-               bch2_recalc_oldest_io(c, ca, READ);
-               up_read(&ca->bucket_lock);
-       }
-       mutex_unlock(&c->bucket_clock[READ].lock);
-
-       mutex_lock(&c->bucket_clock[WRITE].lock);
-       for_each_member_device(ca, c, i) {
-               down_read(&ca->bucket_lock);
-               bch2_recalc_oldest_io(c, ca, WRITE);
-               up_read(&ca->bucket_lock);
-       }
-       mutex_unlock(&c->bucket_clock[WRITE].lock);
-
        return 0;
 }
 
@@ -278,18 +338,16 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
        struct bch_fs *c = trans->c;
        struct bkey_s_c k;
        struct bch_dev *ca;
-       struct bucket_array *ba;
        struct bucket *g;
        struct bucket_mark m;
        struct bkey_alloc_unpacked old_u, new_u;
-       __BKEY_PADDED(k, 8) alloc_key; /* hack: */
-       struct bkey_i_alloc *a;
+       struct bkey_alloc_buf a;
        int ret;
 retry:
        bch2_trans_begin(trans);
 
        ret = bch2_btree_key_cache_flush(trans,
-                       BTREE_ID_ALLOC, iter->pos);
+                       BTREE_ID_alloc, iter->pos);
        if (ret)
                goto err;
 
@@ -302,193 +360,60 @@ retry:
 
        percpu_down_read(&c->mark_lock);
        ca      = bch_dev_bkey_exists(c, iter->pos.inode);
-       ba      = bucket_array(ca);
-
-       g       = &ba->b[iter->pos.offset];
+       g       = bucket(ca, iter->pos.offset);
        m       = READ_ONCE(g->mark);
-       new_u   = alloc_mem_to_key(g, m);
+       new_u   = alloc_mem_to_key(iter, g, m);
        percpu_up_read(&c->mark_lock);
 
        if (!bkey_alloc_unpacked_cmp(old_u, new_u))
                return 0;
 
-       a = bkey_alloc_init(&alloc_key.k);
-       a->k.p = iter->pos;
-       bch2_alloc_pack(a, new_u);
-
-       bch2_trans_update(trans, iter, &a->k_i,
+       bch2_alloc_pack(c, &a, new_u);
+       bch2_trans_update(trans, iter, &a.k,
                          BTREE_TRIGGER_NORUN);
        ret = bch2_trans_commit(trans, NULL, NULL,
-                               BTREE_INSERT_NOFAIL|
-                               BTREE_INSERT_USE_RESERVE|
-                               flags);
+                               BTREE_INSERT_NOFAIL|flags);
 err:
        if (ret == -EINTR)
                goto retry;
        return ret;
 }
 
-int bch2_dev_alloc_write(struct bch_fs *c, struct bch_dev *ca, unsigned flags)
+int bch2_alloc_write(struct bch_fs *c, unsigned flags)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
-       u64 first_bucket, nbuckets;
+       struct bch_dev *ca;
+       unsigned i;
        int ret = 0;
 
-       percpu_down_read(&c->mark_lock);
-       first_bucket    = bucket_array(ca)->first_bucket;
-       nbuckets        = bucket_array(ca)->nbuckets;
-       percpu_up_read(&c->mark_lock);
-
-       BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
-
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC,
-                                  POS(ca->dev_idx, first_bucket),
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, POS_MIN,
                                   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
-       while (iter->pos.offset < nbuckets) {
-               bch2_trans_cond_resched(&trans);
-
-               ret = bch2_alloc_write_key(&trans, iter, flags);
-               if (ret)
-                       break;
-               bch2_btree_iter_next_slot(iter);
-       }
-
-       bch2_trans_exit(&trans);
-
-       return ret;
-}
+       for_each_member_device(ca, c, i) {
+               bch2_btree_iter_set_pos(iter,
+                       POS(ca->dev_idx, ca->mi.first_bucket));
 
-int bch2_alloc_write(struct bch_fs *c, unsigned flags)
-{
-       struct bch_dev *ca;
-       unsigned i;
-       int ret = 0;
+               while (iter->pos.offset < ca->mi.nbuckets) {
+                       bch2_trans_cond_resched(&trans);
 
-       for_each_rw_member(ca, c, i) {
-               bch2_dev_alloc_write(c, ca, flags);
-               if (ret) {
-                       percpu_ref_put(&ca->io_ref);
-                       break;
+                       ret = bch2_alloc_write_key(&trans, iter, flags);
+                       if (ret) {
+                               percpu_ref_put(&ca->io_ref);
+                               goto err;
+                       }
+                       bch2_btree_iter_next_slot(iter);
                }
        }
-
+err:
+       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_exit(&trans);
        return ret;
 }
 
 /* Bucket IO clocks: */
 
-static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
-{
-       struct bucket_clock *clock = &c->bucket_clock[rw];
-       struct bucket_array *buckets = bucket_array(ca);
-       struct bucket *g;
-       u16 max_last_io = 0;
-       unsigned i;
-
-       lockdep_assert_held(&c->bucket_clock[rw].lock);
-
-       /* Recalculate max_last_io for this device: */
-       for_each_bucket(g, buckets)
-               max_last_io = max(max_last_io, bucket_last_io(c, g, rw));
-
-       ca->max_last_bucket_io[rw] = max_last_io;
-
-       /* Recalculate global max_last_io: */
-       max_last_io = 0;
-
-       for_each_member_device(ca, c, i)
-               max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]);
-
-       clock->max_last_io = max_last_io;
-}
-
-static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw)
-{
-       struct bucket_clock *clock = &c->bucket_clock[rw];
-       struct bucket_array *buckets;
-       struct bch_dev *ca;
-       struct bucket *g;
-       unsigned i;
-
-       trace_rescale_prios(c);
-
-       for_each_member_device(ca, c, i) {
-               down_read(&ca->bucket_lock);
-               buckets = bucket_array(ca);
-
-               for_each_bucket(g, buckets)
-                       g->io_time[rw] = clock->hand -
-                       bucket_last_io(c, g, rw) / 2;
-
-               bch2_recalc_oldest_io(c, ca, rw);
-
-               up_read(&ca->bucket_lock);
-       }
-}
-
-static inline u64 bucket_clock_freq(u64 capacity)
-{
-       return max(capacity >> 10, 2028ULL);
-}
-
-static void bch2_inc_clock_hand(struct io_timer *timer)
-{
-       struct bucket_clock *clock = container_of(timer,
-                                               struct bucket_clock, rescale);
-       struct bch_fs *c = container_of(clock,
-                                       struct bch_fs, bucket_clock[clock->rw]);
-       struct bch_dev *ca;
-       u64 capacity;
-       unsigned i;
-
-       mutex_lock(&clock->lock);
-
-       /* if clock cannot be advanced more, rescale prio */
-       if (clock->max_last_io >= U16_MAX - 2)
-               bch2_rescale_bucket_io_times(c, clock->rw);
-
-       BUG_ON(clock->max_last_io >= U16_MAX - 2);
-
-       for_each_member_device(ca, c, i)
-               ca->max_last_bucket_io[clock->rw]++;
-       clock->max_last_io++;
-       clock->hand++;
-
-       mutex_unlock(&clock->lock);
-
-       capacity = READ_ONCE(c->capacity);
-
-       if (!capacity)
-               return;
-
-       /*
-        * we only increment when 0.1% of the filesystem capacity has been read
-        * or written too, this determines if it's time
-        *
-        * XXX: we shouldn't really be going off of the capacity of devices in
-        * RW mode (that will be 0 when we're RO, yet we can still service
-        * reads)
-        */
-       timer->expire += bucket_clock_freq(capacity);
-
-       bch2_io_timer_add(&c->io_clock[clock->rw], timer);
-}
-
-static void bch2_bucket_clock_init(struct bch_fs *c, int rw)
-{
-       struct bucket_clock *clock = &c->bucket_clock[rw];
-
-       clock->hand             = 1;
-       clock->rw               = rw;
-       clock->rescale.fn       = bch2_inc_clock_hand;
-       clock->rescale.expire   = bucket_clock_freq(c->capacity);
-       mutex_init(&clock->lock);
-}
-
 int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
                              size_t bucket_nr, int rw)
 {
@@ -496,40 +421,38 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
        struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
        struct btree_iter *iter;
        struct bucket *g;
-       struct bkey_i_alloc *a;
+       struct bkey_alloc_buf *a;
        struct bkey_alloc_unpacked u;
-       u16 *time;
+       u64 *time, now;
        int ret = 0;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr),
+       iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, POS(dev, bucket_nr),
                                   BTREE_ITER_CACHED|
                                   BTREE_ITER_CACHED_NOFILL|
                                   BTREE_ITER_INTENT);
-       if (IS_ERR(iter))
-               return PTR_ERR(iter);
+       ret = bch2_btree_iter_traverse(iter);
+       if (ret)
+               goto out;
 
-       a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8);
+       a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
        ret = PTR_ERR_OR_ZERO(a);
        if (ret)
                goto out;
 
        percpu_down_read(&c->mark_lock);
        g = bucket(ca, bucket_nr);
-       u = alloc_mem_to_key(g, READ_ONCE(g->mark));
+       u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark));
        percpu_up_read(&c->mark_lock);
 
-       bkey_alloc_init(&a->k_i);
-       a->k.p = iter->pos;
-
        time = rw == READ ? &u.read_time : &u.write_time;
-       if (*time == c->bucket_clock[rw].hand)
+       now = atomic64_read(&c->io_clock[rw].now);
+       if (*time == now)
                goto out;
 
-       *time = c->bucket_clock[rw].hand;
-
-       bch2_alloc_pack(a, u);
+       *time = now;
 
-       ret   = bch2_trans_update(trans, iter, &a->k_i, 0) ?:
+       bch2_alloc_pack(c, a, u);
+       ret   = bch2_trans_update(trans, iter, &a->k, 0) ?:
                bch2_trans_commit(trans, NULL, NULL, 0);
 out:
        bch2_trans_iter_put(trans, iter);
@@ -553,7 +476,8 @@ out:
 static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
 {
        unsigned long gc_count = c->gc_count;
-       u64 available;
+       s64 available;
+       unsigned i;
        int ret = 0;
 
        ca->allocator_state = ALLOCATOR_BLOCKED;
@@ -569,13 +493,19 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
                if (gc_count != c->gc_count)
                        ca->inc_gen_really_needs_gc = 0;
 
-               available = max_t(s64, 0, dev_buckets_available(ca) -
-                                 ca->inc_gen_really_needs_gc);
+               available  = dev_buckets_available(ca);
+               available -= ca->inc_gen_really_needs_gc;
+
+               spin_lock(&c->freelist_lock);
+               for (i = 0; i < RESERVE_NR; i++)
+                       available -= fifo_used(&ca->free[i]);
+               spin_unlock(&c->freelist_lock);
+
+               available = max(available, 0LL);
 
                if (available > fifo_free(&ca->free_inc) ||
                    (available &&
-                    (!fifo_full(&ca->free[RESERVE_BTREE]) ||
-                     !fifo_full(&ca->free[RESERVE_MOVINGGC]))))
+                    !fifo_full(&ca->free[RESERVE_MOVINGGC])))
                        break;
 
                up_read(&c->gc_lock);
@@ -591,20 +521,22 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
        return ret;
 }
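The reworked wait condition treats buckets already sitting on the reserve freelists, or earmarked for gen increments, as unavailable for refilling free_inc, and only stops waiting once enough truly free buckets exist or the movinggc reserve has room. A sketch of that decision with the device and fifo accessors replaced by plain parameters (all names here are illustrative):

#include <linux/types.h>

/*
 * Illustrative restatement of the wait condition above: buckets on the
 * free lists or needing gen bumps don't count as available.
 */
static bool example_can_stop_waiting(s64 dev_buckets_available,
				     s64 buckets_needing_gen_bump,
				     const s64 *free_fifo_used, unsigned nr_reserves,
				     s64 free_inc_slots_free,
				     bool movinggc_reserve_full)
{
	s64 available = dev_buckets_available - buckets_needing_gen_bump;
	unsigned i;

	for (i = 0; i < nr_reserves; i++)
		available -= free_fifo_used[i];

	if (available < 0)
		available = 0;

	return available > free_inc_slots_free ||
	       (available && !movinggc_reserve_full);
}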
 
-static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
-                                      size_t bucket,
-                                      struct bucket_mark mark)
+static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
+                                      struct bucket_mark m)
 {
        u8 gc_gen;
 
-       if (!is_available_bucket(mark))
+       if (!is_available_bucket(m))
+               return false;
+
+       if (m.owned_by_allocator)
                return false;
 
        if (ca->buckets_nouse &&
-           test_bit(bucket, ca->buckets_nouse))
+           test_bit(b, ca->buckets_nouse))
                return false;
 
-       gc_gen = bucket_gc_gen(ca, bucket);
+       gc_gen = bucket_gc_gen(bucket(ca, b));
 
        if (gc_gen >= BUCKET_GC_GEN_MAX / 2)
                ca->inc_gen_needs_gc++;
@@ -618,43 +550,33 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca,
 /*
  * Determines what order we're going to reuse buckets, smallest bucket_key()
  * first.
- *
- *
- * - We take into account the read prio of the bucket, which gives us an
- *   indication of how hot the data is -- we scale the prio so that the prio
- *   farthest from the clock is worth 1/8th of the closest.
- *
- * - The number of sectors of cached data in the bucket, which gives us an
- *   indication of the cost in cache misses this eviction will cause.
- *
- * - If hotness * sectors used compares equal, we pick the bucket with the
- *   smallest bucket_gc_gen() - since incrementing the same bucket's generation
- *   number repeatedly forces us to run mark and sweep gc to avoid generation
- *   number wraparound.
  */
 
-static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca,
-                                    size_t b, struct bucket_mark m)
+static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
+                               u64 now, u64 last_seq_ondisk)
 {
-       unsigned last_io = bucket_last_io(c, bucket(ca, b), READ);
-       unsigned max_last_io = ca->max_last_bucket_io[READ];
-
-       /*
-        * Time since last read, scaled to [0, 8) where larger value indicates
-        * more recently read data:
-        */
-       unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io;
+       unsigned used = bucket_sectors_used(m);
 
-       /* How much we want to keep the data in this bucket: */
-       unsigned long data_wantness =
-               (hotness + 1) * bucket_sectors_used(m);
-
-       unsigned long needs_journal_commit =
-               bucket_needs_journal_commit(m, c->journal.last_seq_ondisk);
+       if (used) {
+               /*
+                * Prefer to keep buckets that have been read more recently, and
+                * buckets that have more data in them:
+                */
+               u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
+               u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used));
 
-       return  (data_wantness << 9) |
-               (needs_journal_commit << 8) |
-               (bucket_gc_gen(ca, b) / 16);
+               return -last_read_scaled;
+       } else {
+               /*
+                * Prefer to use buckets with smaller gc_gen so that we don't
+                * have to walk the btree and recalculate oldest_gen - but shift
+                * off the low bits so that buckets will still have equal sort
+                * keys when there's only a small difference, so that we can
+                * keep sequential buckets together:
+                */
+               return  (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)|
+                       (bucket_gc_gen(g) >> 4);
+       }
 }
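For empty buckets the key packs two signals into one integer: whether the bucket still needs a journal commit, in the high bits, and its gc_gen with the low four bits shifted off, so buckets whose generations differ only slightly compare equal and stay together. Since smaller keys are reused first, anything needing a journal commit sorts after every empty bucket that does not. A tiny worked restatement with hypothetical values:

#include <linux/types.h>

/*
 * Hypothetical values: gc_gen 3 and gc_gen 12 both give (gc_gen >> 4) == 0
 * and so compare equal, while needs_journal_commit adds 1 << 4, which is
 * larger than any (gc_gen >> 4) for an 8-bit gen.
 */
static unsigned example_empty_bucket_key(bool needs_journal_commit, u8 gc_gen)
{
	return ((unsigned) needs_journal_commit << 4) | (gc_gen >> 4);
}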
 
 static inline int bucket_alloc_cmp(alloc_heap *h,
@@ -677,16 +599,15 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
 {
        struct bucket_array *buckets;
        struct alloc_heap_entry e = { 0 };
+       u64 now, last_seq_ondisk;
        size_t b, i, nr = 0;
 
-       ca->alloc_heap.used = 0;
-
-       mutex_lock(&c->bucket_clock[READ].lock);
        down_read(&ca->bucket_lock);
 
        buckets = bucket_array(ca);
-
-       bch2_recalc_oldest_io(c, ca, READ);
+       ca->alloc_heap.used = 0;
+       now = atomic64_read(&c->io_clock[READ].now);
+       last_seq_ondisk = c->journal.last_seq_ondisk;
 
        /*
         * Find buckets with lowest read priority, by building a maxheap sorted
@@ -694,8 +615,9 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
         * all buckets have been visited.
         */
        for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
-               struct bucket_mark m = READ_ONCE(buckets->b[b].mark);
-               unsigned long key = bucket_sort_key(c, ca, b, m);
+               struct bucket *g = &buckets->b[b];
+               struct bucket_mark m = READ_ONCE(g->mark);
+               unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
 
                if (!bch2_can_invalidate_bucket(ca, b, m))
                        continue;
@@ -730,7 +652,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
        }
 
        up_read(&ca->bucket_lock);
-       mutex_unlock(&c->bucket_clock[READ].lock);
 }
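find_reclaimable_buckets_lru() keeps only the best candidates: it scans every bucket, computes the sort key, and maintains a bounded set of the smallest keys seen, evicting the current worst entry whenever a better (smaller) key arrives. The same selection idea, reduced to a plain array rather than the real alloc_heap machinery (names are illustrative):

/*
 * Keep the nr_to_keep smallest keys seen so far, replacing the current
 * maximum when a smaller key shows up. A linear scan stands in for the
 * heap; illustration only.
 */
static void example_keep_smallest(unsigned *keys, unsigned *nr,
				  unsigned nr_to_keep, unsigned key)
{
	unsigned i, max_idx = 0;

	if (*nr < nr_to_keep) {
		keys[(*nr)++] = key;
		return;
	}

	for (i = 1; i < *nr; i++)
		if (keys[i] > keys[max_idx])
			max_idx = i;

	if (key < keys[max_idx])
		keys[max_idx] = key;
}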
 
 static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
@@ -810,13 +731,13 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
        ca->inc_gen_needs_gc                    = 0;
 
        switch (ca->mi.replacement) {
-       case CACHE_REPLACEMENT_LRU:
+       case BCH_CACHE_REPLACEMENT_lru:
                find_reclaimable_buckets_lru(c, ca);
                break;
-       case CACHE_REPLACEMENT_FIFO:
+       case BCH_CACHE_REPLACEMENT_fifo:
                find_reclaimable_buckets_fifo(c, ca);
                break;
-       case CACHE_REPLACEMENT_RANDOM:
+       case BCH_CACHE_REPLACEMENT_random:
                find_reclaimable_buckets_random(c, ca);
                break;
        }
@@ -875,14 +796,8 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
                                       struct btree_iter *iter,
                                       u64 *journal_seq, unsigned flags)
 {
-#if 0
-       __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key;
-#else
-       /* hack: */
-       __BKEY_PADDED(k, 8) alloc_key;
-#endif
        struct bch_fs *c = trans->c;
-       struct bkey_i_alloc *a;
+       struct bkey_alloc_buf a;
        struct bkey_alloc_unpacked u;
        struct bucket *g;
        struct bucket_mark m;
@@ -896,34 +811,33 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
 
        /* first, put on free_inc and mark as owned by allocator: */
        percpu_down_read(&c->mark_lock);
-       spin_lock(&c->freelist_lock);
-
-       verify_not_on_freelist(c, ca, b);
-
-       BUG_ON(!fifo_push(&ca->free_inc, b));
-
        g = bucket(ca, b);
        m = READ_ONCE(g->mark);
 
-       invalidating_cached_data = m.cached_sectors != 0;
+       BUG_ON(m.dirty_sectors);
+
+       bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);
+
+       spin_lock(&c->freelist_lock);
+       verify_not_on_freelist(c, ca, b);
+       BUG_ON(!fifo_push(&ca->free_inc, b));
+       spin_unlock(&c->freelist_lock);
 
        /*
         * If we're not invalidating cached data, we only increment the bucket
         * gen in memory here, the incremented gen will be updated in the btree
         * by bch2_trans_mark_pointer():
         */
+       if (!m.cached_sectors &&
+           !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) {
+               BUG_ON(m.data_type);
+               bucket_cmpxchg(g, m, m.gen++);
+               percpu_up_read(&c->mark_lock);
+               goto out;
+       }
 
-       if (!invalidating_cached_data)
-               bch2_invalidate_bucket(c, ca, b, &m);
-       else
-               bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);
-
-       spin_unlock(&c->freelist_lock);
        percpu_up_read(&c->mark_lock);
 
-       if (!invalidating_cached_data)
-               goto out;
-
        /*
         * If the read-only path is trying to shut down, we can't be generating
         * new btree updates:
@@ -933,8 +847,6 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
                goto out;
        }
 
-       BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
-
        bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
 retry:
        ret = bch2_btree_iter_traverse(iter);
@@ -944,7 +856,7 @@ retry:
        percpu_down_read(&c->mark_lock);
        g = bucket(ca, iter->pos.offset);
        m = READ_ONCE(g->mark);
-       u = alloc_mem_to_key(g, m);
+       u = alloc_mem_to_key(iter, g, m);
 
        percpu_up_read(&c->mark_lock);
 
@@ -954,14 +866,11 @@ retry:
        u.data_type     = 0;
        u.dirty_sectors = 0;
        u.cached_sectors = 0;
-       u.read_time     = c->bucket_clock[READ].hand;
-       u.write_time    = c->bucket_clock[WRITE].hand;
-
-       a = bkey_alloc_init(&alloc_key.k);
-       a->k.p = iter->pos;
-       bch2_alloc_pack(a, u);
+       u.read_time     = atomic64_read(&c->io_clock[READ].now);
+       u.write_time    = atomic64_read(&c->io_clock[WRITE].now);
 
-       bch2_trans_update(trans, iter, &a->k_i,
+       bch2_alloc_pack(c, &a, u);
+       bch2_trans_update(trans, iter, &a.k,
                          BTREE_TRIGGER_BUCKET_INVALIDATE);
 
        /*
@@ -976,8 +885,7 @@ retry:
                                BTREE_INSERT_NOUNLOCK|
                                BTREE_INSERT_NOCHECK_RW|
                                BTREE_INSERT_NOFAIL|
-                               BTREE_INSERT_USE_RESERVE|
-                               BTREE_INSERT_USE_ALLOC_RESERVE|
+                               BTREE_INSERT_JOURNAL_RESERVED|
                                flags);
        if (ret == -EINTR)
                goto retry;
@@ -1029,8 +937,7 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
        int ret = 0;
 
        bch2_trans_init(&trans, c, 0, 0);
-
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC,
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc,
                                   POS(ca->dev_idx, 0),
                                   BTREE_ITER_CACHED|
                                   BTREE_ITER_CACHED_NOFILL|
@@ -1045,6 +952,7 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
                                (!fifo_empty(&ca->free_inc)
                                 ? BTREE_INSERT_NOWAIT : 0));
 
+       bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
 
        /* If we used NOWAIT, don't return the error: */
@@ -1138,6 +1046,12 @@ static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca)
        return 0;
 }
 
+static inline bool allocator_thread_running(struct bch_dev *ca)
+{
+       return ca->mi.state == BCH_MEMBER_STATE_rw &&
+               test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags);
+}
+
 /**
  * bch_allocator_thread - move buckets from free_inc to reserves
  *
@@ -1154,9 +1068,16 @@ static int bch2_allocator_thread(void *arg)
        int ret;
 
        set_freezable();
-       ca->allocator_state = ALLOCATOR_RUNNING;
 
        while (1) {
+               if (!allocator_thread_running(ca)) {
+                       ca->allocator_state = ALLOCATOR_STOPPED;
+                       if (kthread_wait_freezable(allocator_thread_running(ca)))
+                               break;
+               }
+
+               ca->allocator_state = ALLOCATOR_RUNNING;
+
                cond_resched();
                if (kthread_should_stop())
                        break;
@@ -1456,9 +1377,12 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
                return 0;
 
        p = kthread_create(bch2_allocator_thread, ca,
-                          "bch_alloc[%s]", ca->name);
-       if (IS_ERR(p))
+                          "bch-alloc/%s", ca->name);
+       if (IS_ERR(p)) {
+               bch_err(ca->fs, "error creating allocator thread: %li",
+                       PTR_ERR(p));
                return PTR_ERR(p);
+       }
 
        get_task_struct(p);
        rcu_assign_pointer(ca->alloc_thread, p);
@@ -1469,8 +1393,6 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
 void bch2_fs_allocator_background_init(struct bch_fs *c)
 {
        spin_lock_init(&c->freelist_lock);
-       bch2_bucket_clock_init(c, READ);
-       bch2_bucket_clock_init(c, WRITE);
 
        c->pd_controllers_update_seconds = 5;
        INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);