git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to f05b3c1af9 bcachefs: Improve bucket_alloc_fail tracepoint
author     Kent Overstreet <kent.overstreet@gmail.com>
           Sun, 13 Mar 2022 23:21:13 +0000 (19:21 -0400)
committer  Kent Overstreet <kent.overstreet@gmail.com>
           Sun, 13 Mar 2022 23:21:13 +0000 (19:21 -0400)
31 files changed:
.bcachefs_revision
include/trace/events/bcachefs.h
libbcachefs/alloc_background.c
libbcachefs/alloc_background.h
libbcachefs/alloc_foreground.c
libbcachefs/alloc_foreground.h
libbcachefs/alloc_types.h
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bkey_methods.c
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_types.h
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/buckets_types.h
libbcachefs/extent_update.c
libbcachefs/journal.c
libbcachefs/journal_io.c
libbcachefs/journal_sb.c [new file with mode: 0644]
libbcachefs/journal_sb.h [new file with mode: 0644]
libbcachefs/lru.c [new file with mode: 0644]
libbcachefs/lru.h [new file with mode: 0644]
libbcachefs/movinggc.c
libbcachefs/opts.h
libbcachefs/recovery.c
libbcachefs/super-io.c
libbcachefs/super-io.h
libbcachefs/super.c
libbcachefs/super_types.h
libbcachefs/sysfs.c

index 74f5970f7a516fb9f45efdc43ec5497455261b08..be0ed057ebf613c0844be11f3b938c9aeaa1c1cb 100644 (file)
@@ -1 +1 @@
-e48731a188639563444d475622782b7963df4b47
+f05b3c1af906802e46f9caca13fb6260d8293fdf
index 0596887959d3ef6be198e58c1c19dd73caef2ea3..832e9f1914094466eb2c5e0042f11c3d949eedd1 100644 (file)
@@ -491,9 +491,30 @@ DEFINE_EVENT(bucket_alloc, bucket_alloc,
        TP_ARGS(ca, reserve)
 );
 
-DEFINE_EVENT(bucket_alloc, bucket_alloc_fail,
-       TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve),
-       TP_ARGS(ca, reserve)
+TRACE_EVENT(bucket_alloc_fail,
+       TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve,
+                u64 avail, u64 need_journal_commit),
+       TP_ARGS(ca, reserve, avail, need_journal_commit),
+
+       TP_STRUCT__entry(
+               __field(dev_t,                  dev     )
+               __field(enum alloc_reserve,     reserve )
+               __field(u64,                    avail   )
+               __field(u64,                    need_journal_commit )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = ca->dev;
+               __entry->reserve        = reserve;
+               __entry->avail          = avail;
+               __entry->need_journal_commit = need_journal_commit;
+       ),
+
+       TP_printk("%d,%d reserve %d avail %llu need_journal_commit %llu",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->reserve,
+                 __entry->avail,
+                 __entry->need_journal_commit)
 );
 
 DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail,
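For reference, the reworked tracepoint now reports the device, the reserve, how many buckets looked available, and how many buckets are only blocked on a journal commit. A minimal caller-side sketch, with a hypothetical wrapper and argument names not taken from this commit:

    /*
     * Illustrative only: emitting the expanded tracepoint from an allocation
     * failure path.  The wrapper and the local names are assumptions.
     */
    static void report_bucket_alloc_fail(struct bch_dev *ca,
                                         enum alloc_reserve reserve,
                                         u64 avail, u64 need_journal_commit)
    {
            trace_bucket_alloc_fail(ca, reserve, avail, need_journal_commit);
            /* emits e.g. "8,16 reserve 2 avail 0 need_journal_commit 3" */
    }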
index 4afb2d457fb09cbf92a1f078f5e64fe3c7d1bfc1..0c33424393bee1b87300e657cfc4c5866cb642f9 100644 (file)
@@ -14,6 +14,7 @@
 #include "debug.h"
 #include "ec.h"
 #include "error.h"
+#include "lru.h"
 #include "recovery.h"
 #include "varint.h"
 
 #include <linux/sort.h>
 #include <trace/events/bcachefs.h>
 
-const char * const bch2_allocator_states[] = {
-#define x(n)   #n,
-       ALLOC_THREAD_STATES()
-#undef x
-       NULL
-};
-
 static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
 #define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
        BCH_ALLOC_FIELDS_V1()
 #undef x
 };
 
+const char * const bch2_bucket_states[] = {
+       "free",
+       "need gc gens",
+       "need discard",
+       "cached",
+       "dirty",
+       NULL
+};
+
 /* Persistent alloc info: */
 
 static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
@@ -161,6 +164,8 @@ static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
        out->gen        = a.v->gen;
        out->oldest_gen = a.v->oldest_gen;
        out->data_type  = a.v->data_type;
+       out->need_discard = BCH_ALLOC_NEED_DISCARD(a.v);
+       out->need_inc_gen = BCH_ALLOC_NEED_INC_GEN(a.v);
        out->journal_seq = le64_to_cpu(a.v->journal_seq);
 
 #define x(_name, _bits)                                                        \
@@ -197,6 +202,8 @@ static void bch2_alloc_pack_v3(struct bkey_alloc_buf *dst,
        a->v.oldest_gen = src.oldest_gen;
        a->v.data_type  = src.data_type;
        a->v.journal_seq = cpu_to_le64(src.journal_seq);
+       SET_BCH_ALLOC_NEED_DISCARD(&a->v, src.need_discard);
+       SET_BCH_ALLOC_NEED_INC_GEN(&a->v, src.need_inc_gen);
 
 #define x(_name, _bits)                                                        \
        nr_fields++;                                                    \
@@ -325,22 +332,20 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
 {
        struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);
 
-       pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu",
+       pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu need_discard %u",
               u.gen, u.oldest_gen, bch2_data_types[u.data_type],
-              u.journal_seq);
+              u.journal_seq, u.need_discard);
 #define x(_name, ...)  pr_buf(out, " " #_name " %llu", (u64) u._name);
        BCH_ALLOC_FIELDS_V2()
 #undef  x
 }
 
-int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only)
+int bch2_alloc_read(struct bch_fs *c)
 {
        struct btree_trans trans;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bch_dev *ca;
-       struct bucket *g;
-       struct bkey_alloc_unpacked u;
        int ret;
 
        bch2_trans_init(&trans, c, 0, 0);
@@ -348,31 +353,8 @@ int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only)
        for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
                ca = bch_dev_bkey_exists(c, k.k->p.inode);
-               g = __bucket(ca, k.k->p.offset, gc);
-               u = bch2_alloc_unpack(k);
-
-               if (!gc)
-                       *bucket_gen(ca, k.k->p.offset) = u.gen;
-
-               g->_mark.gen            = u.gen;
-               g->io_time[READ]        = u.read_time;
-               g->io_time[WRITE]       = u.write_time;
-               g->oldest_gen           = !gc ? u.oldest_gen : u.gen;
-               g->gen_valid            = 1;
-
-               if (!gc ||
-                   (metadata_only &&
-                    (u.data_type == BCH_DATA_user ||
-                     u.data_type == BCH_DATA_cached ||
-                     u.data_type == BCH_DATA_parity))) {
-                       g->_mark.data_type      = u.data_type;
-                       g->_mark.dirty_sectors  = u.dirty_sectors;
-                       g->_mark.cached_sectors = u.cached_sectors;
-                       g->_mark.stripe         = u.stripe != 0;
-                       g->stripe               = u.stripe;
-                       g->stripe_redundancy    = u.stripe_redundancy;
-               }
 
+               *bucket_gen(ca, k.k->p.offset) = bch2_alloc_unpack(k).gen;
        }
        bch2_trans_iter_exit(&trans, &iter);
 
@@ -384,515 +366,711 @@ int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only)
        return ret;
 }
 
-/* Bucket IO clocks: */
+/* Free space/discard btree: */
 
-int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
-                             size_t bucket_nr, int rw)
+static int bch2_bucket_do_index(struct btree_trans *trans,
+                               struct bkey_s_c alloc_k,
+                               struct bkey_alloc_unpacked a,
+                               bool set)
 {
        struct bch_fs *c = trans->c;
+       struct bch_dev *ca = bch_dev_bkey_exists(c, a.dev);
        struct btree_iter iter;
-       struct bkey_s_c k;
-       struct bkey_alloc_unpacked u;
-       u64 *time, now;
-       int ret = 0;
+       struct bkey_s_c old;
+       struct bkey_i *k;
+       enum bucket_state state = bucket_state(a);
+       enum btree_id btree;
+       enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted;
+       enum bch_bkey_type new_type =  set ? KEY_TYPE_set : KEY_TYPE_deleted;
+       struct printbuf buf = PRINTBUF;
+       int ret;
 
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr),
-                            BTREE_ITER_CACHED|
-                            BTREE_ITER_INTENT);
-       k = bch2_btree_iter_peek_slot(&iter);
-       ret = bkey_err(k);
-       if (ret)
-               goto out;
+       if (state != BUCKET_free &&
+           state != BUCKET_need_discard)
+               return 0;
 
-       u = bch2_alloc_unpack(k);
+       k = bch2_trans_kmalloc(trans, sizeof(*k));
+       if (IS_ERR(k))
+               return PTR_ERR(k);
 
-       time = rw == READ ? &u.read_time : &u.write_time;
-       now = atomic64_read(&c->io_clock[rw].now);
-       if (*time == now)
-               goto out;
+       bkey_init(&k->k);
+       k->k.type = new_type;
 
-       *time = now;
+       switch (state) {
+       case BUCKET_free:
+               btree = BTREE_ID_freespace;
+               k->k.p = alloc_freespace_pos(a);
+               bch2_key_resize(&k->k, 1);
+               break;
+       case BUCKET_need_discard:
+               btree = BTREE_ID_need_discard;
+               k->k.p = POS(a.dev, a.bucket);
+               break;
+       default:
+               return 0;
+       }
 
-       ret   = bch2_alloc_write(trans, &iter, &u, 0) ?:
-               bch2_trans_commit(trans, NULL, NULL, 0);
-out:
+       bch2_trans_iter_init(trans, &iter, btree,
+                            bkey_start_pos(&k->k),
+                            BTREE_ITER_INTENT);
+       old = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(old);
+       if (ret)
+               goto err;
+
+       if (ca->mi.freespace_initialized &&
+           bch2_fs_inconsistent_on(old.k->type != old_type, c,
+                       "incorrect key when %s %s btree (got %s should be %s)\n"
+                       "  for %s",
+                       set ? "setting" : "clearing",
+                       bch2_btree_ids[btree],
+                       bch2_bkey_types[old.k->type],
+                       bch2_bkey_types[old_type],
+                       (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+               ret = -EIO;
+               goto err;
+       }
+
+       ret = bch2_trans_update(trans, &iter, k, 0);
+err:
        bch2_trans_iter_exit(trans, &iter);
+       printbuf_exit(&buf);
        return ret;
 }
 
-/* Background allocator thread: */
+int bch2_trans_mark_alloc(struct btree_trans *trans,
+                         struct bkey_s_c old, struct bkey_i *new,
+                         unsigned flags)
+{
+       struct bch_fs *c = trans->c;
+       struct bkey_alloc_unpacked old_u = bch2_alloc_unpack(old);
+       struct bkey_alloc_unpacked new_u = bch2_alloc_unpack(bkey_i_to_s_c(new));
+       u64 old_lru, new_lru;
+       bool need_repack = false;
+       int ret = 0;
+
+       if (new_u.dirty_sectors > old_u.dirty_sectors ||
+           new_u.cached_sectors > old_u.cached_sectors) {
+               new_u.read_time = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+               new_u.write_time = max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
+               new_u.need_inc_gen = true;
+               new_u.need_discard = true;
+               need_repack = true;
+       }
+
+       if (old_u.data_type && !new_u.data_type &&
+           old_u.gen == new_u.gen &&
+           !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) {
+               new_u.gen++;
+               new_u.need_inc_gen = false;
+               need_repack = true;
+       }
+
+       if (bucket_state(old_u) != bucket_state(new_u) ||
+           (bucket_state(new_u) == BUCKET_free &&
+            alloc_freespace_genbits(old_u) != alloc_freespace_genbits(new_u))) {
+               ret =   bch2_bucket_do_index(trans, old, old_u, false) ?:
+                       bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_u, true);
+               if (ret)
+                       return ret;
+       }
+
+       old_lru = alloc_lru_idx(old_u);
+       new_lru = alloc_lru_idx(new_u);
 
-/*
- * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
- * (marking them as invalidated on disk), then optionally issues discard
- * commands to the newly free buckets, then puts them on the various freelists.
- */
+       if (old_lru != new_lru) {
+               ret = bch2_lru_change(trans, new->k.p.inode, new->k.p.offset,
+                                     old_lru, &new_lru);
+               if (ret)
+                       return ret;
+
+               if (new_lru && new_u.read_time != new_lru) {
+                       new_u.read_time = new_lru;
+                       need_repack = true;
+               }
+       }
+
+       if (need_repack && !bkey_deleted(&new->k))
+               bch2_alloc_pack_v3((void *) new, new_u);
+
+       return 0;
+}
 
-static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
-                                      struct bucket_mark m)
+static int bch2_check_alloc_key(struct btree_trans *trans,
+                               struct btree_iter *alloc_iter)
 {
-       u8 gc_gen;
+       struct bch_fs *c = trans->c;
+       struct btree_iter discard_iter, freespace_iter, lru_iter;
+       struct bkey_alloc_unpacked a;
+       unsigned discard_key_type, freespace_key_type;
+       struct bkey_s_c alloc_k, k;
+       struct printbuf buf = PRINTBUF;
+       struct printbuf buf2 = PRINTBUF;
+       int ret;
 
-       if (!is_available_bucket(m))
-               return false;
+       alloc_k = bch2_btree_iter_peek(alloc_iter);
+       if (!alloc_k.k)
+               return 0;
 
-       if (m.owned_by_allocator)
-               return false;
+       ret = bkey_err(alloc_k);
+       if (ret)
+               return ret;
 
-       if (ca->buckets_nouse &&
-           test_bit(b, ca->buckets_nouse))
-               return false;
+       a = bch2_alloc_unpack(alloc_k);
+       discard_key_type = bucket_state(a) == BUCKET_need_discard
+               ? KEY_TYPE_set : 0;
+       freespace_key_type = bucket_state(a) == BUCKET_free
+               ? KEY_TYPE_set : 0;
 
-       if (ca->new_fs_bucket_idx) {
-               /*
-                * Device or filesystem is still being initialized, and we
-                * haven't fully marked superblocks & journal:
-                */
-               if (is_superblock_bucket(ca, b))
-                       return false;
+       bch2_trans_iter_init(trans, &discard_iter, BTREE_ID_need_discard,
+                            alloc_k.k->p, 0);
+       bch2_trans_iter_init(trans, &freespace_iter, BTREE_ID_freespace,
+                            alloc_freespace_pos(a), 0);
+       bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru,
+                            POS(a.dev, a.read_time), 0);
 
-               if (b < ca->new_fs_bucket_idx)
-                       return false;
+       k = bch2_btree_iter_peek_slot(&discard_iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (fsck_err_on(k.k->type != discard_key_type, c,
+                       "incorrect key in need_discard btree (got %s should be %s)\n"
+                       "  %s",
+                       bch2_bkey_types[k.k->type],
+                       bch2_bkey_types[discard_key_type],
+                       (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+               struct bkey_i *update =
+                       bch2_trans_kmalloc(trans, sizeof(*update));
+
+               ret = PTR_ERR_OR_ZERO(update);
+               if (ret)
+                       goto err;
+
+               bkey_init(&update->k);
+               update->k.type  = discard_key_type;
+               update->k.p     = discard_iter.pos;
+
+               ret =   bch2_trans_update(trans, &discard_iter, update, 0) ?:
+                       bch2_trans_commit(trans, NULL, NULL, 0);
+               if (ret)
+                       goto err;
        }
 
-       gc_gen = bucket_gc_gen(bucket(ca, b));
+       k = bch2_btree_iter_peek_slot(&freespace_iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (fsck_err_on(k.k->type != freespace_key_type, c,
+                       "incorrect key in freespace btree (got %s should be %s)\n"
+                       "  %s",
+                       bch2_bkey_types[k.k->type],
+                       bch2_bkey_types[freespace_key_type],
+                       (printbuf_reset(&buf),
+                        bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+               struct bkey_i *update =
+                       bch2_trans_kmalloc(trans, sizeof(*update));
+
+               ret = PTR_ERR_OR_ZERO(update);
+               if (ret)
+                       goto err;
 
-       ca->inc_gen_needs_gc            += gc_gen >= BUCKET_GC_GEN_MAX / 2;
-       ca->inc_gen_really_needs_gc     += gc_gen >= BUCKET_GC_GEN_MAX;
+               bkey_init(&update->k);
+               update->k.type  = freespace_key_type;
+               update->k.p     = freespace_iter.pos;
+               bch2_key_resize(&update->k, 1);
 
-       return gc_gen < BUCKET_GC_GEN_MAX;
-}
+               ret   = bch2_trans_update(trans, &freespace_iter, update, 0) ?:
+                       bch2_trans_commit(trans, NULL, NULL, 0);
+               if (ret)
+                       goto err;
+       }
 
-/*
- * Determines what order we're going to reuse buckets, smallest bucket_key()
- * first.
- */
+       if (bucket_state(a) == BUCKET_cached) {
+               if (fsck_err_on(!a.read_time, c,
+                               "cached bucket with read_time 0\n"
+                               "  %s",
+                       (printbuf_reset(&buf),
+                        bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
 
-static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
-                               u64 now, u64 last_seq_ondisk)
-{
-       unsigned used = m.cached_sectors;
+                       a.read_time = atomic64_read(&c->io_clock[READ].now);
 
-       if (used) {
-               /*
-                * Prefer to keep buckets that have been read more recently, and
-                * buckets that have more data in them:
-                */
-               u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
-               u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used));
+                       ret   = bch2_lru_change(trans, a.dev, a.bucket,
+                                               0, &a.read_time) ?:
+                               bch2_alloc_write(trans, alloc_iter, &a, BTREE_TRIGGER_NORUN);
+                               bch2_trans_commit(trans, NULL, NULL, 0);
+                       if (ret)
+                               goto err;
+               }
 
-               return -last_read_scaled;
-       } else {
-               /*
-                * Prefer to use buckets with smaller gc_gen so that we don't
-                * have to walk the btree and recalculate oldest_gen - but shift
-                * off the low bits so that buckets will still have equal sort
-                * keys when there's only a small difference, so that we can
-                * keep sequential buckets together:
-                */
-               return bucket_gc_gen(g) >> 4;
+               k = bch2_btree_iter_peek_slot(&lru_iter);
+               ret = bkey_err(k);
+               if (ret)
+                       goto err;
+
+               if (fsck_err_on(k.k->type != KEY_TYPE_lru ||
+                               le64_to_cpu(bkey_s_c_to_lru(k).v->idx) != a.bucket, c,
+                       "incorrect/missing lru entry\n"
+                       "  %s\n"
+                       "  %s",
+                       (printbuf_reset(&buf),
+                        bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf),
+                       (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) {
+                       u64 read_time = a.read_time;
+
+                       ret   = bch2_lru_change(trans, a.dev, a.bucket,
+                                               0, &a.read_time) ?:
+                               (a.read_time != read_time
+                                ? bch2_alloc_write(trans, alloc_iter, &a, BTREE_TRIGGER_NORUN)
+                                : 0) ?:
+                               bch2_trans_commit(trans, NULL, NULL, 0);
+                       if (ret)
+                               goto err;
+               }
        }
+err:
+fsck_err:
+       bch2_trans_iter_exit(trans, &lru_iter);
+       bch2_trans_iter_exit(trans, &freespace_iter);
+       bch2_trans_iter_exit(trans, &discard_iter);
+       printbuf_exit(&buf2);
+       printbuf_exit(&buf);
+       return ret;
 }
 
-static inline int bucket_alloc_cmp(alloc_heap *h,
-                                  struct alloc_heap_entry l,
-                                  struct alloc_heap_entry r)
+static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos)
 {
-       return  cmp_int(l.key, r.key) ?:
-               cmp_int(r.nr, l.nr) ?:
-               cmp_int(l.bucket, r.bucket);
-}
+       struct bch_dev *ca;
 
-static inline int bucket_idx_cmp(const void *_l, const void *_r)
-{
-       const struct alloc_heap_entry *l = _l, *r = _r;
+       if (pos.inode >= c->sb.nr_devices || !c->devs[pos.inode])
+               return false;
 
-       return cmp_int(l->bucket, r->bucket);
+       ca = bch_dev_bkey_exists(c, pos.inode);
+       return pos.offset >= ca->mi.first_bucket &&
+               pos.offset < ca->mi.nbuckets;
 }
 
-static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
+static int bch2_check_freespace_key(struct btree_trans *trans,
+                                   struct btree_iter *freespace_iter,
+                                   bool initial)
 {
-       struct bucket_array *buckets;
-       struct alloc_heap_entry e = { 0 };
-       u64 now, last_seq_ondisk;
-       size_t b, i, nr = 0;
-
-       down_read(&ca->bucket_lock);
+       struct bch_fs *c = trans->c;
+       struct btree_iter alloc_iter;
+       struct bkey_s_c k, freespace_k;
+       struct bkey_alloc_unpacked a;
+       u64 genbits;
+       struct bpos pos;
+       struct bkey_i *update;
+       struct printbuf buf = PRINTBUF;
+       int ret;
 
-       buckets = bucket_array(ca);
-       ca->alloc_heap.used = 0;
-       now = atomic64_read(&c->io_clock[READ].now);
-       last_seq_ondisk = c->journal.flushed_seq_ondisk;
+       freespace_k = bch2_btree_iter_peek(freespace_iter);
+       if (!freespace_k.k)
+               return 1;
 
-       /*
-        * Find buckets with lowest read priority, by building a maxheap sorted
-        * by read priority and repeatedly replacing the maximum element until
-        * all buckets have been visited.
-        */
-       for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
-               struct bucket *g = &buckets->b[b];
-               struct bucket_mark m = READ_ONCE(g->mark);
-               unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);
+       ret = bkey_err(freespace_k);
+       if (ret)
+               return ret;
 
-               cond_resched();
+       pos = freespace_iter->pos;
+       pos.offset &= ~(~0ULL << 56);
+       genbits = freespace_iter->pos.offset & (~0ULL << 56);
 
-               if (!bch2_can_invalidate_bucket(ca, b, m))
-                       continue;
+       bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc, pos, 0);
 
-               if (!m.data_type &&
-                   bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
-                                                    last_seq_ondisk,
-                                                    ca->dev_idx, b)) {
-                       ca->buckets_waiting_on_journal++;
-                       continue;
-               }
+       if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c,
+                       "%llu:%llu set in freespace btree but device or bucket does not exist",
+                       pos.inode, pos.offset))
+               goto delete;
 
-               if (e.nr && e.bucket + e.nr == b && e.key == key) {
-                       e.nr++;
-               } else {
-                       if (e.nr)
-                               heap_add_or_replace(&ca->alloc_heap, e,
-                                       -bucket_alloc_cmp, NULL);
-
-                       e = (struct alloc_heap_entry) {
-                               .bucket = b,
-                               .nr     = 1,
-                               .key    = key,
-                       };
-               }
-       }
+       k = bch2_btree_iter_peek_slot(&alloc_iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
 
-       if (e.nr)
-               heap_add_or_replace(&ca->alloc_heap, e,
-                               -bucket_alloc_cmp, NULL);
+       a = bch2_alloc_unpack(k);
 
-       for (i = 0; i < ca->alloc_heap.used; i++)
-               nr += ca->alloc_heap.data[i].nr;
+       if (fsck_err_on(bucket_state(a) != BUCKET_free ||
+                       genbits != alloc_freespace_genbits(a), c,
+                       "%s\n  incorrectly set in freespace index (free %u, genbits %llu should be %llu)",
+                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf),
+                       bucket_state(a) == BUCKET_free,
+                       genbits >> 56, alloc_freespace_genbits(a) >> 56))
+               goto delete;
+out:
+err:
+fsck_err:
+       bch2_trans_iter_exit(trans, &alloc_iter);
+       printbuf_exit(&buf);
+       return ret;
+delete:
+       update = bch2_trans_kmalloc(trans, sizeof(*update));
+       ret = PTR_ERR_OR_ZERO(update);
+       if (ret)
+               goto err;
 
-       while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) {
-               nr -= ca->alloc_heap.data[0].nr;
-               heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL);
-       }
+       bkey_init(&update->k);
+       update->k.p = freespace_iter->pos;
+       bch2_key_resize(&update->k, 1);
 
-       up_read(&ca->bucket_lock);
+       ret   = bch2_trans_update(trans, freespace_iter, update, 0) ?:
+               bch2_trans_commit(trans, NULL, NULL, 0);
+       goto out;
 }
 
-static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
+int bch2_check_alloc_info(struct bch_fs *c, bool initial)
 {
-       size_t i, nr = 0;
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret = 0;
 
-       ca->inc_gen_needs_gc                    = 0;
-       ca->inc_gen_really_needs_gc             = 0;
-       ca->buckets_waiting_on_journal          = 0;
+       bch2_trans_init(&trans, c, 0, 0);
 
-       find_reclaimable_buckets_lru(c, ca);
+       for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+                          BTREE_ITER_PREFETCH, k, ret) {
+               ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+                       bch2_check_alloc_key(&trans, &iter));
+               if (ret)
+                       break;
+       }
+       bch2_trans_iter_exit(&trans, &iter);
 
-       heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL);
+       if (ret)
+               goto err;
 
-       for (i = 0; i < ca->alloc_heap.used; i++)
-               nr += ca->alloc_heap.data[i].nr;
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_freespace, POS_MIN,
+                            BTREE_ITER_PREFETCH);
+       while (1) {
+               ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+                       bch2_check_freespace_key(&trans, &iter, initial));
+               if (ret)
+                       break;
 
-       return nr;
+               bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
+       }
+       bch2_trans_iter_exit(&trans, &iter);
+err:
+       bch2_trans_exit(&trans);
+       return ret < 0 ? ret : 0;
 }
 
-static int bucket_invalidate_btree(struct btree_trans *trans,
-                                  struct bch_dev *ca, u64 b,
-                                  struct bkey_alloc_unpacked *u)
+static int bch2_clear_need_discard(struct btree_trans *trans, struct bpos pos,
+                                  struct bch_dev *ca, bool *discard_done)
 {
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
        struct bkey_s_c k;
+       struct bkey_alloc_unpacked a;
+       struct printbuf buf = PRINTBUF;
        int ret;
 
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
-                            POS(ca->dev_idx, b),
-                            BTREE_ITER_CACHED|
-                            BTREE_ITER_INTENT);
-
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, pos,
+                            BTREE_ITER_CACHED);
        k = bch2_btree_iter_peek_slot(&iter);
        ret = bkey_err(k);
        if (ret)
-               goto err;
+               goto out;
 
-       *u = bch2_alloc_unpack(k);
-       u->gen++;
-       u->data_type            = 0;
-       u->dirty_sectors        = 0;
-       u->cached_sectors       = 0;
-       u->read_time            = atomic64_read(&c->io_clock[READ].now);
-       u->write_time           = atomic64_read(&c->io_clock[WRITE].now);
+       a = bch2_alloc_unpack(k);
 
-       ret = bch2_alloc_write(trans, &iter, u,
-                              BTREE_TRIGGER_BUCKET_INVALIDATE);
-err:
+       if (a.need_inc_gen) {
+               a.gen++;
+               a.need_inc_gen = false;
+               goto write;
+       }
+
+       BUG_ON(a.journal_seq > c->journal.flushed_seq_ondisk);
+
+       if (bch2_fs_inconsistent_on(!a.need_discard, c,
+                       "%s\n  incorrectly set in need_discard btree",
+                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+               ret = -EIO;
+               goto out;
+       }
+
+       if (!*discard_done && ca->mi.discard && !c->opts.nochanges) {
+               /*
+                * This works without any other locks because this is the only
+                * thread that removes items from the need_discard tree
+                */
+               bch2_trans_unlock(trans);
+               blkdev_issue_discard(ca->disk_sb.bdev,
+                                    k.k->p.offset * ca->mi.bucket_size,
+                                    ca->mi.bucket_size,
+                                    GFP_KERNEL, 0);
+               *discard_done = true;
+
+               ret = bch2_trans_relock(trans);
+               if (ret)
+                       goto out;
+       }
+
+       a.need_discard = false;
+write:
+       ret = bch2_alloc_write(trans, &iter, &a, 0);
+out:
        bch2_trans_iter_exit(trans, &iter);
+       printbuf_exit(&buf);
        return ret;
 }
 
-static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
-                                     u64 *journal_seq, unsigned flags)
+static void bch2_do_discards_work(struct work_struct *work)
 {
-       struct bkey_alloc_unpacked u;
-       size_t b;
-       u64 commit_seq = 0;
-       int ret = 0;
+       struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
+       struct bch_dev *ca = NULL;
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
 
-       /*
-        * If the read-only path is trying to shut down, we can't be generating
-        * new btree updates:
-        */
-       if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags))
-               return 1;
+       bch2_trans_init(&trans, c, 0, 0);
 
-       BUG_ON(!ca->alloc_heap.used ||
-              !ca->alloc_heap.data[0].nr);
-       b = ca->alloc_heap.data[0].bucket;
+       for_each_btree_key(&trans, iter, BTREE_ID_need_discard,
+                          POS_MIN, 0, k, ret) {
+               bool discard_done = false;
 
-       /* first, put on free_inc and mark as owned by allocator: */
-       percpu_down_read(&c->mark_lock);
+               if (ca && k.k->p.inode != ca->dev_idx) {
+                       percpu_ref_put(&ca->io_ref);
+                       ca = NULL;
+               }
+
+               if (!ca) {
+                       ca = bch_dev_bkey_exists(c, k.k->p.inode);
+                       if (!percpu_ref_tryget(&ca->io_ref)) {
+                               ca = NULL;
+                               bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+                               continue;
+                       }
+               }
 
-       bch2_mark_alloc_bucket(c, ca, b, true);
+               if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+                               c->journal.flushed_seq_ondisk,
+                               k.k->p.inode, k.k->p.offset) ||
+                   bch2_bucket_is_open_safe(c, k.k->p.inode, k.k->p.offset))
+                       continue;
 
-       spin_lock(&c->freelist_lock);
-       verify_not_on_freelist(c, ca, b);
-       BUG_ON(!fifo_push(&ca->free_inc, b));
-       spin_unlock(&c->freelist_lock);
+               ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+                               bch2_clear_need_discard(&trans, k.k->p, ca, &discard_done));
+               if (ret)
+                       break;
+       }
+       bch2_trans_iter_exit(&trans, &iter);
 
-       percpu_up_read(&c->mark_lock);
+       if (ca)
+               percpu_ref_put(&ca->io_ref);
 
-       ret = bch2_trans_do(c, NULL, &commit_seq,
-                           BTREE_INSERT_NOCHECK_RW|
-                           BTREE_INSERT_NOFAIL|
-                           BTREE_INSERT_JOURNAL_RESERVED|
-                           flags,
-                           bucket_invalidate_btree(&trans, ca, b, &u));
+       bch2_trans_exit(&trans);
+       percpu_ref_put(&c->writes);
+}
 
-       if (!ret) {
-               /* remove from alloc_heap: */
-               struct alloc_heap_entry e, *top = ca->alloc_heap.data;
+void bch2_do_discards(struct bch_fs *c)
+{
+       if (percpu_ref_tryget(&c->writes) &&
+           !queue_work(system_long_wq, &c->discard_work))
+               percpu_ref_put(&c->writes);
+}
 
-               top->bucket++;
-               top->nr--;
+static int invalidate_one_bucket(struct btree_trans *trans, struct bch_dev *ca)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter lru_iter, alloc_iter = { NULL };
+       struct bkey_s_c k;
+       struct bkey_alloc_unpacked a;
+       u64 bucket, idx;
+       int ret;
 
-               if (!top->nr)
-                       heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
+       bch2_trans_iter_init(trans, &lru_iter, BTREE_ID_lru,
+                            POS(ca->dev_idx, 0), 0);
+       k = bch2_btree_iter_peek(&lru_iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto out;
 
-               /*
-                * If we invalidating cached data then we need to wait on the
-                * journal commit:
-                */
-               if (u.data_type)
-                       *journal_seq = max(*journal_seq, commit_seq);
+       if (!k.k || k.k->p.inode != ca->dev_idx)
+               goto out;
 
-               /*
-                * We already waiting on u.alloc_seq when we filtered out
-                * buckets that need journal commit:
-                */
-               BUG_ON(*journal_seq > u.journal_seq);
-       } else {
-               size_t b2;
+       if (bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_lru, c,
+                                   "non lru key in lru btree"))
+               goto out;
 
-               /* remove from free_inc: */
-               percpu_down_read(&c->mark_lock);
-               spin_lock(&c->freelist_lock);
+       idx     = k.k->p.offset;
+       bucket  = le64_to_cpu(bkey_s_c_to_lru(k).v->idx);
 
-               bch2_mark_alloc_bucket(c, ca, b, false);
+       bch2_trans_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
+                            POS(ca->dev_idx, bucket),
+                            BTREE_ITER_CACHED|
+                            BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&alloc_iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto out;
 
-               BUG_ON(!fifo_pop_back(&ca->free_inc, b2));
-               BUG_ON(b != b2);
+       a = bch2_alloc_unpack(k);
 
-               spin_unlock(&c->freelist_lock);
-               percpu_up_read(&c->mark_lock);
-       }
+       if (bch2_fs_inconsistent_on(idx != alloc_lru_idx(a), c,
+                       "invalidating bucket with wrong lru idx (got %llu should be %llu",
+                       idx, alloc_lru_idx(a)))
+               goto out;
 
-       return ret < 0 ? ret : 0;
+       a.gen++;
+       a.need_inc_gen          = false;
+       a.data_type             = 0;
+       a.dirty_sectors         = 0;
+       a.cached_sectors        = 0;
+       a.read_time             = atomic64_read(&c->io_clock[READ].now);
+       a.write_time            = atomic64_read(&c->io_clock[WRITE].now);
+
+       ret = bch2_alloc_write(trans, &alloc_iter, &a,
+                              BTREE_TRIGGER_BUCKET_INVALIDATE);
+out:
+       bch2_trans_iter_exit(trans, &alloc_iter);
+       bch2_trans_iter_exit(trans, &lru_iter);
+       return ret;
 }
 
-/*
- * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc:
- */
-static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
+static void bch2_do_invalidates_work(struct work_struct *work)
 {
-       u64 journal_seq = 0;
+       struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
+       struct bch_dev *ca;
+       struct btree_trans trans;
+       unsigned i;
        int ret = 0;
 
-       /* Only use nowait if we've already invalidated at least one bucket: */
-       while (!ret &&
-              !fifo_full(&ca->free_inc) &&
-              ca->alloc_heap.used) {
-               if (kthread_should_stop()) {
-                       ret = 1;
+       bch2_trans_init(&trans, c, 0, 0);
+
+       for_each_member_device(ca, c, i)
+               while (!ret && should_invalidate_buckets(ca))
+                       ret = __bch2_trans_do(&trans, NULL, NULL,
+                                             BTREE_INSERT_NOFAIL,
+                                       invalidate_one_bucket(&trans, ca));
+
+       bch2_trans_exit(&trans);
+       percpu_ref_put(&c->writes);
+}
+
+void bch2_do_invalidates(struct bch_fs *c)
+{
+       if (percpu_ref_tryget(&c->writes))
+               queue_work(system_long_wq, &c->invalidate_work);
+}
+
+static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_alloc_unpacked a;
+       struct bch_member *m;
+       int ret;
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       for_each_btree_key(&trans, iter, BTREE_ID_alloc,
+                          POS(ca->dev_idx, ca->mi.first_bucket),
+                          BTREE_ITER_SLOTS|
+                          BTREE_ITER_PREFETCH, k, ret) {
+               if (iter.pos.offset >= ca->mi.nbuckets)
                        break;
-               }
 
-               ret = bch2_invalidate_one_bucket(c, ca, &journal_seq,
-                               (!fifo_empty(&ca->free_inc)
-                                ? BTREE_INSERT_NOWAIT : 0));
-               /*
-                * We only want to batch up invalidates when they're going to
-                * require flushing the journal:
-                */
-               if (!journal_seq)
+               a = bch2_alloc_unpack(k);
+               ret = __bch2_trans_do(&trans, NULL, NULL,
+                                     BTREE_INSERT_LAZY_RW,
+                                bch2_bucket_do_index(&trans, k, a, true));
+               if (ret)
                        break;
        }
+       bch2_trans_iter_exit(&trans, &iter);
 
-       /* If we used NOWAIT, don't return the error: */
-       if (!fifo_empty(&ca->free_inc))
-               ret = 0;
-       if (ret < 0)
-               bch_err(ca, "error invalidating buckets: %i", ret);
-       if (ret)
-               return ret;
+       bch2_trans_exit(&trans);
 
-       if (journal_seq)
-               ret = bch2_journal_flush_seq(&c->journal, journal_seq);
        if (ret) {
-               bch_err(ca, "journal error: %i", ret);
+               bch_err(ca, "error initializing free space: %i", ret);
                return ret;
        }
 
-       return 0;
-}
+       mutex_lock(&c->sb_lock);
+       m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx;
+       SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true);
+       mutex_unlock(&c->sb_lock);
 
-static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state)
-{
-       if (ca->allocator_state != new_state) {
-               ca->allocator_state = new_state;
-               closure_wake_up(&ca->fs->freelist_wait);
-       }
+       return ret;
 }
 
-static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
+int bch2_fs_freespace_init(struct bch_fs *c)
 {
+       struct bch_dev *ca;
        unsigned i;
        int ret = 0;
+       bool doing_init = false;
 
-       spin_lock(&c->freelist_lock);
-       for (i = 0; i < RESERVE_NR; i++) {
-               /*
-                * Don't strand buckets on the copygc freelist until
-                * after recovery is finished:
-                */
-               if (i == RESERVE_MOVINGGC &&
-                   !test_bit(BCH_FS_STARTED, &c->flags))
+       /*
+        * We can crash during the device add path, so we need to check this on
+        * every mount:
+        */
+
+       for_each_member_device(ca, c, i) {
+               if (ca->mi.freespace_initialized)
                        continue;
 
-               if (fifo_push(&ca->free[i], b)) {
-                       fifo_pop(&ca->free_inc, b);
-                       ret = 1;
-                       break;
+               if (!doing_init) {
+                       bch_info(c, "initializing freespace");
+                       doing_init = true;
                }
-       }
-       spin_unlock(&c->freelist_lock);
 
-       ca->allocator_state = ret
-               ? ALLOCATOR_running
-               : ALLOCATOR_blocked_full;
-       closure_wake_up(&c->freelist_wait);
-       return ret;
-}
-
-static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
-{
-       if (!c->opts.nochanges &&
-           ca->mi.discard &&
-           blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
-               blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b),
-                                    ca->mi.bucket_size, GFP_NOFS, 0);
-}
+               ret = bch2_dev_freespace_init(c, ca);
+               if (ret) {
+                       percpu_ref_put(&ca->ref);
+                       return ret;
+               }
+       }
 
-static bool allocator_thread_running(struct bch_dev *ca)
-{
-       unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw &&
-               test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags)
-               ? ALLOCATOR_running
-               : ALLOCATOR_stopped;
-       alloc_thread_set_state(ca, state);
-       return state == ALLOCATOR_running;
-}
+       if (doing_init) {
+               mutex_lock(&c->sb_lock);
+               bch2_write_super(c);
+               mutex_unlock(&c->sb_lock);
 
-static int buckets_available(struct bch_dev *ca, unsigned long gc_count)
-{
-       s64 available = dev_buckets_reclaimable(ca) -
-               (gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0);
-       bool ret = available > 0;
+               bch_verbose(c, "done initializing freespace");
+       }
 
-       alloc_thread_set_state(ca, ret
-                              ? ALLOCATOR_running
-                              : ALLOCATOR_blocked);
        return ret;
 }
 
-/**
- * bch_allocator_thread - move buckets from free_inc to reserves
- *
- * The free_inc FIFO is populated by find_reclaimable_buckets(), and
- * the reserves are depleted by bucket allocation. When we run out
- * of free_inc, try to invalidate some buckets and write out
- * prios and gens.
- */
-static int bch2_allocator_thread(void *arg)
-{
-       struct bch_dev *ca = arg;
-       struct bch_fs *c = ca->fs;
-       unsigned long gc_count = c->gc_count;
-       size_t nr;
-       int ret;
-
-       set_freezable();
-
-       while (1) {
-               ret = kthread_wait_freezable(allocator_thread_running(ca));
-               if (ret)
-                       goto stop;
-
-               while (!ca->alloc_heap.used) {
-                       cond_resched();
-
-                       ret = kthread_wait_freezable(buckets_available(ca, gc_count));
-                       if (ret)
-                               goto stop;
-
-                       gc_count = c->gc_count;
-                       nr = find_reclaimable_buckets(c, ca);
-
-                       if (!nr && ca->buckets_waiting_on_journal) {
-                               ret = bch2_journal_flush(&c->journal);
-                               if (ret)
-                                       goto stop;
-                       } else if (nr < (ca->mi.nbuckets >> 6) &&
-                                  ca->buckets_waiting_on_journal >= nr / 2) {
-                               bch2_journal_flush_async(&c->journal, NULL);
-                       }
+/* Bucket IO clocks: */
 
-                       if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
-                            ca->inc_gen_really_needs_gc) &&
-                           c->gc_thread) {
-                               atomic_inc(&c->kick_gc);
-                               wake_up_process(c->gc_thread);
-                       }
+int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
+                             size_t bucket_nr, int rw)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_alloc_unpacked u;
+       u64 *time, now;
+       int ret = 0;
 
-                       trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
-                                        ca->inc_gen_really_needs_gc);
-               }
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr),
+                            BTREE_ITER_CACHED|
+                            BTREE_ITER_INTENT);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto out;
 
-               ret = bch2_invalidate_buckets(c, ca);
-               if (ret)
-                       goto stop;
+       u = bch2_alloc_unpack(k);
 
-               while (!fifo_empty(&ca->free_inc)) {
-                       u64 b = fifo_peek(&ca->free_inc);
+       time = rw == READ ? &u.read_time : &u.write_time;
+       now = atomic64_read(&c->io_clock[rw].now);
+       if (*time == now)
+               goto out;
 
-                       discard_one_bucket(c, ca, b);
+       *time = now;
 
-                       ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b));
-                       if (ret)
-                               goto stop;
-               }
-       }
-stop:
-       alloc_thread_set_state(ca, ALLOCATOR_stopped);
-       return 0;
+       ret   = bch2_alloc_write(trans, &iter, &u, 0) ?:
+               bch2_trans_commit(trans, NULL, NULL, 0);
+out:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
 }
 
 /* Startup/shutdown (ro/rw): */
@@ -903,7 +1081,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
        u64 capacity = 0, reserved_sectors = 0, gc_reserve;
        unsigned bucket_size_max = 0;
        unsigned long ra_pages = 0;
-       unsigned i, j;
+       unsigned i;
 
        lockdep_assert_held(&c->state_lock);
 
@@ -934,8 +1112,9 @@ void bch2_recalc_capacity(struct bch_fs *c)
                 * allocations for foreground writes must wait -
                 * not -ENOSPC calculations.
                 */
-               for (j = 0; j < RESERVE_NONE; j++)
-                       dev_reserve += ca->free[j].size;
+
+               dev_reserve += ca->nr_btree_reserve * 2;
+               dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */
 
                dev_reserve += 1;       /* btree write point */
                dev_reserve += 1;       /* copygc write point */
@@ -991,8 +1170,6 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
 {
        unsigned i;
 
-       BUG_ON(ca->alloc_thread);
-
        /* First, remove device from allocation groups: */
 
        for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
@@ -1066,62 +1243,9 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
                        set_bit(ca->dev_idx, c->rw_devs[i].d);
 }
 
-void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
-{
-       if (ca->alloc_thread)
-               closure_wait_event(&c->freelist_wait,
-                                  ca->allocator_state != ALLOCATOR_running);
-}
-
-/* stop allocator thread: */
-void bch2_dev_allocator_stop(struct bch_dev *ca)
-{
-       struct task_struct *p;
-
-       p = rcu_dereference_protected(ca->alloc_thread, 1);
-       ca->alloc_thread = NULL;
-
-       /*
-        * We need an rcu barrier between setting ca->alloc_thread = NULL and
-        * the thread shutting down to avoid bch2_wake_allocator() racing:
-        *
-        * XXX: it would be better to have the rcu barrier be asynchronous
-        * instead of blocking us here
-        */
-       synchronize_rcu();
-
-       if (p) {
-               kthread_stop(p);
-               put_task_struct(p);
-       }
-}
-
-/* start allocator thread: */
-int bch2_dev_allocator_start(struct bch_dev *ca)
-{
-       struct task_struct *p;
-
-       /*
-        * allocator thread already started?
-        */
-       if (ca->alloc_thread)
-               return 0;
-
-       p = kthread_create(bch2_allocator_thread, ca,
-                          "bch-alloc/%s", ca->name);
-       if (IS_ERR(p)) {
-               bch_err(ca->fs, "error creating allocator thread: %li",
-                       PTR_ERR(p));
-               return PTR_ERR(p);
-       }
-
-       get_task_struct(p);
-       rcu_assign_pointer(ca->alloc_thread, p);
-       wake_up_process(p);
-       return 0;
-}
-
 void bch2_fs_allocator_background_init(struct bch_fs *c)
 {
        spin_lock_init(&c->freelist_lock);
+       INIT_WORK(&c->discard_work, bch2_do_discards_work);
+       INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
 }
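With the per-device allocator thread removed, discarding and cached-bucket invalidation are now ordinary work items. A minimal sketch, assuming a caller that has just finished work which may have produced need_discard buckets or consumed free space (the helper name is hypothetical; bch2_do_discards() and bch2_do_invalidates() take their own ref on c->writes and queue the work items set up in bch2_fs_allocator_background_init()):

    /* Hypothetical caller nudging the new background workers. */
    static void kick_allocator_background(struct bch_fs *c)
    {
            bch2_do_discards(c);
            bch2_do_invalidates(c);
    }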
index 3eaa6d2042861f6ba46020ee0fa2f7d79140ea7e..06539e036f13cacf127b6af17e3ebd86b3b01d1a 100644 (file)
@@ -8,8 +8,6 @@
 #include "debug.h"
 #include "super.h"
 
-extern const char * const bch2_allocator_states[];
-
 struct bkey_alloc_unpacked {
        u64             journal_seq;
        u64             bucket;
@@ -17,6 +15,8 @@ struct bkey_alloc_unpacked {
        u8              gen;
        u8              oldest_gen;
        u8              data_type;
+       bool            need_discard:1;
+       bool            need_inc_gen:1;
 #define x(_name, _bits)        u##_bits _name;
        BCH_ALLOC_FIELDS_V2()
 #undef  x
@@ -25,6 +25,50 @@ struct bkey_alloc_unpacked {
 /* How out of date a pointer gen is allowed to be: */
 #define BUCKET_GC_GEN_MAX      96U
 
+static inline u8 alloc_gc_gen(struct bkey_alloc_unpacked a)
+{
+       return a.gen - a.oldest_gen;
+}
+
+enum bucket_state {
+       BUCKET_free,
+       BUCKET_need_gc_gens,
+       BUCKET_need_discard,
+       BUCKET_cached,
+       BUCKET_dirty,
+};
+
+extern const char * const bch2_bucket_states[];
+
+static inline enum bucket_state bucket_state(struct bkey_alloc_unpacked a)
+{
+       if (a.dirty_sectors || a.stripe)
+               return BUCKET_dirty;
+       if (a.cached_sectors)
+               return BUCKET_cached;
+       BUG_ON(a.data_type);
+       if (a.need_discard)
+               return BUCKET_need_discard;
+       if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX)
+               return BUCKET_need_gc_gens;
+       return BUCKET_free;
+}
+
+static inline u64 alloc_lru_idx(struct bkey_alloc_unpacked a)
+{
+       return bucket_state(a) == BUCKET_cached ? a.read_time : 0;
+}
+
+static inline u64 alloc_freespace_genbits(struct bkey_alloc_unpacked a)
+{
+       return ((u64) alloc_gc_gen(a) >> 4) << 56;
+}
+
+static inline struct bpos alloc_freespace_pos(struct bkey_alloc_unpacked a)
+{
+       return POS(a.dev, a.bucket | alloc_freespace_genbits(a));
+}
+
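The freespace btree key packs two things into the key offset: the bucket number in the low 56 bits and the bucket's gc gen (shifted down by four) in the high byte, which is exactly what bch2_check_freespace_key() splits apart again. A standalone worked example with hypothetical gen values:

    #include <assert.h>
    #include <stdint.h>

    /* Worked example of the freespace key layout; all values are made up. */
    int main(void)
    {
            uint64_t bucket  = 1000;
            uint8_t  gc_gen  = 37 - 2;                          /* a.gen - a.oldest_gen */
            uint64_t genbits = ((uint64_t) gc_gen >> 4) << 56;  /* alloc_freespace_genbits() */
            uint64_t offset  = bucket | genbits;                /* alloc_freespace_pos().offset */

            /* bch2_check_freespace_key() recovers both halves from the key: */
            assert((offset & ~(~0ULL << 56)) == bucket);
            assert((offset &  (~0ULL << 56)) == genbits);
            return 0;
    }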
 /* returns true if not equal */
 static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l,
                                           struct bkey_alloc_unpacked r)
@@ -65,18 +109,21 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 #define bch2_bkey_ops_alloc (struct bkey_ops) {                \
        .key_invalid    = bch2_alloc_v1_invalid,        \
        .val_to_text    = bch2_alloc_to_text,           \
+       .trans_trigger  = bch2_trans_mark_alloc,        \
        .atomic_trigger = bch2_mark_alloc,              \
 }
 
 #define bch2_bkey_ops_alloc_v2 (struct bkey_ops) {     \
        .key_invalid    = bch2_alloc_v2_invalid,        \
        .val_to_text    = bch2_alloc_to_text,           \
+       .trans_trigger  = bch2_trans_mark_alloc,        \
        .atomic_trigger = bch2_mark_alloc,              \
 }
 
 #define bch2_bkey_ops_alloc_v3 (struct bkey_ops) {     \
        .key_invalid    = bch2_alloc_v3_invalid,        \
        .val_to_text    = bch2_alloc_to_text,           \
+       .trans_trigger  = bch2_trans_mark_alloc,        \
        .atomic_trigger = bch2_mark_alloc,              \
 }
 
@@ -87,44 +134,31 @@ static inline bool bkey_is_alloc(const struct bkey *k)
                k->type == KEY_TYPE_alloc_v3;
 }
 
-int bch2_alloc_read(struct bch_fs *, bool, bool);
+int bch2_alloc_read(struct bch_fs *);
+
+int bch2_trans_mark_alloc(struct btree_trans *, struct bkey_s_c,
+                         struct bkey_i *, unsigned);
+int bch2_check_alloc_info(struct bch_fs *, bool);
+void bch2_do_discards(struct bch_fs *);
 
-static inline void bch2_wake_allocator(struct bch_dev *ca)
+static inline bool should_invalidate_buckets(struct bch_dev *ca)
 {
-       struct task_struct *p;
+       struct bch_dev_usage u = bch2_dev_usage_read(ca);
 
-       rcu_read_lock();
-       p = rcu_dereference(ca->alloc_thread);
-       if (p)
-               wake_up_process(p);
-       rcu_read_unlock();
+       return u.d[BCH_DATA_cached].buckets &&
+               u.buckets_unavailable + u.d[BCH_DATA_cached].buckets <
+               ca->mi.nbuckets >> 7;
 }
 
-static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca,
-                                         size_t bucket)
-{
-       if (bch2_expensive_debug_checks) {
-               size_t iter;
-               long i;
-               unsigned j;
-
-               for (j = 0; j < RESERVE_NR; j++)
-                       fifo_for_each_entry(i, &ca->free[j], iter)
-                               BUG_ON(i == bucket);
-               fifo_for_each_entry(i, &ca->free_inc, iter)
-                       BUG_ON(i == bucket);
-       }
-}
+void bch2_do_invalidates(struct bch_fs *);
+
+int bch2_fs_freespace_init(struct bch_fs *);
 
 void bch2_recalc_capacity(struct bch_fs *);
 
 void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
 void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
 
-void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
-void bch2_dev_allocator_stop(struct bch_dev *);
-int bch2_dev_allocator_start(struct bch_dev *);
-
 void bch2_fs_allocator_background_init(struct bch_fs *);
 
 #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
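should_invalidate_buckets() above is what decides whether bch2_do_invalidates() has work to do: there must be cached buckets, and unavailable plus cached buckets must still sit below 1/128th of the device. A small standalone sketch of that arithmetic (the usage numbers are invented; bch2_dev_usage_read() supplies the real ones):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Mirrors the threshold in should_invalidate_buckets(); inputs are hypothetical. */
    static bool should_invalidate(uint64_t nbuckets, uint64_t unavailable, uint64_t cached)
    {
            return cached && unavailable + cached < nbuckets >> 7;
    }

    int main(void)
    {
            /* 1M-bucket device: the cutoff is 1048576 >> 7 == 8192 buckets */
            printf("%d\n", should_invalidate(1 << 20, 4000, 2000));    /* prints 1 */
            printf("%d\n", should_invalidate(1 << 20, 500000, 2000));  /* prints 0 */
            return 0;
    }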
index 9b81ed2665c8d93324d19dc1e5f8e5f0e4930eae..178d7c058597ada79a67a58359b7ca0554885e83 100644 (file)
 #include "bcachefs.h"
 #include "alloc_background.h"
 #include "alloc_foreground.h"
+#include "btree_iter.h"
+#include "btree_update.h"
 #include "btree_gc.h"
 #include "buckets.h"
+#include "buckets_waiting_for_journal.h"
 #include "clock.h"
 #include "debug.h"
 #include "disk_groups.h"
 #include "ec.h"
+#include "error.h"
 #include "io.h"
+#include "journal.h"
 
 #include <linux/math64.h>
 #include <linux/rculist.h>
@@ -78,7 +83,6 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
        percpu_down_read(&c->mark_lock);
        spin_lock(&ob->lock);
 
-       bch2_mark_alloc_bucket(c, ca, ob->bucket, false);
        ob->valid = false;
        ob->data_type = 0;
 
@@ -178,39 +182,28 @@ static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
        }
 }
 
-/**
- * bch_bucket_alloc - allocate a single bucket from a specific device
- *
- * Returns index of bucket on success, 0 on failure
- * */
-struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
-                                     enum alloc_reserve reserve,
-                                     bool may_alloc_partial,
-                                     struct closure *cl)
+static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+                                             enum alloc_reserve reserve,
+                                             struct bkey_alloc_unpacked a,
+                                             size_t *need_journal_commit,
+                                             struct closure *cl)
 {
        struct open_bucket *ob;
-       long b = 0;
 
-       spin_lock(&c->freelist_lock);
+       if (unlikely(ca->buckets_nouse && test_bit(a.bucket, ca->buckets_nouse)))
+               return NULL;
 
-       if (may_alloc_partial) {
-               int i;
-
-               for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
-                       ob = c->open_buckets + ca->open_buckets_partial[i];
-
-                       if (reserve <= ob->alloc_reserve) {
-                               array_remove_item(ca->open_buckets_partial,
-                                                 ca->open_buckets_partial_nr,
-                                                 i);
-                               ob->on_partial_list = false;
-                               ob->alloc_reserve = reserve;
-                               spin_unlock(&c->freelist_lock);
-                               return ob;
-                       }
-               }
+       if (bch2_bucket_is_open(c, ca->dev_idx, a.bucket))
+               return NULL;
+
+       if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+                       c->journal.flushed_seq_ondisk, ca->dev_idx, a.bucket)) {
+               (*need_journal_commit)++;
+               return NULL;
        }
 
+       spin_lock(&c->freelist_lock);
+
        if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) {
                if (cl)
                        closure_wait(&c->open_buckets_wait, cl);
@@ -219,36 +212,17 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
                        c->blocked_allocate_open_bucket = local_clock();
 
                spin_unlock(&c->freelist_lock);
+
                trace_open_bucket_alloc_fail(ca, reserve);
                return ERR_PTR(-OPEN_BUCKETS_EMPTY);
        }
 
-       if (likely(fifo_pop(&ca->free[RESERVE_NONE], b)))
-               goto out;
-
-       switch (reserve) {
-       case RESERVE_BTREE_MOVINGGC:
-       case RESERVE_MOVINGGC:
-               if (fifo_pop(&ca->free[RESERVE_MOVINGGC], b))
-                       goto out;
-               break;
-       default:
-               break;
+       /* Recheck under lock: */
+       if (bch2_bucket_is_open(c, ca->dev_idx, a.bucket)) {
+               spin_unlock(&c->freelist_lock);
+               return NULL;
        }
 
-       if (cl)
-               closure_wait(&c->freelist_wait, cl);
-
-       if (!c->blocked_allocate)
-               c->blocked_allocate = local_clock();
-
-       spin_unlock(&c->freelist_lock);
-
-       trace_bucket_alloc_fail(ca, reserve);
-       return ERR_PTR(-FREELIST_EMPTY);
-out:
-       verify_not_on_freelist(c, ca, b);
-
        ob = bch2_open_bucket_alloc(c);
 
        spin_lock(&ob->lock);
@@ -257,8 +231,8 @@ out:
        ob->sectors_free = ca->mi.bucket_size;
        ob->alloc_reserve = reserve;
        ob->dev         = ca->dev_idx;
-       ob->gen         = *bucket_gen(ca, b);
-       ob->bucket      = b;
+       ob->gen         = a.gen;
+       ob->bucket      = a.bucket;
        spin_unlock(&ob->lock);
 
        ca->nr_open_buckets++;
@@ -280,12 +254,246 @@ out:
 
        spin_unlock(&c->freelist_lock);
 
-       bch2_wake_allocator(ca);
-
        trace_bucket_alloc(ca, reserve);
        return ob;
 }
 
+static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca,
+                                           enum alloc_reserve reserve, u64 free_entry,
+                                           size_t *need_journal_commit,
+                                           struct closure *cl)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct open_bucket *ob;
+       struct bkey_alloc_unpacked a;
+       u64 b = free_entry & ~(~0ULL << 56);
+       unsigned genbits = free_entry >> 56;
+       struct printbuf buf = PRINTBUF;
+       int ret;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(ca->dev_idx, b), BTREE_ITER_CACHED);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret) {
+               ob = ERR_PTR(ret);
+               goto err;
+       }
+
+       a = bch2_alloc_unpack(k);
+
+       if (bch2_fs_inconsistent_on(bucket_state(a) != BUCKET_free, c,
+                       "non free bucket in freespace btree (state %s)\n"
+                       "  %s\n"
+                       "  at %llu (genbits %u)",
+                       bch2_bucket_states[bucket_state(a)],
+                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf),
+                       free_entry, genbits)) {
+               ob = ERR_PTR(-EIO);
+               goto err;
+       }
+
+       if (bch2_fs_inconsistent_on(genbits != (alloc_freespace_genbits(a) >> 56), c,
+                       "bucket in freespace btree with wrong genbits (got %u should be %llu)\n"
+                       "  %s",
+                       genbits, alloc_freespace_genbits(a) >> 56,
+                       (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+               ob = ERR_PTR(-EIO);
+               goto err;
+       }
+
+       if (bch2_fs_inconsistent_on(b < ca->mi.first_bucket || b >= ca->mi.nbuckets, c,
+                       "freespace btree has bucket outside allowed range (got %llu, valid %u-%llu)",
+                       b, ca->mi.first_bucket, ca->mi.nbuckets)) {
+               ob = ERR_PTR(-EIO);
+               goto err;
+       }
+
+       ob = __try_alloc_bucket(c, ca, reserve, a, need_journal_commit, cl);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       printbuf_exit(&buf);
+       return ob;
+}
+
+static struct open_bucket *try_alloc_partial_bucket(struct bch_fs *c, struct bch_dev *ca,
+                                                   enum alloc_reserve reserve)
+{
+       struct open_bucket *ob;
+       int i;
+
+       spin_lock(&c->freelist_lock);
+
+       for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
+               ob = c->open_buckets + ca->open_buckets_partial[i];
+
+               if (reserve <= ob->alloc_reserve) {
+                       array_remove_item(ca->open_buckets_partial,
+                                         ca->open_buckets_partial_nr,
+                                         i);
+                       ob->on_partial_list = false;
+                       ob->alloc_reserve = reserve;
+                       spin_unlock(&c->freelist_lock);
+                       return ob;
+               }
+       }
+
+       spin_unlock(&c->freelist_lock);
+       return NULL;
+}
+
+/*
+ * This path is for before the freespace btree is initialized:
+ *
+ * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock &
+ * journal buckets - journal buckets will be < ca->new_fs_bucket_idx
+ */
+static noinline struct open_bucket *
+bch2_bucket_alloc_trans_early(struct btree_trans *trans,
+                             struct bch_dev *ca,
+                             enum alloc_reserve reserve,
+                             u64 *b,
+                             size_t *need_journal_commit,
+                             struct closure *cl)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct open_bucket *ob = NULL;
+       int ret;
+
+       *b = max_t(u64, *b, ca->mi.first_bucket);
+       *b = max_t(u64, *b, ca->new_fs_bucket_idx);
+
+       for_each_btree_key(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, *b),
+                          BTREE_ITER_SLOTS, k, ret) {
+               struct bkey_alloc_unpacked a;
+
+               if (bkey_cmp(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0)
+                       break;
+
+               if (ca->new_fs_bucket_idx &&
+                   is_superblock_bucket(ca, k.k->p.offset))
+                       continue;
+
+               a = bch2_alloc_unpack(k);
+
+               if (bucket_state(a) != BUCKET_free)
+                       continue;
+
+               ob = __try_alloc_bucket(trans->c, ca, reserve, a,
+                                       need_journal_commit, cl);
+               if (ob)
+                       break;
+       }
+       bch2_trans_iter_exit(trans, &iter);
+
+       *b = iter.pos.offset;
+
+       return ob ?: ERR_PTR(ret ?: -FREELIST_EMPTY);
+}
+
+static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans,
+                                                  struct bch_dev *ca,
+                                                  enum alloc_reserve reserve,
+                                                  u64 *b,
+                                                  size_t *need_journal_commit,
+                                                  struct closure *cl)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct open_bucket *ob = NULL;
+       int ret;
+
+       if (unlikely(!ca->mi.freespace_initialized))
+               return bch2_bucket_alloc_trans_early(trans, ca, reserve, b,
+                                                    need_journal_commit, cl);
+
+       BUG_ON(ca->new_fs_bucket_idx);
+
+       for_each_btree_key(trans, iter, BTREE_ID_freespace,
+                          POS(ca->dev_idx, *b), 0, k, ret) {
+               if (k.k->p.inode != ca->dev_idx)
+                       break;
+
+               for (*b = max(*b, bkey_start_offset(k.k));
+                    *b != k.k->p.offset && !ob;
+                    (*b)++) {
+                       if (btree_trans_too_many_iters(trans)) {
+                               ob = ERR_PTR(-EINTR);
+                               break;
+                       }
+
+                       ob = try_alloc_bucket(trans, ca, reserve, *b,
+                                             need_journal_commit, cl);
+               }
+               if (ob)
+                       break;
+       }
+       bch2_trans_iter_exit(trans, &iter);
+
+       return ob ?: ERR_PTR(ret);
+}
+
+/**
+ * bch2_bucket_alloc - allocate a single bucket from a specific device
+ *
+ * Returns an open_bucket on success, or an ERR_PTR() on failure
+ */
+struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
+                                     enum alloc_reserve reserve,
+                                     bool may_alloc_partial,
+                                     struct closure *cl)
+{
+       struct open_bucket *ob = NULL;
+       size_t need_journal_commit = 0;
+       u64 avail = dev_buckets_available(ca, reserve);
+       u64 b = 0;
+       int ret;
+
+       if (may_alloc_partial) {
+               ob = try_alloc_partial_bucket(c, ca, reserve);
+               if (ob)
+                       return ob;
+       }
+again:
+       if (!avail) {
+               if (cl) {
+                       closure_wait(&c->freelist_wait, cl);
+                       /* recheck after putting ourselves on the waitlist */
+                       avail = dev_buckets_available(ca, reserve);
+                       if (avail) {
+                               closure_wake_up(&c->freelist_wait);
+                               goto again;
+                       }
+               }
+
+               if (!c->blocked_allocate)
+                       c->blocked_allocate = local_clock();
+
+               ob = ERR_PTR(-FREELIST_EMPTY);
+               goto err;
+       }
+
+       ret = bch2_trans_do(c, NULL, NULL, 0,
+                       PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans,
+                                                       ca, reserve, &b,
+                                                       &need_journal_commit, cl)));
+
+       if (need_journal_commit * 2 > avail)
+               bch2_journal_flush_async(&c->journal, NULL);
+err:
+       if (!ob)
+               ob = ERR_PTR(ret ?: -FREELIST_EMPTY);
+
+       if (ob == ERR_PTR(-FREELIST_EMPTY)) {
+               trace_bucket_alloc_fail(ca, reserve, avail, need_journal_commit);
+               atomic_long_inc(&c->bucket_alloc_fail);
+       }
+
+       return ob;
+}
+
 static int __dev_stripe_cmp(struct dev_stripe_state *stripe,
                            unsigned l, unsigned r)
 {
@@ -313,7 +521,7 @@ void bch2_dev_stripe_increment(struct bch_dev *ca,
                               struct dev_stripe_state *stripe)
 {
        u64 *v = stripe->next_alloc + ca->dev_idx;
-       u64 free_space = dev_buckets_available(ca);
+       u64 free_space = dev_buckets_available(ca, RESERVE_NONE);
        u64 free_space_inv = free_space
                ? div64_u64(1ULL << 48, free_space)
                : 1ULL << 48;
@@ -364,6 +572,7 @@ int bch2_bucket_alloc_set(struct bch_fs *c,
 {
        struct dev_alloc_list devs_sorted =
                bch2_dev_alloc_list(c, stripe, devs_may_alloc);
+       unsigned dev;
        struct bch_dev *ca;
        int ret = -INSUFFICIENT_DEVICES;
        unsigned i;
@@ -373,30 +582,43 @@ int bch2_bucket_alloc_set(struct bch_fs *c,
        for (i = 0; i < devs_sorted.nr; i++) {
                struct open_bucket *ob;
 
-               ca = rcu_dereference(c->devs[devs_sorted.devs[i]]);
+               dev = devs_sorted.devs[i];
+
+               rcu_read_lock();
+               ca = rcu_dereference(c->devs[dev]);
+               if (ca)
+                       percpu_ref_get(&ca->ref);
+               rcu_read_unlock();
+
                if (!ca)
                        continue;
 
-               if (!ca->mi.durability && *have_cache)
+               if (!ca->mi.durability && *have_cache) {
+                       percpu_ref_put(&ca->ref);
                        continue;
+               }
 
                ob = bch2_bucket_alloc(c, ca, reserve,
                                flags & BUCKET_MAY_ALLOC_PARTIAL, cl);
+               if (!IS_ERR(ob))
+                       bch2_dev_stripe_increment(ca, stripe);
+               percpu_ref_put(&ca->ref);
+
                if (IS_ERR(ob)) {
                        ret = PTR_ERR(ob);
 
                        if (cl)
-                               return ret;
+                               break;
                        continue;
                }
 
                add_new_bucket(c, ptrs, devs_may_alloc,
                               nr_effective, have_cache, flags, ob);
 
-               bch2_dev_stripe_increment(ca, stripe);
-
-               if (*nr_effective >= nr_replicas)
-                       return 0;
+               if (*nr_effective >= nr_replicas) {
+                       ret = 0;
+                       break;
+               }
        }
 
        return ret;
@@ -564,9 +786,6 @@ static int open_bucket_add_buckets(struct bch_fs *c,
        if (*nr_effective >= nr_replicas)
                return 0;
 
-       percpu_down_read(&c->mark_lock);
-       rcu_read_lock();
-
 retry_blocking:
        /*
         * Try nonblocking first, so that if one device is full we'll try from
@@ -580,9 +799,6 @@ retry_blocking:
                goto retry_blocking;
        }
 
-       rcu_read_unlock();
-       percpu_up_read(&c->mark_lock);
-
        return ret;
 }
 
@@ -863,7 +1079,7 @@ err:
        case -INSUFFICIENT_DEVICES:
                return ERR_PTR(-EROFS);
        default:
-               BUG();
+               return ERR_PTR(ret);
        }
 }
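
In try_alloc_bucket() above, each offset read from the freespace btree packs the bucket number into the low 56 bits and the bucket's generation bits into the top 8 bits; the genbits are then cross-checked against the alloc key so that stale freespace entries are reported as inconsistencies instead of being handed out. A minimal standalone sketch of that packing, using hypothetical helper names rather than the bcachefs ones:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical helpers mirroring the unpacking in try_alloc_bucket():
 * bucket number in the low 56 bits, generation bits in the top 8. */
static uint64_t freespace_entry_pack(uint64_t bucket, unsigned genbits)
{
	assert(bucket < (1ULL << 56));
	assert(genbits < 256);
	return ((uint64_t) genbits << 56) | bucket;
}

static uint64_t freespace_entry_bucket(uint64_t entry)
{
	return entry & ~(~0ULL << 56);		/* low 56 bits */
}

static unsigned freespace_entry_genbits(uint64_t entry)
{
	return entry >> 56;			/* top 8 bits */
}

int main(void)
{
	uint64_t e = freespace_entry_pack(12345, 3);

	printf("bucket %llu genbits %u\n",
	       (unsigned long long) freespace_entry_bucket(e),
	       freespace_entry_genbits(e));
	return 0;
}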
 
index d466bda9afc8fdddb49f7b353c8c571b12f1fcf6..f51cec5e7cc12be3aa326715e6e56465a707655b 100644 (file)
@@ -115,6 +115,20 @@ static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucke
        return false;
 }
 
+static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket)
+{
+       bool ret;
+
+       if (bch2_bucket_is_open(c, dev, bucket))
+               return true;
+
+       spin_lock(&c->freelist_lock);
+       ret = bch2_bucket_is_open(c, dev, bucket);
+       spin_unlock(&c->freelist_lock);
+
+       return ret;
+}
+
 int bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
                      struct dev_stripe_state *, struct bch_devs_mask *,
                      unsigned, unsigned *, bool *, enum alloc_reserve,
index 409232e3d99800ef652ce6fcd8b2cf0a2e6476b9..22e1fbda9046658f29f2d554bb7893401f25534f 100644 (file)
 
 struct ec_bucket_buf;
 
-#define ALLOC_THREAD_STATES()          \
-       x(stopped)                      \
-       x(running)                      \
-       x(blocked)                      \
-       x(blocked_full)
-
-enum allocator_states {
-#define x(n)   ALLOCATOR_##n,
-       ALLOC_THREAD_STATES()
-#undef x
-};
-
 enum alloc_reserve {
        RESERVE_BTREE_MOVINGGC  = -2,
        RESERVE_BTREE           = -1,
@@ -30,8 +18,6 @@ enum alloc_reserve {
        RESERVE_NR              = 2,
 };
 
-typedef FIFO(long)     alloc_fifo;
-
 #define OPEN_BUCKETS_COUNT     1024
 
 #define WRITE_POINT_HASH_NR    32
@@ -94,12 +80,4 @@ struct write_point_specifier {
        unsigned long           v;
 };
 
-struct alloc_heap_entry {
-       size_t                  bucket;
-       size_t                  nr;
-       unsigned long           key;
-};
-
-typedef HEAP(struct alloc_heap_entry) alloc_heap;
-
 #endif /* _BCACHEFS_ALLOC_TYPES_H */
index 211fd5adf9e3031a0cb94e98dea4615091656024..a4ef9aabf274a46016e0f43797a74639b4188602 100644 (file)
@@ -391,6 +391,9 @@ enum gc_phase {
        GC_PHASE_BTREE_reflink,
        GC_PHASE_BTREE_subvolumes,
        GC_PHASE_BTREE_snapshots,
+       GC_PHASE_BTREE_lru,
+       GC_PHASE_BTREE_freespace,
+       GC_PHASE_BTREE_need_discard,
 
        GC_PHASE_PENDING_DELETE,
 };
@@ -447,7 +450,7 @@ struct bch_dev {
         * gc_lock, for device resize - holding any is sufficient for access:
         * Or rcu_read_lock(), but only for ptr_stale():
         */
-       struct bucket_array __rcu *buckets[2];
+       struct bucket_array __rcu *buckets_gc;
        struct bucket_gens __rcu *bucket_gens;
        u8                      *oldest_gen;
        unsigned long           *buckets_nouse;
@@ -459,34 +462,17 @@ struct bch_dev {
 
        /* Allocator: */
        u64                     new_fs_bucket_idx;
-       struct task_struct __rcu *alloc_thread;
 
-       /*
-        * free: Buckets that are ready to be used
-        *
-        * free_inc: Incoming buckets - these are buckets that currently have
-        * cached data in them, and we can't reuse them until after we write
-        * their new gen to disk. After prio_write() finishes writing the new
-        * gens/prios, they'll be moved to the free list (and possibly discarded
-        * in the process)
-        */
-       alloc_fifo              free[RESERVE_NR];
-       alloc_fifo              free_inc;
        unsigned                nr_open_buckets;
+       unsigned                nr_btree_reserve;
 
        open_bucket_idx_t       open_buckets_partial[OPEN_BUCKETS_COUNT];
        open_bucket_idx_t       open_buckets_partial_nr;
 
-       size_t                  fifo_last_bucket;
-
        size_t                  inc_gen_needs_gc;
        size_t                  inc_gen_really_needs_gc;
        size_t                  buckets_waiting_on_journal;
 
-       enum allocator_states   allocator_state;
-
-       alloc_heap              alloc_heap;
-
        atomic64_t              rebalance_work;
 
        struct journal_device   journal;
@@ -508,8 +494,6 @@ struct bch_dev {
 enum {
        /* startup: */
        BCH_FS_ALLOC_CLEAN,
-       BCH_FS_ALLOCATOR_RUNNING,
-       BCH_FS_ALLOCATOR_STOPPING,
        BCH_FS_INITIAL_GC_DONE,
        BCH_FS_INITIAL_GC_UNFIXED,
        BCH_FS_TOPOLOGY_REPAIR_DONE,
@@ -773,6 +757,8 @@ struct bch_fs {
        unsigned                write_points_nr;
 
        struct buckets_waiting_for_journal buckets_waiting_for_journal;
+       struct work_struct      discard_work;
+       struct work_struct      invalidate_work;
 
        /* GARBAGE COLLECTION */
        struct task_struct      *gc_thread;
@@ -911,6 +897,7 @@ struct bch_fs {
        atomic_long_t           read_realloc_races;
        atomic_long_t           extent_migrate_done;
        atomic_long_t           extent_migrate_raced;
+       atomic_long_t           bucket_alloc_fail;
 
        unsigned                btree_gc_periodic:1;
        unsigned                copy_gc_enabled:1;
index 5153f0e420541c1b8a03e4ebf76501a7c87c2b6a..bb54ac175b69daabefa11f9771aee9c87e2bbb9b 100644 (file)
@@ -347,7 +347,9 @@ static inline void bkey_init(struct bkey *k)
        x(subvolume,            21)                     \
        x(snapshot,             22)                     \
        x(inode_v2,             23)                     \
-       x(alloc_v3,             24)
+       x(alloc_v3,             24)                     \
+       x(set,                  25)                     \
+       x(lru,                  26)
 
 enum bch_bkey_type {
 #define x(name, nr) KEY_TYPE_##name    = nr,
@@ -377,6 +379,10 @@ struct bch_hash_whiteout {
        struct bch_val          v;
 };
 
+struct bch_set {
+       struct bch_val          v;
+};
+
 /* Extents */
 
 /*
@@ -877,8 +883,8 @@ struct bch_alloc_v2 {
 #define BCH_ALLOC_FIELDS_V2()                  \
        x(read_time,            64)             \
        x(write_time,           64)             \
-       x(dirty_sectors,        16)             \
-       x(cached_sectors,       16)             \
+       x(dirty_sectors,        32)             \
+       x(cached_sectors,       32)             \
        x(stripe,               32)             \
        x(stripe_redundancy,    8)
 
@@ -893,11 +899,13 @@ struct bch_alloc_v3 {
        __u8                    data[];
 } __attribute__((packed, aligned(8)));
 
+LE32_BITMASK(BCH_ALLOC_NEED_DISCARD,struct bch_alloc_v3, flags,  0,  1)
+LE32_BITMASK(BCH_ALLOC_NEED_INC_GEN,struct bch_alloc_v3, flags,  1,  2)
+
 enum {
 #define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
        BCH_ALLOC_FIELDS_V1()
 #undef x
-       BCH_ALLOC_FIELD_NR
 };
 
 /* Quotas: */
@@ -1015,6 +1023,15 @@ LE32_BITMASK(BCH_SNAPSHOT_DELETED,       struct bch_snapshot, flags,  0,  1)
 /* True if a subvolume points to this snapshot node: */
 LE32_BITMASK(BCH_SNAPSHOT_SUBVOL,      struct bch_snapshot, flags,  1,  2)
 
+/* LRU btree: */
+
+struct bch_lru {
+       struct bch_val          v;
+       __le64                  idx;
+} __attribute__((packed, aligned(8)));
+
+#define LRU_ID_STRIPES         (1U << 16)
+
 /* Optional/variable size superblock sections: */
 
 struct bch_sb_field {
@@ -1023,16 +1040,17 @@ struct bch_sb_field {
        __le32                  type;
 };
 
-#define BCH_SB_FIELDS()                \
-       x(journal,      0)      \
-       x(members,      1)      \
-       x(crypt,        2)      \
-       x(replicas_v0,  3)      \
-       x(quota,        4)      \
-       x(disk_groups,  5)      \
-       x(clean,        6)      \
-       x(replicas,     7)      \
-       x(journal_seq_blacklist, 8)
+#define BCH_SB_FIELDS()                                \
+       x(journal,      0)                      \
+       x(members,      1)                      \
+       x(crypt,        2)                      \
+       x(replicas_v0,  3)                      \
+       x(quota,        4)                      \
+       x(disk_groups,  5)                      \
+       x(clean,        6)                      \
+       x(replicas,     7)                      \
+       x(journal_seq_blacklist, 8)             \
+       x(journal_v2,   9)
 
 enum bch_sb_field_type {
 #define x(f, nr)       BCH_SB_FIELD_##f = nr,
@@ -1041,6 +1059,14 @@ enum bch_sb_field_type {
        BCH_SB_FIELD_NR
 };
 
+/*
+ * Most superblock fields are replicated in all devices' superblocks - a few are
+ * not:
+ */
+#define BCH_SINGLE_DEVICE_SB_FIELDS            \
+       ((1U << BCH_SB_FIELD_journal)|          \
+        (1U << BCH_SB_FIELD_journal_v2))
+
 /* BCH_SB_FIELD_journal: */
 
 struct bch_sb_field_journal {
@@ -1048,6 +1074,15 @@ struct bch_sb_field_journal {
        __le64                  buckets[0];
 };
 
+struct bch_sb_field_journal_v2 {
+       struct bch_sb_field     field;
+
+       struct bch_sb_field_journal_v2_entry {
+               __le64          start;
+               __le64          nr;
+       }                       d[0];
+};
+
 /* BCH_SB_FIELD_members: */
 
 #define BCH_MIN_NR_NBUCKETS    (1 << 6)
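
Where the original journal field stores one __le64 per journal bucket, bch_sb_field_journal_v2 stores (start, nr) pairs; assuming, as the field names suggest, that each entry describes nr contiguous buckets starting at start, the new field is a run-length encoding of the same list. A standalone sketch (not bcachefs code) expanding such ranges back into a flat bucket list:

#include <stdint.h>
#include <stdio.h>

struct journal_v2_entry {
	uint64_t start;
	uint64_t nr;
};

/* Expand (start, nr) ranges into the flat bucket list the v1 field would
 * have stored; returns the number of buckets written. */
static unsigned expand_ranges(const struct journal_v2_entry *d, unsigned nr_entries,
			      uint64_t *out, unsigned out_max)
{
	unsigned n = 0;

	for (unsigned i = 0; i < nr_entries; i++)
		for (uint64_t j = 0; j < d[i].nr && n < out_max; j++)
			out[n++] = d[i].start + j;
	return n;
}

int main(void)
{
	struct journal_v2_entry d[] = {
		{ .start = 128, .nr = 4 },	/* buckets 128..131 */
		{ .start = 512, .nr = 2 },	/* buckets 512..513 */
	};
	uint64_t buckets[16];
	unsigned n = expand_ranges(d, 2, buckets, 16);

	for (unsigned i = 0; i < n; i++)
		printf("%llu\n", (unsigned long long) buckets[i]);
	return 0;
}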
@@ -1069,6 +1104,8 @@ LE64_BITMASK(BCH_MEMBER_DISCARD,  struct bch_member, flags[0], 14, 15)
 LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED,  struct bch_member, flags[0], 15, 20)
 LE64_BITMASK(BCH_MEMBER_GROUP,         struct bch_member, flags[0], 20, 28)
 LE64_BITMASK(BCH_MEMBER_DURABILITY,    struct bch_member, flags[0], 28, 30)
+LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
+                                       struct bch_member, flags[0], 30, 31)
 
 #if 0
 LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS,        struct bch_member, flags[1], 0,  20);
@@ -1287,7 +1324,8 @@ enum bcachefs_metadata_version {
        bcachefs_metadata_version_reflink_p_fix         = 16,
        bcachefs_metadata_version_subvol_dirent         = 17,
        bcachefs_metadata_version_inode_v2              = 18,
-       bcachefs_metadata_version_max                   = 19,
+       bcachefs_metadata_version_freespace             = 19,
+       bcachefs_metadata_version_max                   = 20,
 };
 
 #define bcachefs_metadata_version_current      (bcachefs_metadata_version_max - 1)
@@ -1804,7 +1842,10 @@ LE32_BITMASK(JSET_NO_FLUSH,      struct jset, flags, 5, 6);
        x(stripes,      6)                      \
        x(reflink,      7)                      \
        x(subvolumes,   8)                      \
-       x(snapshots,    9)
+       x(snapshots,    9)                      \
+       x(lru,          10)                     \
+       x(freespace,    11)                     \
+       x(need_discard, 12)
 
 enum btree_id {
 #define x(kwd, val) BTREE_ID_##kwd = val,
index e83aeb683a0977c84f82b0e4559c1ccc1e2d3194..3c1bf3310d999e5217144702272f136bfe8cddad 100644 (file)
@@ -9,6 +9,7 @@
 #include "error.h"
 #include "extents.h"
 #include "inode.h"
+#include "lru.h"
 #include "quota.h"
 #include "reflink.h"
 #include "subvolume.h"
@@ -85,6 +86,24 @@ static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
        .val_to_text    = key_type_inline_data_to_text, \
 }
 
+static const char *key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+       if (bkey_val_bytes(k.k))
+               return "nonempty value";
+       return NULL;
+}
+
+static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
+{
+       bch2_key_resize(l.k, l.k->size + r.k->size);
+       return true;
+}
+
+#define bch2_bkey_ops_set (struct bkey_ops) {          \
+       .key_invalid    = key_type_set_invalid,         \
+       .key_merge      = key_type_set_merge,           \
+}
+
 const struct bkey_ops bch2_bkey_ops[] = {
 #define x(name, nr) [KEY_TYPE_##name]  = bch2_bkey_ops_##name,
        BCH_BKEY_TYPES()
@@ -147,6 +166,15 @@ static unsigned bch2_key_types_allowed[] = {
        [BKEY_TYPE_snapshots] =
                (1U << KEY_TYPE_deleted)|
                (1U << KEY_TYPE_snapshot),
+       [BKEY_TYPE_lru] =
+               (1U << KEY_TYPE_deleted)|
+               (1U << KEY_TYPE_lru),
+       [BKEY_TYPE_freespace] =
+               (1U << KEY_TYPE_deleted)|
+               (1U << KEY_TYPE_set),
+       [BKEY_TYPE_need_discard] =
+               (1U << KEY_TYPE_deleted)|
+               (1U << KEY_TYPE_set),
        [BKEY_TYPE_btree] =
                (1U << KEY_TYPE_deleted)|
                (1U << KEY_TYPE_btree_ptr)|
index 73b947a493a2582541ef7287d303fcb3b58e5fc6..5c54a0ca681cadca29d256fd0d5d97de4cd2717d 100644 (file)
@@ -571,37 +571,37 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
                                (printbuf_reset(&buf),
                                 bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
                        if (!p.ptr.cached) {
-                               g->_mark.gen            = p.ptr.gen;
                                g->gen_valid            = true;
+                               g->gen                  = p.ptr.gen;
                        } else {
                                do_update = true;
                        }
                }
 
-               if (fsck_err_on(gen_cmp(p.ptr.gen, g->mark.gen) > 0, c,
+               if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, c,
                                "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
                                "while marking %s",
                                p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
                                bch2_data_types[ptr_data_type(k->k, &p.ptr)],
-                               p.ptr.gen, g->mark.gen,
+                               p.ptr.gen, g->gen,
                                (printbuf_reset(&buf),
                                 bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
                        if (!p.ptr.cached) {
-                               g->_mark.gen            = p.ptr.gen;
                                g->gen_valid            = true;
-                               g->_mark.data_type      = 0;
-                               g->_mark.dirty_sectors  = 0;
-                               g->_mark.cached_sectors = 0;
+                               g->gen                  = p.ptr.gen;
+                               g->data_type            = 0;
+                               g->dirty_sectors        = 0;
+                               g->cached_sectors       = 0;
                                set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
                        } else {
                                do_update = true;
                        }
                }
 
-               if (fsck_err_on(gen_cmp(g->mark.gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c,
+               if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, c,
                                "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
                                "while marking %s",
-                               p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->mark.gen,
+                               p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
                                bch2_data_types[ptr_data_type(k->k, &p.ptr)],
                                p.ptr.gen,
                                (printbuf_reset(&buf),
@@ -609,30 +609,30 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
                        do_update = true;
 
                if (fsck_err_on(!p.ptr.cached &&
-                               gen_cmp(p.ptr.gen, g->mark.gen) < 0, c,
+                               gen_cmp(p.ptr.gen, g->gen) < 0, c,
                                "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
                                "while marking %s",
                                p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
                                bch2_data_types[ptr_data_type(k->k, &p.ptr)],
-                               p.ptr.gen, g->mark.gen,
+                               p.ptr.gen, g->gen,
                                (printbuf_reset(&buf),
                                 bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
                        do_update = true;
 
-               if (data_type != BCH_DATA_btree && p.ptr.gen != g->mark.gen)
+               if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
                        continue;
 
-               if (fsck_err_on(g->mark.data_type &&
-                               g->mark.data_type != data_type, c,
+               if (fsck_err_on(g->data_type &&
+                               g->data_type != data_type, c,
                                "bucket %u:%zu different types of data in same bucket: %s, %s\n"
                                "while marking %s",
                                p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
-                               bch2_data_types[g->mark.data_type],
+                               bch2_data_types[g->data_type],
                                bch2_data_types[data_type],
                                (printbuf_reset(&buf),
                                 bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
                        if (data_type == BCH_DATA_btree) {
-                               g->_mark.data_type      = data_type;
+                               g->data_type    = data_type;
                                set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
                        } else {
                                do_update = true;
@@ -692,7 +692,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
                                struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
                                struct bucket *g = PTR_GC_BUCKET(ca, ptr);
 
-                               ptr->gen = g->mark.gen;
+                               ptr->gen = g->gen;
                        }
                } else {
                        bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
@@ -701,12 +701,12 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
                                enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr);
 
                                (ptr->cached &&
-                                (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) ||
+                                (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) ||
                                (!ptr->cached &&
-                                gen_cmp(ptr->gen, g->mark.gen) < 0) ||
-                               gen_cmp(g->mark.gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
-                               (g->mark.data_type &&
-                                g->mark.data_type != data_type);
+                                gen_cmp(ptr->gen, g->gen) < 0) ||
+                               gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
+                               (g->data_type &&
+                                g->data_type != data_type);
                        }));
 again:
                        ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
@@ -1163,10 +1163,10 @@ static void bch2_gc_free(struct bch_fs *c)
        genradix_free(&c->gc_stripes);
 
        for_each_member_device(ca, c, i) {
-               kvpfree(rcu_dereference_protected(ca->buckets[1], 1),
+               kvpfree(rcu_dereference_protected(ca->buckets_gc, 1),
                        sizeof(struct bucket_array) +
                        ca->mi.nbuckets * sizeof(struct bucket));
-               ca->buckets[1] = NULL;
+               ca->buckets_gc = NULL;
 
                free_percpu(ca->usage_gc);
                ca->usage_gc = NULL;
@@ -1295,7 +1295,7 @@ static int bch2_gc_start(struct bch_fs *c,
        }
 
        for_each_member_device(ca, c, i) {
-               BUG_ON(ca->buckets[1]);
+               BUG_ON(ca->buckets_gc);
                BUG_ON(ca->usage_gc);
 
                ca->usage_gc = alloc_percpu(struct bch_dev_usage);
@@ -1315,9 +1315,9 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
        struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
-       struct bucket *g;
+       struct bucket gc;
        struct bkey_s_c k;
-       struct bkey_alloc_unpacked old_u, new_u, gc_u;
+       struct bkey_alloc_unpacked old_u, new_u;
        struct bkey_alloc_buf *a;
        int ret;
 
@@ -1329,39 +1329,27 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
        old_u = new_u = bch2_alloc_unpack(k);
 
        percpu_down_read(&c->mark_lock);
-       g       = gc_bucket(ca, iter->pos.offset);
-       gc_u = (struct bkey_alloc_unpacked) {
-               .dev            = iter->pos.inode,
-               .bucket         = iter->pos.offset,
-               .gen            = g->mark.gen,
-               .data_type      = g->mark.data_type,
-               .dirty_sectors  = g->mark.dirty_sectors,
-               .cached_sectors = g->mark.cached_sectors,
-               .read_time      = g->io_time[READ],
-               .write_time     = g->io_time[WRITE],
-               .stripe         = g->stripe,
-               .stripe_redundancy = g->stripe_redundancy,
-       };
+       gc = *gc_bucket(ca, iter->pos.offset);
        percpu_up_read(&c->mark_lock);
 
        if (metadata_only &&
-           gc_u.data_type != BCH_DATA_sb &&
-           gc_u.data_type != BCH_DATA_journal &&
-           gc_u.data_type != BCH_DATA_btree)
+           gc.data_type != BCH_DATA_sb &&
+           gc.data_type != BCH_DATA_journal &&
+           gc.data_type != BCH_DATA_btree)
                return 0;
 
-       if (gen_after(old_u.gen, gc_u.gen))
+       if (gen_after(old_u.gen, gc.gen))
                return 0;
 
 #define copy_bucket_field(_f)                                          \
-       if (fsck_err_on(new_u._f != gc_u._f, c,                         \
+       if (fsck_err_on(new_u._f != gc._f, c,                           \
                        "bucket %llu:%llu gen %u data type %s has wrong " #_f   \
                        ": got %u, should be %u",                       \
                        iter->pos.inode, iter->pos.offset,              \
-                       new_u.gen,                                      \
-                       bch2_data_types[new_u.data_type],               \
-                       new_u._f, gc_u._f))                             \
-               new_u._f = gc_u._f;                                     \
+                       gc.gen,                                         \
+                       bch2_data_types[gc.data_type],                  \
+                       new_u._f, gc._f))                               \
+               new_u._f = gc._f;                                       \
 
        copy_bucket_field(gen);
        copy_bucket_field(data_type);
@@ -1379,7 +1367,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
        if (IS_ERR(a))
                return PTR_ERR(a);
 
-       ret = bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_NORUN);
+       ret = bch2_trans_update(trans, iter, &a->k, 0);
 fsck_err:
        return ret;
 }
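
bch2_alloc_write_key() above reconciles the alloc btree against the bucket state gc reconstructed: copy_bucket_field() reports any field that disagrees via fsck_err_on() and takes gc's value as the one to write back. A standalone illustration of that compare-report-repair shape (simplified types and a plain printf, not the kernel macro):

#include <stdio.h>

struct bucket_info {
	unsigned gen, data_type, dirty_sectors, cached_sectors;
};

/* Compare one field of the on-disk view against what gc computed, report
 * the mismatch, and take gc's value as authoritative. */
#define copy_field(dst, gc, _f)						\
do {									\
	if ((dst)._f != (gc)._f) {					\
		printf("wrong " #_f ": got %u, should be %u\n",		\
		       (dst)._f, (gc)._f);				\
		(dst)._f = (gc)._f;					\
	}								\
} while (0)

int main(void)
{
	struct bucket_info ondisk = { .gen = 3, .dirty_sectors = 100 };
	struct bucket_info gc     = { .gen = 3, .dirty_sectors = 96 };

	copy_field(ondisk, gc, gen);
	copy_field(ondisk, gc, data_type);
	copy_field(ondisk, gc, dirty_sectors);
	copy_field(ondisk, gc, cached_sectors);

	return 0;
}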
@@ -1426,7 +1414,13 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
 static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
 {
        struct bch_dev *ca;
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bucket *g;
+       struct bkey_alloc_unpacked u;
        unsigned i;
+       int ret;
 
        for_each_member_device(ca, c, i) {
                struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
@@ -1434,17 +1428,45 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
                                GFP_KERNEL|__GFP_ZERO);
                if (!buckets) {
                        percpu_ref_put(&ca->ref);
-                       percpu_up_write(&c->mark_lock);
                        bch_err(c, "error allocating ca->buckets[gc]");
                        return -ENOMEM;
                }
 
                buckets->first_bucket   = ca->mi.first_bucket;
                buckets->nbuckets       = ca->mi.nbuckets;
-               rcu_assign_pointer(ca->buckets[1], buckets);
+               rcu_assign_pointer(ca->buckets_gc, buckets);
        };
 
-       return bch2_alloc_read(c, true, metadata_only);
+       bch2_trans_init(&trans, c, 0, 0);
+
+       for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+                          BTREE_ITER_PREFETCH, k, ret) {
+               ca = bch_dev_bkey_exists(c, k.k->p.inode);
+               g = gc_bucket(ca, k.k->p.offset);
+               u = bch2_alloc_unpack(k);
+
+               g->gen_valid    = 1;
+               g->gen          = u.gen;
+
+               if (metadata_only &&
+                   (u.data_type == BCH_DATA_user ||
+                    u.data_type == BCH_DATA_cached ||
+                    u.data_type == BCH_DATA_parity)) {
+                       g->data_type            = u.data_type;
+                       g->dirty_sectors        = u.dirty_sectors;
+                       g->cached_sectors       = u.cached_sectors;
+                       g->stripe               = u.stripe;
+                       g->stripe_redundancy    = u.stripe_redundancy;
+               }
+       }
+       bch2_trans_iter_exit(&trans, &iter);
+
+       bch2_trans_exit(&trans);
+
+       if (ret)
+               bch_err(c, "error reading alloc info at gc start: %i", ret);
+
+       return ret;
 }
 
 static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only)
@@ -1453,17 +1475,17 @@ static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only)
        unsigned i;
 
        for_each_member_device(ca, c, i) {
-               struct bucket_array *buckets = __bucket_array(ca, true);
+               struct bucket_array *buckets = gc_bucket_array(ca);
                struct bucket *g;
 
                for_each_bucket(g, buckets) {
                        if (metadata_only &&
-                           (g->mark.data_type == BCH_DATA_user ||
-                            g->mark.data_type == BCH_DATA_cached ||
-                            g->mark.data_type == BCH_DATA_parity))
+                           (g->data_type == BCH_DATA_user ||
+                            g->data_type == BCH_DATA_cached ||
+                            g->data_type == BCH_DATA_parity))
                                continue;
-                       g->_mark.dirty_sectors = 0;
-                       g->_mark.cached_sectors = 0;
+                       g->dirty_sectors = 0;
+                       g->cached_sectors = 0;
                }
        };
 }
@@ -1673,9 +1695,8 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
  */
 int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
 {
-       struct bch_dev *ca;
        u64 start_time = local_clock();
-       unsigned i, iter = 0;
+       unsigned iter = 0;
        int ret;
 
        lockdep_assert_held(&c->state_lock);
@@ -1776,13 +1797,6 @@ out:
        trace_gc_end(c);
        bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
 
-       /*
-        * Wake up allocator in case it was waiting for buckets
-        * because of not being able to inc gens
-        */
-       for_each_member_device(ca, c, i)
-               bch2_wake_allocator(ca);
-
        /*
         * At startup, allocations can happen directly instead of via the
         * allocator thread - issue wakeup in case they blocked on gc_lock:
@@ -1891,7 +1905,7 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i
 
        u.oldest_gen = ca->oldest_gen[iter->pos.offset];
 
-       return bch2_alloc_write(trans, iter, &u, BTREE_TRIGGER_NORUN);
+       return bch2_alloc_write(trans, iter, &u, 0);
 }
 
 int bch2_gc_gens(struct bch_fs *c)
index e6cea4c687e125fa73afd55b2e3d172ab4f551c7..1df454f24b5441233055da5cf3969d9fa1f6fc69 100644 (file)
@@ -930,7 +930,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                                        "error decrypting btree node: %i", ret))
                                goto fsck_err;
 
-                       btree_err_on(btree_node_is_extents(b) &&
+                       btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
                                     !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data),
                                     BTREE_ERR_FATAL, c, NULL, b, NULL,
                                     "btree node does not have NEW_EXTENT_OVERWRITE set");
index 575635b5fa102187e27f81f36ac3e040d0b7ef67..788b9811148f4656a9ef596485e05ab5f3de4eab 100644 (file)
@@ -596,24 +596,9 @@ static inline enum btree_node_type btree_node_type(struct btree *b)
        return __btree_node_type(b->c.level, b->c.btree_id);
 }
 
-static inline bool btree_node_type_is_extents(enum btree_node_type type)
-{
-       switch (type) {
-       case BKEY_TYPE_extents:
-       case BKEY_TYPE_reflink:
-               return true;
-       default:
-               return false;
-       }
-}
-
-static inline bool btree_node_is_extents(struct btree *b)
-{
-       return btree_node_type_is_extents(btree_node_type(b));
-}
-
 #define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS             \
        ((1U << BKEY_TYPE_extents)|                     \
+        (1U << BKEY_TYPE_alloc)|                       \
         (1U << BKEY_TYPE_inodes)|                      \
         (1U << BKEY_TYPE_stripes)|                     \
         (1U << BKEY_TYPE_reflink)|                     \
@@ -629,6 +614,16 @@ static inline bool btree_node_is_extents(struct btree *b)
        (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS|            \
         BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
 
+#define BTREE_ID_IS_EXTENTS                            \
+       ((1U << BTREE_ID_extents)|                      \
+        (1U << BTREE_ID_reflink)|                      \
+        (1U << BTREE_ID_freespace))
+
+static inline bool btree_node_type_is_extents(enum btree_node_type type)
+{
+       return (1U << type) & BTREE_ID_IS_EXTENTS;
+}
+
 #define BTREE_ID_HAS_SNAPSHOTS                         \
        ((1U << BTREE_ID_extents)|                      \
         (1U << BTREE_ID_inodes)|                       \
index d52263759ee586d0d8c6e6a65ce6c062474bc3bb..fbce6cdf4cf8ac3f64f38d5932c278a38667ccca 100644 (file)
@@ -279,29 +279,24 @@ bch2_fs_usage_read_short(struct bch_fs *c)
        return ret;
 }
 
-static inline int is_unavailable_bucket(struct bucket_mark m)
+static inline int is_unavailable_bucket(struct bkey_alloc_unpacked a)
 {
-       return !is_available_bucket(m);
+       return a.dirty_sectors || a.stripe;
 }
 
 static inline int bucket_sectors_fragmented(struct bch_dev *ca,
-                                           struct bucket_mark m)
+                                           struct bkey_alloc_unpacked a)
 {
-       return m.dirty_sectors
-               ? max(0, (int) ca->mi.bucket_size - (int) m.dirty_sectors)
+       return a.dirty_sectors
+               ? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors)
                : 0;
 }
 
-static inline int is_stripe_data_bucket(struct bucket_mark m)
+static inline enum bch_data_type bucket_type(struct bkey_alloc_unpacked a)
 {
-       return m.stripe && m.data_type != BCH_DATA_parity;
-}
-
-static inline enum bch_data_type bucket_type(struct bucket_mark m)
-{
-       return m.cached_sectors && !m.dirty_sectors
+       return a.cached_sectors && !a.dirty_sectors
                ? BCH_DATA_cached
-               : m.data_type;
+               : a.data_type;
 }
 
 static inline void account_bucket(struct bch_fs_usage *fs_usage,
@@ -316,7 +311,8 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage,
 }
 
 static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
-                                 struct bucket_mark old, struct bucket_mark new,
+                                 struct bkey_alloc_unpacked old,
+                                 struct bkey_alloc_unpacked new,
                                  u64 journal_seq, bool gc)
 {
        struct bch_fs_usage *fs_usage;
@@ -347,9 +343,28 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
        u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);
 
        preempt_enable();
+}
+
+static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
+                                   struct bucket old, struct bucket new,
+                                   u64 journal_seq, bool gc)
+{
+       struct bkey_alloc_unpacked old_a = {
+               .gen            = old.gen,
+               .data_type      = old.data_type,
+               .dirty_sectors  = old.dirty_sectors,
+               .cached_sectors = old.cached_sectors,
+               .stripe         = old.stripe,
+       };
+       struct bkey_alloc_unpacked new_a = {
+               .gen            = new.gen,
+               .data_type      = new.data_type,
+               .dirty_sectors  = new.dirty_sectors,
+               .cached_sectors = new.cached_sectors,
+               .stripe         = new.stripe,
+       };
 
-       if (!is_available_bucket(old) && is_available_bucket(new))
-               bch2_wake_allocator(ca);
+       bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc);
 }
 
 static inline int __update_replicas(struct bch_fs *c,
@@ -484,19 +499,6 @@ static inline void update_cached_sectors_list(struct btree_trans *trans,
        update_replicas_list(trans, &r.e, sectors);
 }
 
-void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
-                           size_t b, bool owned_by_allocator)
-{
-       struct bucket *g = bucket(ca, b);
-       struct bucket_mark old, new;
-
-       old = bucket_cmpxchg(g, new, ({
-               new.owned_by_allocator  = owned_by_allocator;
-       }));
-
-       BUG_ON(owned_by_allocator == old.owned_by_allocator);
-}
-
 int bch2_mark_alloc(struct btree_trans *trans,
                    struct bkey_s_c old, struct bkey_s_c new,
                    unsigned flags)
@@ -507,8 +509,6 @@ int bch2_mark_alloc(struct btree_trans *trans,
        struct bkey_alloc_unpacked old_u = bch2_alloc_unpack(old);
        struct bkey_alloc_unpacked new_u = bch2_alloc_unpack(new);
        struct bch_dev *ca = bch_dev_bkey_exists(c, new_u.dev);
-       struct bucket *g;
-       struct bucket_mark old_m, m;
        int ret = 0;
 
        if (bch2_trans_inconsistent_on(new_u.bucket < ca->mi.first_bucket ||
@@ -555,28 +555,46 @@ int bch2_mark_alloc(struct btree_trans *trans,
                }
        }
 
+       if (!new_u.data_type &&
+           (!new_u.journal_seq || new_u.journal_seq < c->journal.flushed_seq_ondisk))
+               closure_wake_up(&c->freelist_wait);
+
+       if ((flags & BTREE_TRIGGER_INSERT) &&
+           new_u.need_discard &&
+           !new_u.journal_seq)
+               bch2_do_discards(c);
+
+       if (!old_u.data_type &&
+           new_u.data_type &&
+           should_invalidate_buckets(ca))
+               bch2_do_invalidates(c);
+
+       if (bucket_state(new_u) == BUCKET_need_gc_gens) {
+               atomic_inc(&c->kick_gc);
+               wake_up_process(c->gc_thread);
+       }
+
        percpu_down_read(&c->mark_lock);
        if (!gc && new_u.gen != old_u.gen)
                *bucket_gen(ca, new_u.bucket) = new_u.gen;
 
-       g = __bucket(ca, new_u.bucket, gc);
+       bch2_dev_usage_update(c, ca, old_u, new_u, journal_seq, gc);
+
+       if (gc) {
+               struct bucket *g = gc_bucket(ca, new_u.bucket);
 
-       old_m = bucket_cmpxchg(g, m, ({
-               m.gen                   = new_u.gen;
-               m.data_type             = new_u.data_type;
-               m.dirty_sectors         = new_u.dirty_sectors;
-               m.cached_sectors        = new_u.cached_sectors;
-               m.stripe                = new_u.stripe != 0;
-       }));
+               bucket_lock(g);
 
-       bch2_dev_usage_update(c, ca, old_m, m, journal_seq, gc);
+               g->gen_valid            = 1;
+               g->gen                  = new_u.gen;
+               g->data_type            = new_u.data_type;
+               g->stripe               = new_u.stripe;
+               g->stripe_redundancy    = new_u.stripe_redundancy;
+               g->dirty_sectors        = new_u.dirty_sectors;
+               g->cached_sectors       = new_u.cached_sectors;
 
-       g->io_time[READ]        = new_u.read_time;
-       g->io_time[WRITE]       = new_u.write_time;
-       g->oldest_gen           = new_u.oldest_gen;
-       g->gen_valid            = 1;
-       g->stripe               = new_u.stripe;
-       g->stripe_redundancy    = new_u.stripe_redundancy;
+               bucket_unlock(g);
+       }
        percpu_up_read(&c->mark_lock);
 
        /*
@@ -585,9 +603,9 @@ int bch2_mark_alloc(struct btree_trans *trans,
         */
 
        if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
-           old_m.cached_sectors) {
+           old_u.cached_sectors) {
                ret = update_cached_sectors(c, new, ca->dev_idx,
-                                           -old_m.cached_sectors,
+                                           -old_u.cached_sectors,
                                            journal_seq, gc);
                if (ret) {
                        bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors");
@@ -595,29 +613,18 @@ int bch2_mark_alloc(struct btree_trans *trans,
                }
 
                trace_invalidate(ca, bucket_to_sector(ca, new_u.bucket),
-                                old_m.cached_sectors);
+                                old_u.cached_sectors);
        }
 
        return 0;
 }
 
-#define checked_add(a, b)                                      \
-({                                                             \
-       unsigned _res = (unsigned) (a) + (b);                   \
-       bool overflow = _res > U16_MAX;                         \
-       if (overflow)                                           \
-               _res = U16_MAX;                                 \
-       (a) = _res;                                             \
-       overflow;                                               \
-})
-
 void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
                               size_t b, enum bch_data_type data_type,
                               unsigned sectors, struct gc_pos pos,
                               unsigned flags)
 {
-       struct bucket *g;
-       struct bucket_mark old, new;
+       struct bucket old, new, *g;
        bool overflow;
 
        BUG_ON(!(flags & BTREE_TRIGGER_GC));
@@ -632,10 +639,16 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
 
        percpu_down_read(&c->mark_lock);
        g = gc_bucket(ca, b);
-       old = bucket_cmpxchg(g, new, ({
-               new.data_type   = data_type;
-               overflow = checked_add(new.dirty_sectors, sectors);
-       }));
+
+       bucket_lock(g);
+       old = *g;
+
+       g->data_type = data_type;
+       g->dirty_sectors += sectors;
+       overflow = g->dirty_sectors < sectors;
+
+       new = *g;
+       bucket_unlock(g);
 
        bch2_fs_inconsistent_on(old.data_type &&
                                old.data_type != data_type, c,
@@ -649,7 +662,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
                bch2_data_types[old.data_type ?: data_type],
                old.dirty_sectors, sectors);
 
-       bch2_dev_usage_update(c, ca, old, new, 0, true);
+       bch2_dev_usage_update_m(c, ca, old, new, 0, true);
        percpu_up_read(&c->mark_lock);
 }
 
@@ -669,7 +682,7 @@ static int check_bucket_ref(struct bch_fs *c,
                            const struct bch_extent_ptr *ptr,
                            s64 sectors, enum bch_data_type ptr_data_type,
                            u8 b_gen, u8 bucket_data_type,
-                           u16 dirty_sectors, u16 cached_sectors)
+                           u32 dirty_sectors, u32 cached_sectors)
 {
        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
        size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
@@ -737,7 +750,7 @@ static int check_bucket_ref(struct bch_fs *c,
                goto err;
        }
 
-       if ((unsigned) (bucket_sectors + sectors) > U16_MAX) {
+       if ((unsigned) (bucket_sectors + sectors) > U32_MAX) {
                bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK,
                        "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n"
                        "while marking %s",
@@ -768,8 +781,7 @@ static int mark_stripe_bucket(struct btree_trans *trans,
        s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
        const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-       struct bucket *g;
-       struct bucket_mark new, old;
+       struct bucket old, new, *g;
        struct printbuf buf = PRINTBUF;
        int ret = 0;
 
@@ -781,34 +793,38 @@ static int mark_stripe_bucket(struct btree_trans *trans,
        buf.atomic++;
        g = PTR_GC_BUCKET(ca, ptr);
 
-       if (g->mark.dirty_sectors ||
+       if (g->dirty_sectors ||
            (g->stripe && g->stripe != k.k->p.offset)) {
                bch2_fs_inconsistent(c,
                              "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
-                             ptr->dev, PTR_BUCKET_NR(ca, ptr), g->mark.gen,
+                             ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen,
                              (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
                ret = -EINVAL;
                goto err;
        }
 
-       old = bucket_cmpxchg(g, new, ({
-               ret = check_bucket_ref(c, k, ptr, sectors, data_type,
-                                      new.gen, new.data_type,
-                                      new.dirty_sectors, new.cached_sectors);
-               if (ret)
-                       goto err;
+       bucket_lock(g);
+       old = *g;
 
-               new.dirty_sectors += sectors;
-               if (data_type)
-                       new.data_type           = data_type;
+       ret = check_bucket_ref(c, k, ptr, sectors, data_type,
+                              g->gen, g->data_type,
+                              g->dirty_sectors, g->cached_sectors);
+       if (ret) {
+               bucket_unlock(g);
+               goto err;
+       }
 
-               new.stripe = true;
-       }));
+       g->dirty_sectors += sectors;
+       if (data_type)
+               g->data_type = data_type;
 
        g->stripe               = k.k->p.offset;
        g->stripe_redundancy    = s->nr_redundant;
 
-       bch2_dev_usage_update(c, ca, old, new, journal_seq, true);
+       new = *g;
+       bucket_unlock(g);
+
+       bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
 err:
        percpu_up_read(&c->mark_lock);
        printbuf_exit(&buf);
@@ -820,9 +836,9 @@ static int __mark_pointer(struct btree_trans *trans,
                          const struct bch_extent_ptr *ptr,
                          s64 sectors, enum bch_data_type ptr_data_type,
                          u8 bucket_gen, u8 *bucket_data_type,
-                         u16 *dirty_sectors, u16 *cached_sectors)
+                         u32 *dirty_sectors, u32 *cached_sectors)
 {
-       u16 *dst_sectors = !ptr->cached
+       u32 *dst_sectors = !ptr->cached
                ? dirty_sectors
                : cached_sectors;
        int ret = check_bucket_ref(trans->c, k, ptr, sectors, ptr_data_type,
@@ -846,11 +862,9 @@ static int bch2_mark_pointer(struct btree_trans *trans,
 {
        u64 journal_seq = trans->journal_res.seq;
        struct bch_fs *c = trans->c;
-       struct bucket_mark old, new;
        struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
-       struct bucket *g;
+       struct bucket old, new, *g;
        u8 bucket_data_type;
-       u64 v;
        int ret = 0;
 
        BUG_ON(!(flags & BTREE_TRIGGER_GC));
@@ -858,30 +872,27 @@ static int bch2_mark_pointer(struct btree_trans *trans,
        percpu_down_read(&c->mark_lock);
        g = PTR_GC_BUCKET(ca, &p.ptr);
 
-       v = atomic64_read(&g->_mark.v);
-       do {
-               new.v.counter = old.v.counter = v;
-               bucket_data_type = new.data_type;
-
-               ret = __mark_pointer(trans, k, &p.ptr, sectors,
-                                    data_type, new.gen,
-                                    &bucket_data_type,
-                                    &new.dirty_sectors,
-                                    &new.cached_sectors);
-               if (ret)
-                       goto err;
+       bucket_lock(g);
+       old = *g;
 
-               new.data_type = bucket_data_type;
+       bucket_data_type = g->data_type;
 
-               if (flags & BTREE_TRIGGER_NOATOMIC) {
-                       g->_mark = new;
-                       break;
-               }
-       } while ((v = atomic64_cmpxchg(&g->_mark.v,
-                             old.v.counter,
-                             new.v.counter)) != old.v.counter);
+       ret = __mark_pointer(trans, k, &p.ptr, sectors,
+                            data_type, g->gen,
+                            &bucket_data_type,
+                            &g->dirty_sectors,
+                            &g->cached_sectors);
+       if (ret) {
+               bucket_unlock(g);
+               goto err;
+       }
+
+       g->data_type = bucket_data_type;
+
+       new = *g;
+       bucket_unlock(g);
 
-       bch2_dev_usage_update(c, ca, old, new, journal_seq, true);
+       bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
 err:
        percpu_up_read(&c->mark_lock);
 
@@ -2041,16 +2052,6 @@ recalculate:
 
 /* Startup/shutdown: */
 
-static void buckets_free_rcu(struct rcu_head *rcu)
-{
-       struct bucket_array *buckets =
-               container_of(rcu, struct bucket_array, rcu);
-
-       kvpfree(buckets,
-               sizeof(*buckets) +
-               buckets->nbuckets * sizeof(struct bucket));
-}
-
 static void bucket_gens_free_rcu(struct rcu_head *rcu)
 {
        struct bucket_gens *buckets =
@@ -2061,46 +2062,19 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu)
 
 int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 {
-       struct bucket_array *buckets = NULL, *old_buckets = NULL;
        struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL;
        unsigned long *buckets_nouse = NULL;
-       alloc_fifo      free[RESERVE_NR];
-       alloc_fifo      free_inc;
-       alloc_heap      alloc_heap;
-
-       size_t btree_reserve    = DIV_ROUND_UP(BTREE_NODE_RESERVE,
-                            ca->mi.bucket_size / btree_sectors(c));
-       /* XXX: these should be tunable */
-       size_t reserve_none     = max_t(size_t, 1, nbuckets >> 9);
-       size_t copygc_reserve   = max_t(size_t, 2, nbuckets >> 6);
-       size_t free_inc_nr      = max(max_t(size_t, 1, nbuckets >> 12),
-                                     btree_reserve * 2);
-       bool resize = ca->buckets[0] != NULL;
+       bool resize = ca->bucket_gens != NULL;
        int ret = -ENOMEM;
-       unsigned i;
 
-       memset(&free,           0, sizeof(free));
-       memset(&free_inc,       0, sizeof(free_inc));
-       memset(&alloc_heap,     0, sizeof(alloc_heap));
-
-       if (!(buckets           = kvpmalloc(sizeof(struct bucket_array) +
-                                           nbuckets * sizeof(struct bucket),
-                                           GFP_KERNEL|__GFP_ZERO)) ||
-           !(bucket_gens       = kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
+       if (!(bucket_gens       = kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
                                            GFP_KERNEL|__GFP_ZERO)) ||
            (c->opts.buckets_nouse &&
             !(buckets_nouse    = kvpmalloc(BITS_TO_LONGS(nbuckets) *
                                            sizeof(unsigned long),
-                                           GFP_KERNEL|__GFP_ZERO))) ||
-           !init_fifo(&free[RESERVE_MOVINGGC],
-                      copygc_reserve, GFP_KERNEL) ||
-           !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
-           !init_fifo(&free_inc,       free_inc_nr, GFP_KERNEL) ||
-           !init_heap(&alloc_heap,     ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL))
+                                           GFP_KERNEL|__GFP_ZERO))))
                goto err;
 
-       buckets->first_bucket   = ca->mi.first_bucket;
-       buckets->nbuckets       = nbuckets;
        bucket_gens->first_bucket = ca->mi.first_bucket;
        bucket_gens->nbuckets   = nbuckets;
 
@@ -2112,15 +2086,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
                percpu_down_write(&c->mark_lock);
        }
 
-       old_buckets = bucket_array(ca);
        old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1);
 
        if (resize) {
-               size_t n = min(buckets->nbuckets, old_buckets->nbuckets);
+               size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets);
 
-               memcpy(buckets->b,
-                      old_buckets->b,
-                      n * sizeof(struct bucket));
                memcpy(bucket_gens->b,
                       old_bucket_gens->b,
                       n);
@@ -2130,47 +2100,25 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
                               BITS_TO_LONGS(n) * sizeof(unsigned long));
        }
 
-       rcu_assign_pointer(ca->buckets[0], buckets);
        rcu_assign_pointer(ca->bucket_gens, bucket_gens);
-       buckets         = old_buckets;
        bucket_gens     = old_bucket_gens;
 
        swap(ca->buckets_nouse, buckets_nouse);
 
+       nbuckets = ca->mi.nbuckets;
+
        if (resize) {
                percpu_up_write(&c->mark_lock);
+               up_write(&ca->bucket_lock);
                up_write(&c->gc_lock);
        }
 
-       spin_lock(&c->freelist_lock);
-       for (i = 0; i < RESERVE_NR; i++) {
-               fifo_move(&free[i], &ca->free[i]);
-               swap(ca->free[i], free[i]);
-       }
-       fifo_move(&free_inc, &ca->free_inc);
-       swap(ca->free_inc, free_inc);
-       spin_unlock(&c->freelist_lock);
-
-       /* with gc lock held, alloc_heap can't be in use: */
-       swap(ca->alloc_heap, alloc_heap);
-
-       nbuckets = ca->mi.nbuckets;
-
-       if (resize)
-               up_write(&ca->bucket_lock);
-
        ret = 0;
 err:
-       free_heap(&alloc_heap);
-       free_fifo(&free_inc);
-       for (i = 0; i < RESERVE_NR; i++)
-               free_fifo(&free[i]);
        kvpfree(buckets_nouse,
                BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
        if (bucket_gens)
                call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
-       if (buckets)
-               call_rcu(&buckets->rcu, buckets_free_rcu);
 
        return ret;
 }
@@ -2179,17 +2127,10 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
 {
        unsigned i;
 
-       free_heap(&ca->alloc_heap);
-       free_fifo(&ca->free_inc);
-       for (i = 0; i < RESERVE_NR; i++)
-               free_fifo(&ca->free[i]);
        kvpfree(ca->buckets_nouse,
                BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
        kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
                sizeof(struct bucket_gens) + ca->mi.nbuckets);
-       kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
-               sizeof(struct bucket_array) +
-               ca->mi.nbuckets * sizeof(struct bucket));
 
        for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
                free_percpu(ca->usage[i]);
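
The hunks above replace the old bucket_cmpxchg()/checked_add() loop with a lock-protected read-modify-write: take the new per-bucket byte lock, snapshot the old state, update the now 32-bit sector counts, snapshot the new state, and drop the lock. Below is a minimal user-space sketch of the same pattern, using C11 atomics in place of the kernel's xchg()/smp_store_release(); the struct and function names are illustrative, not from the tree.

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdint.h>

	struct bucket_sketch {
		atomic_uchar	lock;		/* one-byte test-and-set lock */
		uint8_t		data_type;
		uint32_t	dirty_sectors;	/* now 32 bits wide */
	};

	static void sketch_lock(struct bucket_sketch *g)
	{
		/* analogue of bucket_lock(): spin on an exchange until we saw 0 */
		while (atomic_exchange_explicit(&g->lock, 1, memory_order_acquire))
			;
	}

	static void sketch_unlock(struct bucket_sketch *g)
	{
		/* analogue of bucket_unlock(): plain release store */
		atomic_store_explicit(&g->lock, 0, memory_order_release);
	}

	/* mirrors the locked update above; returns true if the counter wrapped */
	static bool sketch_mark_metadata(struct bucket_sketch *g, uint8_t data_type,
					 uint32_t sectors, uint32_t *old_sectors)
	{
		bool overflow;

		sketch_lock(g);
		*old_sectors = g->dirty_sectors;	/* "old" snapshot */

		g->data_type = data_type;
		g->dirty_sectors += sectors;
		/* unsigned addition wrapped iff the sum is now below an addend */
		overflow = g->dirty_sectors < sectors;

		sketch_unlock(g);
		return overflow;
	}
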
index 392e03d4c319e8039bf30c4eea4896c1f047b0e6..4a3d6bf1e3efa4a5a805982413343b9ecf73a474 100644 (file)
        for (_b = (_buckets)->b + (_buckets)->first_bucket;     \
             _b < (_buckets)->b + (_buckets)->nbuckets; _b++)
 
-#define bucket_cmpxchg(g, new, expr)                           \
-({                                                             \
-       struct bucket *_g = g;                                  \
-       u64 _v = atomic64_read(&(g)->_mark.v);                  \
-       struct bucket_mark _old;                                \
-                                                               \
-       do {                                                    \
-               (new).v.counter = _old.v.counter = _v;          \
-               expr;                                           \
-       } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v,         \
-                              _old.v.counter,                  \
-                              (new).v.counter)) != _old.v.counter);\
-       _old;                                                   \
-})
-
-static inline struct bucket_array *__bucket_array(struct bch_dev *ca,
-                                                 bool gc)
+static inline void bucket_unlock(struct bucket *b)
 {
-       return rcu_dereference_check(ca->buckets[gc],
-                                    !ca->fs ||
-                                    percpu_rwsem_is_held(&ca->fs->mark_lock) ||
-                                    lockdep_is_held(&ca->fs->gc_lock) ||
-                                    lockdep_is_held(&ca->bucket_lock));
+       smp_store_release(&b->lock, 0);
 }
 
-static inline struct bucket_array *bucket_array(struct bch_dev *ca)
+static inline void bucket_lock(struct bucket *b)
 {
-       return __bucket_array(ca, false);
+       while (xchg(&b->lock, 1))
+               cpu_relax();
 }
 
-static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc)
+static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca)
 {
-       struct bucket_array *buckets = __bucket_array(ca, gc);
-
-       BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
-       return buckets->b + b;
+       return rcu_dereference_check(ca->buckets_gc,
+                                    !ca->fs ||
+                                    percpu_rwsem_is_held(&ca->fs->mark_lock) ||
+                                    lockdep_is_held(&ca->fs->gc_lock) ||
+                                    lockdep_is_held(&ca->bucket_lock));
 }
 
 static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b)
 {
-       return __bucket(ca, b, true);
-}
+       struct bucket_array *buckets = gc_bucket_array(ca);
 
-static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
-{
-       return __bucket(ca, b, false);
+       BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets);
+       return buckets->b + b;
 }
 
 static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
@@ -70,7 +50,6 @@ static inline struct bucket_gens *bucket_gens(struct bch_dev *ca)
                                     percpu_rwsem_is_held(&ca->fs->mark_lock) ||
                                     lockdep_is_held(&ca->fs->gc_lock) ||
                                     lockdep_is_held(&ca->bucket_lock));
-
 }
 
 static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
@@ -81,16 +60,6 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
        return gens->b + b;
 }
 
-/*
- * bucket_gc_gen() returns the difference between the bucket's current gen and
- * the oldest gen of any pointer into that bucket in the btree.
- */
-
-static inline u8 bucket_gc_gen(struct bucket *g)
-{
-       return g->mark.gen - g->oldest_gen;
-}
-
 static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
                                   const struct bch_extent_ptr *ptr)
 {
@@ -141,62 +110,55 @@ static inline u8 ptr_stale(struct bch_dev *ca,
        return ret;
 }
 
-/* bucket gc marks */
-
-static inline bool is_available_bucket(struct bucket_mark mark)
-{
-       return !mark.dirty_sectors && !mark.stripe;
-}
-
 /* Device usage: */
 
 struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *);
 
 static inline u64 __dev_buckets_available(struct bch_dev *ca,
-                                         struct bch_dev_usage stats)
+                                         struct bch_dev_usage stats,
+                                         enum alloc_reserve reserve)
 {
-       u64 total = ca->mi.nbuckets - ca->mi.first_bucket;
+       s64 total = ca->mi.nbuckets - ca->mi.first_bucket;
+       s64 reserved = 0;
+
+       switch (reserve) {
+       case RESERVE_NONE:
+               reserved += ca->mi.nbuckets >> 6;
+               fallthrough;
+       case RESERVE_MOVINGGC:
+               reserved += ca->nr_btree_reserve;
+               fallthrough;
+       case RESERVE_BTREE:
+               reserved += ca->nr_btree_reserve;
+               fallthrough;
+       case RESERVE_BTREE_MOVINGGC:
+               break;
+       default:
+               BUG();
+       }
 
        if (WARN_ONCE(stats.buckets_unavailable > total,
                      "buckets_unavailable overflow (%llu > %llu)\n",
                      stats.buckets_unavailable, total))
                return 0;
 
-       return total - stats.buckets_unavailable;
-}
-
-static inline u64 dev_buckets_available(struct bch_dev *ca)
-{
-       return __dev_buckets_available(ca, bch2_dev_usage_read(ca));
-}
-
-static inline u64 __dev_buckets_reclaimable(struct bch_dev *ca,
-                                           struct bch_dev_usage stats)
-{
-       struct bch_fs *c = ca->fs;
-       s64 available = __dev_buckets_available(ca, stats);
-       unsigned i;
-
-       spin_lock(&c->freelist_lock);
-       for (i = 0; i < RESERVE_NR; i++)
-               available -= fifo_used(&ca->free[i]);
-       available -= fifo_used(&ca->free_inc);
-       available -= ca->nr_open_buckets;
-       spin_unlock(&c->freelist_lock);
-
-       return max(available, 0LL);
+       return max_t(s64, 0,
+                    total -
+                    stats.buckets_unavailable -
+                    ca->nr_open_buckets -
+                    reserved);
 }
 
-static inline u64 dev_buckets_reclaimable(struct bch_dev *ca)
+static inline u64 dev_buckets_available(struct bch_dev *ca,
+                                       enum alloc_reserve reserve)
 {
-       return __dev_buckets_reclaimable(ca, bch2_dev_usage_read(ca));
+       return __dev_buckets_available(ca, bch2_dev_usage_read(ca), reserve);
 }
 
 /* Filesystem usage: */
 
 static inline unsigned fs_usage_u64s(struct bch_fs *c)
 {
-
        return sizeof(struct bch_fs_usage) / sizeof(u64) +
                READ_ONCE(c->replicas.nr);
 }
@@ -224,7 +186,6 @@ bch2_fs_usage_read_short(struct bch_fs *);
 
 void bch2_fs_usage_initialize(struct bch_fs *);
 
-void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, size_t, bool);
 void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
                               size_t, enum bch_data_type, unsigned,
                               struct gc_pos, unsigned);
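
With the allocator-thread freelists gone, __dev_buckets_available() above subtracts a per-reserve waterline directly, and the fallthrough cases accumulate. As a rough worked example with hypothetical numbers: a device with 1 << 20 buckets and nr_btree_reserve = 512 would reserve 0 buckets for RESERVE_BTREE_MOVINGGC, 512 for RESERVE_BTREE, 1024 for RESERVE_MOVINGGC, and 16384 + 1024 = 17408 for RESERVE_NONE, on top of whatever is already unavailable or sitting in open buckets.
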
index 2c73dc60b838f08f42da26ded8c6b8a0358a4b20..e79a33795bf936144675bda12f2659f5b49c90aa 100644 (file)
@@ -7,32 +7,15 @@
 
 #define BUCKET_JOURNAL_SEQ_BITS                16
 
-struct bucket_mark {
-       union {
-       atomic64_t      v;
-
-       struct {
-       u8              gen;
-       u8              data_type:3,
-                       owned_by_allocator:1,
-                       stripe:1;
-       u16             dirty_sectors;
-       u16             cached_sectors;
-       };
-       };
-};
-
 struct bucket {
-       union {
-               struct bucket_mark      _mark;
-               const struct bucket_mark mark;
-       };
-
-       u64                             io_time[2];
-       u8                              oldest_gen;
-       unsigned                        gen_valid:1;
-       u8                              stripe_redundancy;
-       u32                             stripe;
+       u8                      lock;
+       u8                      gen_valid:1;
+       u8                      data_type:7;
+       u8                      gen;
+       u8                      stripe_redundancy;
+       u32                     stripe;
+       u32                     dirty_sectors;
+       u32                     cached_sectors;
 };
 
 struct bucket_array {
@@ -111,7 +94,7 @@ struct copygc_heap_entry {
        u8                      dev;
        u8                      gen;
        u8                      replicas;
-       u16                     fragmentation;
+       u32                     fragmentation;
        u32                     sectors;
        u64                     offset;
 };
index 58b2c96f450c9ba8a4787431665e5e42b7b9833f..2fd5d9672a44287b42a10acfcdc81c5865830c07 100644 (file)
@@ -15,17 +15,26 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
-       unsigned ret = 0;
+       unsigned ret = 0, lru = 0;
 
        bkey_extent_entry_for_each(ptrs, entry) {
                switch (__extent_entry_type(entry)) {
                case BCH_EXTENT_ENTRY_ptr:
+                       /* Might also be updating LRU btree */
+                       if (entry->ptr.cached)
+                               lru++;
+
+                       fallthrough;
                case BCH_EXTENT_ENTRY_stripe_ptr:
                        ret++;
                }
        }
 
-       return ret;
+       /*
+        * Updating keys in the alloc btree may also update keys in the
+        * freespace or discard btrees:
+        */
+       return lru + ret * 2;
 }
 
 static int count_iters_for_insert(struct btree_trans *trans,
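
As a worked example of the new reservation above: an extent with two pointers, one of them cached, plus one stripe pointer gives ret = 3 and lru = 1, so bch2_bkey_nr_alloc_ptrs() now returns 1 + 3 * 2 = 7 — one iterator per backing alloc/stripe update plus its possible freespace or need_discard update, plus one for moving the cached bucket in the LRU btree.
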
index eb556ecc511f987f0e1511c53148aa0c31652b20..340f0bed7391f11cbca3e850d471a37ccf88ae84 100644 (file)
@@ -15,8 +15,8 @@
 #include "journal.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
+#include "journal_sb.h"
 #include "journal_seq_blacklist.h"
-#include "super-io.h"
 
 #include <trace/events/bcachefs.h>
 
@@ -767,86 +767,75 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                                         bool new_fs, struct closure *cl)
 {
        struct bch_fs *c = ca->fs;
+       struct journal *j = &c->journal;
        struct journal_device *ja = &ca->journal;
-       struct bch_sb_field_journal *journal_buckets;
        u64 *new_bucket_seq = NULL, *new_buckets = NULL;
+       struct open_bucket **ob = NULL;
+       long *bu = NULL;
+       unsigned i, nr_got = 0, nr_want = nr - ja->nr;
+       unsigned old_nr                 = ja->nr;
+       unsigned old_discard_idx        = ja->discard_idx;
+       unsigned old_dirty_idx_ondisk   = ja->dirty_idx_ondisk;
+       unsigned old_dirty_idx          = ja->dirty_idx;
+       unsigned old_cur_idx            = ja->cur_idx;
        int ret = 0;
 
-       /* don't handle reducing nr of buckets yet: */
-       if (nr <= ja->nr)
-               return 0;
+       bch2_journal_block(j);
+       bch2_journal_flush_all_pins(j);
 
+       bu              = kzalloc(nr_want * sizeof(*bu), GFP_KERNEL);
+       ob              = kzalloc(nr_want * sizeof(*ob), GFP_KERNEL);
        new_buckets     = kzalloc(nr * sizeof(u64), GFP_KERNEL);
        new_bucket_seq  = kzalloc(nr * sizeof(u64), GFP_KERNEL);
-       if (!new_buckets || !new_bucket_seq) {
+       if (!bu || !ob || !new_buckets || !new_bucket_seq) {
                ret = -ENOMEM;
-               goto err;
+               goto err_unblock;
        }
 
-       journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
-                                       nr + sizeof(*journal_buckets) / sizeof(u64));
-       if (!journal_buckets) {
-               ret = -ENOSPC;
-               goto err;
+       for (nr_got = 0; nr_got < nr_want; nr_got++) {
+               if (new_fs) {
+                       bu[nr_got] = bch2_bucket_alloc_new_fs(ca);
+                       if (bu[nr_got] < 0) {
+                               ret = -ENOSPC;
+                               break;
+                       }
+               } else {
+                       ob[nr_got] = bch2_bucket_alloc(c, ca, RESERVE_NONE,
+                                              false, cl);
+                       if (IS_ERR(ob[nr_got])) {
+                               ret = cl ? -EAGAIN : -ENOSPC;
+                               break;
+                       }
+
+                       bu[nr_got] = ob[nr_got]->bucket;
+               }
        }
 
+       if (!nr_got)
+               goto err_unblock;
+
        /*
         * We may be called from the device add path, before the new device has
         * actually been added to the running filesystem:
         */
        if (!new_fs)
-               spin_lock(&c->journal.lock);
+               spin_lock(&j->lock);
 
        memcpy(new_buckets,     ja->buckets,    ja->nr * sizeof(u64));
        memcpy(new_bucket_seq,  ja->bucket_seq, ja->nr * sizeof(u64));
        swap(new_buckets,       ja->buckets);
        swap(new_bucket_seq,    ja->bucket_seq);
 
-       if (!new_fs)
-               spin_unlock(&c->journal.lock);
-
-       while (ja->nr < nr) {
-               struct open_bucket *ob = NULL;
-               unsigned pos;
-               long b;
-
-               if (new_fs) {
-                       b = bch2_bucket_alloc_new_fs(ca);
-                       if (b < 0) {
-                               ret = -ENOSPC;
-                               goto err;
-                       }
-               } else {
-                       rcu_read_lock();
-                       ob = bch2_bucket_alloc(c, ca, RESERVE_NONE,
-                                              false, cl);
-                       rcu_read_unlock();
-                       if (IS_ERR(ob)) {
-                               ret = cl ? -EAGAIN : -ENOSPC;
-                               goto err;
-                       }
-
-                       b = ob->bucket;
-               }
-
-               if (c)
-                       spin_lock(&c->journal.lock);
-
-               /*
-                * XXX
-                * For resize at runtime, we should be writing the new
-                * superblock before inserting into the journal array
-                */
+       for (i = 0; i < nr_got; i++) {
+               unsigned pos = ja->discard_idx ?: ja->nr;
+               long b = bu[i];
 
-               pos = ja->discard_idx ?: ja->nr;
                __array_insert_item(ja->buckets,                ja->nr, pos);
                __array_insert_item(ja->bucket_seq,             ja->nr, pos);
-               __array_insert_item(journal_buckets->buckets,   ja->nr, pos);
                ja->nr++;
 
                ja->buckets[pos] = b;
                ja->bucket_seq[pos] = 0;
-               journal_buckets->buckets[pos] = cpu_to_le64(b);
 
                if (pos <= ja->discard_idx)
                        ja->discard_idx = (ja->discard_idx + 1) % ja->nr;
@@ -856,29 +845,54 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                        ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr;
                if (pos <= ja->cur_idx)
                        ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
+       }
 
-               if (c)
-                       spin_unlock(&c->journal.lock);
+       ret = bch2_journal_buckets_to_sb(c, ca);
+       if (ret) {
+               /* Revert: */
+               swap(new_buckets,       ja->buckets);
+               swap(new_bucket_seq,    ja->bucket_seq);
+               ja->nr                  = old_nr;
+               ja->discard_idx         = old_discard_idx;
+               ja->dirty_idx_ondisk    = old_dirty_idx_ondisk;
+               ja->dirty_idx           = old_dirty_idx;
+               ja->cur_idx             = old_cur_idx;
+       }
 
-               if (!new_fs) {
+       if (!new_fs)
+               spin_unlock(&j->lock);
+
+       bch2_journal_unblock(j);
+
+       if (ret)
+               goto err;
+
+       if (!new_fs) {
+               for (i = 0; i < nr_got; i++) {
                        ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
                                bch2_trans_mark_metadata_bucket(&trans, ca,
-                                               b, BCH_DATA_journal,
+                                               bu[i], BCH_DATA_journal,
                                                ca->mi.bucket_size));
-
-                       bch2_open_bucket_put(c, ob);
-
-                       if (ret)
+                       if (ret) {
+                               bch2_fs_inconsistent(c, "error marking new journal buckets: %i", ret);
                                goto err;
+                       }
                }
        }
 err:
-       bch2_sb_resize_journal(&ca->disk_sb,
-               ja->nr + sizeof(*journal_buckets) / sizeof(u64));
+       if (ob && !new_fs)
+               for (i = 0; i < nr_got; i++)
+                       bch2_open_bucket_put(c, ob[i]);
+
        kfree(new_bucket_seq);
        kfree(new_buckets);
+       kfree(ob);
+       kfree(bu);
 
        return ret;
+err_unblock:
+       bch2_journal_unblock(j);
+       goto err;
 }
 
 /*
@@ -891,11 +905,15 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
        struct journal_device *ja = &ca->journal;
        struct closure cl;
        unsigned current_nr;
-       int ret;
+       int ret = 0;
+
+       /* don't handle reducing nr of buckets yet: */
+       if (nr < ja->nr)
+               return 0;
 
        closure_init_stack(&cl);
 
-       do {
+       while (ja->nr != nr && (ret == 0 || ret == -EAGAIN)) {
                struct disk_reservation disk_res = { 0, 0 };
 
                closure_sync(&cl);
@@ -923,7 +941,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
                if (ja->nr != current_nr)
                        bch2_write_super(c);
                mutex_unlock(&c->sb_lock);
-       } while (ret == -EAGAIN);
+       }
 
        return ret;
 }
@@ -1092,9 +1110,20 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
        struct journal_device *ja = &ca->journal;
        struct bch_sb_field_journal *journal_buckets =
                bch2_sb_get_journal(sb);
+       struct bch_sb_field_journal_v2 *journal_buckets_v2 =
+               bch2_sb_get_journal_v2(sb);
        unsigned i;
 
-       ja->nr = bch2_nr_journal_buckets(journal_buckets);
+       ja->nr = 0;
+
+       if (journal_buckets_v2) {
+               unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
+
+               for (i = 0; i < nr; i++)
+                       ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr);
+       } else if (journal_buckets) {
+               ja->nr = bch2_nr_journal_buckets(journal_buckets);
+       }
 
        ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
        if (!ja->bucket_seq)
@@ -1109,8 +1138,18 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
        if (!ja->buckets)
                return -ENOMEM;
 
-       for (i = 0; i < ja->nr; i++)
-               ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
+       if (journal_buckets_v2) {
+               unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
+               unsigned j, dst = 0;
+
+               for (i = 0; i < nr; i++)
+                       for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
+                               ja->buckets[dst++] =
+                                       le64_to_cpu(journal_buckets_v2->d[i].start) + j;
+       } else if (journal_buckets) {
+               for (i = 0; i < ja->nr; i++)
+                       ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
+       }
 
        return 0;
 }
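
bch2_dev_journal_init() above flattens the new v2 superblock format, where each entry is a (start, nr) run of consecutive buckets, back into the in-memory ja->buckets[] array. A small stand-alone sketch of that expansion, with illustrative names and the little-endian conversions omitted:

	#include <stddef.h>
	#include <stdint.h>

	struct journal_range_sketch { uint64_t start, nr; };

	/* expand (start, nr) runs into a flat list of bucket numbers */
	static size_t expand_journal_ranges(const struct journal_range_sketch *d,
					    size_t nr_entries, uint64_t *buckets)
	{
		size_t dst = 0;

		for (size_t i = 0; i < nr_entries; i++)
			for (uint64_t j = 0; j < d[i].nr; j++)
				buckets[dst++] = d[i].start + j;

		return dst;	/* equals the ja->nr computed from the same entries */
	}
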
index fb24ca212b09448e21c752e3454e9017852df270..bacb8058f60ad47c6a6dc591de2319accb71b144 100644 (file)
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
+#include "alloc_background.h"
 #include "alloc_foreground.h"
 #include "btree_io.h"
 #include "btree_update_interior.h"
@@ -1372,6 +1373,9 @@ static void journal_write_done(struct closure *cl)
                if (!JSET_NO_FLUSH(w->data)) {
                        j->flushed_seq_ondisk = seq;
                        j->last_seq_ondisk = w->last_seq;
+
+                       bch2_do_discards(c);
+                       closure_wake_up(&c->freelist_wait);
                }
        } else if (!j->err_seq || seq < j->err_seq)
                j->err_seq      = seq;
diff --git a/libbcachefs/journal_sb.c b/libbcachefs/journal_sb.c
new file mode 100644 (file)
index 0000000..0a8a007
--- /dev/null
@@ -0,0 +1,222 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "journal_sb.h"
+
+#include <linux/sort.h>
+
+/* BCH_SB_FIELD_journal: */
+
+static int u64_cmp(const void *_l, const void *_r)
+{
+       const u64 *l = _l;
+       const u64 *r = _r;
+
+       return cmp_int(*l, *r);
+}
+
+static int bch2_sb_journal_validate(struct bch_sb *sb,
+                                   struct bch_sb_field *f,
+                                   struct printbuf *err)
+{
+       struct bch_sb_field_journal *journal = field_to_type(f, journal);
+       struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
+       int ret = -EINVAL;
+       unsigned nr;
+       unsigned i;
+       u64 *b;
+
+       nr = bch2_nr_journal_buckets(journal);
+       if (!nr)
+               return 0;
+
+       b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL);
+       if (!b)
+               return -ENOMEM;
+
+       for (i = 0; i < nr; i++)
+               b[i] = le64_to_cpu(journal->buckets[i]);
+
+       sort(b, nr, sizeof(u64), u64_cmp, NULL);
+
+       if (!b[0]) {
+               pr_buf(err, "journal bucket at sector 0");
+               goto err;
+       }
+
+       if (b[0] < le16_to_cpu(m->first_bucket)) {
+               pr_buf(err, "journal bucket %llu before first bucket %u",
+                      b[0], le16_to_cpu(m->first_bucket));
+               goto err;
+       }
+
+       if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) {
+               pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)",
+                      b[nr - 1], le64_to_cpu(m->nbuckets));
+               goto err;
+       }
+
+       for (i = 0; i + 1 < nr; i++)
+               if (b[i] == b[i + 1]) {
+                       pr_buf(err, "duplicate journal buckets %llu", b[i]);
+                       goto err;
+               }
+
+       ret = 0;
+err:
+       kfree(b);
+       return ret;
+}
+
+static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb,
+                                   struct bch_sb_field *f)
+{
+       struct bch_sb_field_journal *journal = field_to_type(f, journal);
+       unsigned i, nr = bch2_nr_journal_buckets(journal);
+
+       pr_buf(out, "Buckets: ");
+       for (i = 0; i < nr; i++)
+               pr_buf(out, " %llu", le64_to_cpu(journal->buckets[i]));
+       pr_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_journal = {
+       .validate       = bch2_sb_journal_validate,
+       .to_text        = bch2_sb_journal_to_text,
+};
+
+struct u64_range {
+       u64     start;
+       u64     end;
+};
+
+static int u64_range_cmp(const void *_l, const void *_r)
+{
+       const struct u64_range *l = _l;
+       const struct u64_range *r = _r;
+
+       return cmp_int(l->start, r->start);
+}
+
+static int bch2_sb_journal_v2_validate(struct bch_sb *sb,
+                                   struct bch_sb_field *f,
+                                   struct printbuf *err)
+{
+       struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
+       struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
+       int ret = -EINVAL;
+       unsigned nr;
+       unsigned i;
+       struct u64_range *b;
+
+       nr = bch2_sb_field_journal_v2_nr_entries(journal);
+       if (!nr)
+               return 0;
+
+       b = kmalloc_array(sizeof(*b), nr, GFP_KERNEL);
+       if (!b)
+               return -ENOMEM;
+
+       for (i = 0; i < nr; i++) {
+               b[i].start = le64_to_cpu(journal->d[i].start);
+               b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr);
+       }
+
+       sort(b, nr, sizeof(*b), u64_range_cmp, NULL);
+
+       if (!b[0].start) {
+               pr_buf(err, "journal bucket at sector 0");
+               goto err;
+       }
+
+       if (b[0].start < le16_to_cpu(m->first_bucket)) {
+               pr_buf(err, "journal bucket %llu before first bucket %u",
+                      b[0].start, le16_to_cpu(m->first_bucket));
+               goto err;
+       }
+
+       if (b[nr - 1].end > le64_to_cpu(m->nbuckets)) {
+               pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)",
+                      b[nr - 1].end - 1, le64_to_cpu(m->nbuckets));
+               goto err;
+       }
+
+       for (i = 0; i + 1 < nr; i++) {
+               if (b[i].end == b[i + 1].start) {
+                       pr_buf(err, "contiguous journal buckets ranges %llu-%llu, %llu-%llu",
+                              b[i].start, b[i].end, b[i + 1].start, b[i + 1].end);
+                       goto err;
+               }
+
+               if (b[i].end > b[i + 1].start) {
+                       pr_buf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu",
+                              b[i].start, b[i].end, b[i + 1].start, b[i + 1].end);
+                       goto err;
+               }
+       }
+
+       ret = 0;
+err:
+       kfree(b);
+       return ret;
+}
+
+static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb,
+                                   struct bch_sb_field *f)
+{
+       struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2);
+       unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal);
+
+       pr_buf(out, "Buckets: ");
+       for (i = 0; i < nr; i++)
+               pr_buf(out, " %llu-%llu",
+                      le64_to_cpu(journal->d[i].start),
+                      le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr));
+       pr_newline(out);
+}
+
+const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = {
+       .validate       = bch2_sb_journal_v2_validate,
+       .to_text        = bch2_sb_journal_v2_to_text,
+};
+
+int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca)
+{
+       struct journal_device *ja = &ca->journal;
+       struct bch_sb_field_journal_v2 *j;
+       unsigned i, dst = 0, nr = 1;
+
+       lockdep_assert_held(&c->sb_lock);
+
+       if (!ja->nr) {
+               bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
+               bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2);
+               return 0;
+       }
+
+       for (i = 0; i + 1 < ja->nr; i++)
+               if (ja->buckets[i] + 1 != ja->buckets[i + 1])
+                       nr++;
+
+       j = bch2_sb_resize_journal_v2(&ca->disk_sb,
+                                (sizeof(*j) + sizeof(j->d[0]) * nr) / sizeof(u64));
+       if (!j)
+               return -ENOSPC;
+
+       bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal);
+
+       j->d[dst].start = cpu_to_le64(ja->buckets[0]);
+       j->d[dst].nr    = cpu_to_le64(1);
+
+       for (i = 1; i < ja->nr; i++) {
+               if (ja->buckets[i] == ja->buckets[i - 1] + 1) {
+                       le64_add_cpu(&j->d[dst].nr, 1);
+               } else {
+                       dst++;
+                       j->d[dst].start = cpu_to_le64(ja->buckets[i]);
+                       j->d[dst].nr    = cpu_to_le64(1);
+               }
+       }
+
+       return 0;
+}
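
bch2_journal_buckets_to_sb() above does the inverse of the journal_v2 read path: it run-length encodes the sorted in-memory journal bucket list into (start, nr) entries, deleting the old flat-format field once the v2 field is written. A user-space sketch of the compaction step, assuming at least one bucket (the caller above bails out when ja->nr is zero), with illustrative names:

	#include <stddef.h>
	#include <stdint.h>

	struct journal_range_sketch { uint64_t start, nr; };

	/* collapse runs of consecutive bucket numbers; returns the entry count */
	static size_t compact_journal_buckets(const uint64_t *buckets, size_t nr,
					      struct journal_range_sketch *out)
	{
		size_t dst = 0;

		out[dst].start = buckets[0];
		out[dst].nr    = 1;

		for (size_t i = 1; i < nr; i++) {
			if (buckets[i] == buckets[i - 1] + 1) {
				out[dst].nr++;
			} else {
				dst++;
				out[dst].start = buckets[i];
				out[dst].nr    = 1;
			}
		}

		return dst + 1;
	}
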
diff --git a/libbcachefs/journal_sb.h b/libbcachefs/journal_sb.h
new file mode 100644 (file)
index 0000000..a39192e
--- /dev/null
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include "super-io.h"
+#include "vstructs.h"
+
+static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
+{
+       return j
+               ? (__le64 *) vstruct_end(&j->field) - j->buckets
+               : 0;
+}
+
+static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j)
+{
+       if (!j)
+               return 0;
+
+       return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0];
+}
+
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal;
+extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2;
+
+int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *);
diff --git a/libbcachefs/lru.c b/libbcachefs/lru.c
new file mode 100644 (file)
index 0000000..1772ccb
--- /dev/null
@@ -0,0 +1,203 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "btree_iter.h"
+#include "btree_update.h"
+#include "error.h"
+#include "lru.h"
+#include "recovery.h"
+
+const char *bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+       const struct bch_lru *lru = bkey_s_c_to_lru(k).v;
+
+       if (bkey_val_bytes(k.k) < sizeof(*lru))
+               return "incorrect value size";
+
+       return NULL;
+}
+
+void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c,
+                     struct bkey_s_c k)
+{
+       const struct bch_lru *lru = bkey_s_c_to_lru(k).v;
+
+       pr_buf(out, "idx %llu", le64_to_cpu(lru->idx));
+}
+
+static int lru_delete(struct btree_trans *trans, u64 id, u64 idx, u64 time)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       u64 existing_idx;
+       int ret = 0;
+
+       if (!time)
+               return 0;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_lru,
+                            POS(id, time),
+                            BTREE_ITER_INTENT|
+                            BTREE_ITER_WITH_UPDATES);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       if (k.k->type != KEY_TYPE_lru) {
+               bch2_fs_inconsistent(c,
+                       "pointer to nonexistent lru %llu:%llu",
+                       id, time);
+               ret = -EIO;
+               goto err;
+       }
+
+       existing_idx = le64_to_cpu(bkey_s_c_to_lru(k).v->idx);
+       if (existing_idx != idx) {
+               bch2_fs_inconsistent(c,
+                       "lru %llu:%llu with wrong backpointer: got %llu, should be %llu",
+                       id, time, existing_idx, idx);
+               ret = -EIO;
+               goto err;
+       }
+
+       ret = bch2_btree_delete_at(trans, &iter, 0);
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+static int lru_set(struct btree_trans *trans, u64 lru_id, u64 idx, u64 *time)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_i_lru *lru;
+       int ret = 0;
+
+       if (!*time)
+               return 0;
+
+       for_each_btree_key_norestart(trans, iter, BTREE_ID_lru,
+                       POS(lru_id, *time),
+                       BTREE_ITER_SLOTS|
+                       BTREE_ITER_INTENT|
+                       BTREE_ITER_WITH_UPDATES, k, ret)
+               if (bkey_deleted(k.k))
+                       break;
+
+       if (ret)
+               goto err;
+
+       BUG_ON(iter.pos.inode != lru_id);
+       *time = iter.pos.offset;
+
+       lru = bch2_trans_kmalloc(trans, sizeof(*lru));
+       ret = PTR_ERR_OR_ZERO(lru);
+       if (ret)
+               goto err;
+
+       bkey_lru_init(&lru->k_i);
+       lru->k.p        = iter.pos;
+       lru->v.idx      = cpu_to_le64(idx);
+
+       ret = bch2_trans_update(trans, &iter, &lru->k_i, 0);
+       if (ret)
+               goto err;
+err:
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
+int bch2_lru_change(struct btree_trans *trans, u64 id, u64 idx,
+                   u64 old_time, u64 *new_time)
+{
+       if (old_time == *new_time)
+               return 0;
+
+       return  lru_delete(trans, id, idx, old_time) ?:
+               lru_set(trans, id, idx, new_time);
+}
+
+static int bch2_check_lru_key(struct btree_trans *trans,
+                             struct btree_iter *lru_iter, bool initial)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter iter;
+       struct bkey_s_c lru_k, k;
+       struct bkey_alloc_unpacked a;
+       struct printbuf buf1 = PRINTBUF;
+       struct printbuf buf2 = PRINTBUF;
+       u64 idx;
+       int ret;
+
+       lru_k = bch2_btree_iter_peek(lru_iter);
+       if (!lru_k.k)
+               return 0;
+
+       ret = bkey_err(lru_k);
+       if (ret)
+               return ret;
+
+       idx = le64_to_cpu(bkey_s_c_to_lru(lru_k).v->idx);
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+                            POS(lru_k.k->p.inode, idx), 0);
+       k = bch2_btree_iter_peek_slot(&iter);
+       ret = bkey_err(k);
+       if (ret)
+               goto err;
+
+       a = bch2_alloc_unpack(k);
+
+       if (fsck_err_on(bucket_state(a) != BUCKET_cached ||
+                       a.read_time != lru_k.k->p.offset, c,
+                       "incorrect lru entry %s\n"
+                       "  for %s",
+                       (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf),
+                       (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) {
+               struct bkey_i *update =
+                       bch2_trans_kmalloc(trans, sizeof(*update));
+
+               ret = PTR_ERR_OR_ZERO(update);
+               if (ret)
+                       goto err;
+
+               bkey_init(&update->k);
+               update->k.p = lru_iter->pos;
+
+               ret = bch2_trans_update(trans, lru_iter, update, 0);
+               if (ret)
+                       goto err;
+       }
+err:
+fsck_err:
+       bch2_trans_iter_exit(trans, &iter);
+       printbuf_exit(&buf2);
+       printbuf_exit(&buf1);
+       return ret;
+}
+
+int bch2_check_lrus(struct bch_fs *c, bool initial)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret = 0;
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       for_each_btree_key(&trans, iter, BTREE_ID_lru, POS_MIN,
+                          BTREE_ITER_PREFETCH, k, ret) {
+               ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+                       bch2_check_lru_key(&trans, &iter, initial));
+               if (ret)
+                       break;
+       }
+       bch2_trans_iter_exit(&trans, &iter);
+
+       bch2_trans_exit(&trans);
+       return ret;
+
+}
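
Concretely, the LRU btree added here is keyed by (LRU id, time) and its value is a bucket index back-pointer. As an illustrative walk-through, not taken from the commit: if a cached bucket's read_time moves from 100 to 250, bch2_lru_change() first deletes the key at POS(id, 100), after checking that its idx matches the bucket, then scans forward from POS(id, 250) for the first empty slot and inserts the new key there; *new_time is set to the slot actually used, so the alloc key and the LRU entry stay consistent, which is exactly what bch2_check_lru_key() verifies during fsck.
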
diff --git a/libbcachefs/lru.h b/libbcachefs/lru.h
new file mode 100644 (file)
index 0000000..4db6a83
--- /dev/null
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LRU_H
+#define _BCACHEFS_LRU_H
+
+const char *bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+
+#define bch2_bkey_ops_lru (struct bkey_ops) {  \
+       .key_invalid    = bch2_lru_invalid,     \
+       .val_to_text    = bch2_lru_to_text,     \
+}
+
+int bch2_lru_change(struct btree_trans *, u64, u64, u64, u64 *);
+
+int bch2_check_lrus(struct bch_fs *, bool);
+
+#endif /* _BCACHEFS_LRU_H */
index c82ecff3efe2b198eb541616e2f13fd4e4f4564e..466975a3151f80e201face176790dc36a638d4cd 100644 (file)
@@ -119,18 +119,6 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
        return DATA_SKIP;
 }
 
-static bool have_copygc_reserve(struct bch_dev *ca)
-{
-       bool ret;
-
-       spin_lock(&ca->fs->freelist_lock);
-       ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) ||
-               ca->allocator_state != ALLOCATOR_running;
-       spin_unlock(&ca->fs->freelist_lock);
-
-       return ret;
-}
-
 static inline int fragmentation_cmp(copygc_heap *heap,
                                   struct copygc_heap_entry l,
                                   struct copygc_heap_entry r)
@@ -165,7 +153,7 @@ static int walk_buckets_to_copygc(struct bch_fs *c)
                        .dev            = iter.pos.inode,
                        .gen            = u.gen,
                        .replicas       = 1 + u.stripe_redundancy,
-                       .fragmentation  = u.dirty_sectors * (1U << 15)
+                       .fragmentation  = (u64) u.dirty_sectors * (1ULL << 31)
                                / ca->mi.bucket_size,
                        .sectors        = u.dirty_sectors,
                        .offset         = bucket_to_sector(ca, iter.pos.offset),
@@ -262,11 +250,10 @@ static int bch2_copygc(struct bch_fs *c)
        }
 
        for_each_rw_member(ca, c, dev_idx) {
-               closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca));
+               s64 avail = min(dev_buckets_available(ca, RESERVE_MOVINGGC),
+                               ca->mi.nbuckets >> 6);
 
-               spin_lock(&ca->fs->freelist_lock);
-               sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size;
-               spin_unlock(&ca->fs->freelist_lock);
+               sectors_reserved += avail * ca->mi.bucket_size;
        }
 
        ret = walk_buckets_to_copygc(c);
@@ -367,8 +354,8 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
        for_each_rw_member(ca, c, dev_idx) {
                struct bch_dev_usage usage = bch2_dev_usage_read(ca);
 
-               fragmented_allowed = ((__dev_buckets_reclaimable(ca, usage) *
-                                       ca->mi.bucket_size) >> 1);
+               fragmented_allowed = ((__dev_buckets_available(ca, usage, RESERVE_NONE) *
+                                      ca->mi.bucket_size) >> 1);
                fragmented = usage.d[BCH_DATA_user].fragmented;
 
                wait = min(wait, max(0LL, fragmented_allowed - fragmented));
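
The fragmentation weight in walk_buckets_to_copygc() is rescaled along with the wider counters: with dirty_sectors now u32 and the heap entry's fragmentation field widened to u32, the expression (u64) dirty_sectors * (1ULL << 31) / bucket_size needs a 64-bit intermediate. As a quick check, a 1024-sector bucket that is half dirty scores 512 * 2^31 / 1024 = 2^30, the midpoint of the u32 range, where the old scaling (512 * 2^15 / 1024 = 16384) gave the midpoint of the u16 range.
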
index 033115f7a6f4b1b2111aef6631ca60d273211e04..70b507fb0de295371d2e79e3fb3aaaeffa0dd0ea 100644 (file)
@@ -265,7 +265,7 @@ enum opt_type {
        x(discard,                      u8,                             \
          OPT_FS|OPT_MOUNT|OPT_DEVICE,                                  \
          OPT_BOOL(),                                                   \
-         BCH2_NO_SB_OPT,               false,                          \
+         BCH2_NO_SB_OPT,               true,                           \
          NULL,         "Enable discard/TRIM support")                  \
        x(verbose,                      u8,                             \
          OPT_FS|OPT_MOUNT,                                             \
index 887971559214eb681e345328260ec12b67356823..fe2c5cb6d4305366ec006925755875433642c88c 100644 (file)
@@ -16,6 +16,7 @@
 #include "journal_io.h"
 #include "journal_reclaim.h"
 #include "journal_seq_blacklist.h"
+#include "lru.h"
 #include "move.h"
 #include "quota.h"
 #include "recovery.h"
@@ -1027,8 +1028,8 @@ int bch2_fs_recovery(struct bch_fs *c)
                        bch_info(c, "filesystem version is prior to subvol_dirent - upgrading");
                        c->opts.version_upgrade = true;
                        c->opts.fsck            = true;
-               } else if (c->sb.version < bcachefs_metadata_version_inode_v2) {
-                       bch_info(c, "filesystem version is prior to inode_v2 - upgrading");
+               } else if (c->sb.version < bcachefs_metadata_version_freespace) {
+                       bch_info(c, "filesystem version is prior to freespace - upgrading");
                        c->opts.version_upgrade = true;
                }
        }
@@ -1137,7 +1138,7 @@ use_clean:
        err = "error reading allocation information";
 
        down_read(&c->gc_lock);
-       ret = bch2_alloc_read(c, false, false);
+       ret = bch2_alloc_read(c);
        up_read(&c->gc_lock);
 
        if (ret)
@@ -1165,13 +1166,27 @@ use_clean:
                bool metadata_only = c->opts.norecovery;
 
                bch_info(c, "checking allocations");
-               err = "error in mark and sweep";
+               err = "error checking allocations";
                ret = bch2_gc(c, true, metadata_only);
                if (ret)
                        goto err;
                bch_verbose(c, "done checking allocations");
        }
 
+       if (c->opts.fsck &&
+           c->sb.version >= bcachefs_metadata_version_freespace) {
+               bch_info(c, "checking need_discard and freespace btrees");
+               err = "error checking need_discard and freespace btrees";
+               ret = bch2_check_alloc_info(c, true);
+               if (ret)
+                       goto err;
+
+               ret = bch2_check_lrus(c, true);
+               if (ret)
+                       goto err;
+               bch_verbose(c, "done checking need_discard and freespace btrees");
+       }
+
        bch2_stripes_heap_start(c);
 
        clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
@@ -1196,6 +1211,11 @@ use_clean:
        if (c->opts.verbose || !c->sb.clean)
                bch_info(c, "journal replay done");
 
+       err = "error initializing freespace";
+       ret = bch2_fs_freespace_init(c);
+       if (ret)
+               goto err;
+
        if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
                bch2_fs_lazy_rw(c);
 
@@ -1368,6 +1388,7 @@ int bch2_fs_initialize(struct bch_fs *c)
         * Write out the superblock and journal buckets, now that we can do
         * btree updates
         */
+       bch_verbose(c, "marking superblocks");
        err = "error marking superblock and journal";
        for_each_member_device(ca, c, i) {
                ret = bch2_trans_mark_dev_sb(c, ca);
@@ -1379,6 +1400,12 @@ int bch2_fs_initialize(struct bch_fs *c)
                ca->new_fs_bucket_idx = 0;
        }
 
+       bch_verbose(c, "initializing freespace");
+       err = "error initializing freespace";
+       ret = bch2_fs_freespace_init(c);
+       if (ret)
+               goto err;
+
        err = "error creating root snapshot node";
        ret = bch2_fs_initialize_subvolumes(c);
        if (ret)
index e17ce91c8486fac640f321d583fd8df8e03b4e1a..95af515a01cd9aece5572bfdd7b92e47411c3eb1 100644 (file)
@@ -10,6 +10,7 @@
 #include "io.h"
 #include "journal.h"
 #include "journal_io.h"
+#include "journal_sb.h"
 #include "journal_seq_blacklist.h"
 #include "replicas.h"
 #include "quota.h"
@@ -424,7 +425,7 @@ static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
        memcpy(dst->compat,     src->compat,    sizeof(dst->compat));
 
        for (i = 0; i < BCH_SB_FIELD_NR; i++) {
-               if (i == BCH_SB_FIELD_journal)
+               if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS)
                        continue;
 
                src_f = bch2_sb_field_get(src, i);
@@ -898,85 +899,6 @@ void __bch2_check_set_feature(struct bch_fs *c, unsigned feat)
        mutex_unlock(&c->sb_lock);
 }
 
-/* BCH_SB_FIELD_journal: */
-
-static int u64_cmp(const void *_l, const void *_r)
-{
-       u64 l = *((const u64 *) _l), r = *((const u64 *) _r);
-
-       return l < r ? -1 : l > r ? 1 : 0;
-}
-
-static int bch2_sb_journal_validate(struct bch_sb *sb,
-                                   struct bch_sb_field *f,
-                                   struct printbuf *err)
-{
-       struct bch_sb_field_journal *journal = field_to_type(f, journal);
-       struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx;
-       int ret = -EINVAL;
-       unsigned nr;
-       unsigned i;
-       u64 *b;
-
-       nr = bch2_nr_journal_buckets(journal);
-       if (!nr)
-               return 0;
-
-       b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL);
-       if (!b)
-               return -ENOMEM;
-
-       for (i = 0; i < nr; i++)
-               b[i] = le64_to_cpu(journal->buckets[i]);
-
-       sort(b, nr, sizeof(u64), u64_cmp, NULL);
-
-       if (!b[0]) {
-               pr_buf(err, "journal bucket at sector 0");
-               goto err;
-       }
-
-       if (b[0] < le16_to_cpu(m->first_bucket)) {
-               pr_buf(err, "journal bucket %llu before first bucket %u",
-                      b[0], le16_to_cpu(m->first_bucket));
-               goto err;
-       }
-
-       if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) {
-               pr_buf(err, "journal bucket %llu past end of device (nbuckets %llu)",
-                      b[nr - 1], le64_to_cpu(m->nbuckets));
-               goto err;
-       }
-
-       for (i = 0; i + 1 < nr; i++)
-               if (b[i] == b[i + 1]) {
-                       pr_buf(err, "duplicate journal buckets %llu", b[i]);
-                       goto err;
-               }
-
-       ret = 0;
-err:
-       kfree(b);
-       return ret;
-}
-
-static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb,
-                                   struct bch_sb_field *f)
-{
-       struct bch_sb_field_journal *journal = field_to_type(f, journal);
-       unsigned i, nr = bch2_nr_journal_buckets(journal);
-
-       pr_buf(out, "Buckets: ");
-       for (i = 0; i < nr; i++)
-               pr_buf(out, " %llu", le64_to_cpu(journal->buckets[i]));
-       pr_newline(out);
-}
-
-static const struct bch_sb_field_ops bch_sb_field_ops_journal = {
-       .validate       = bch2_sb_journal_validate,
-       .to_text        = bch2_sb_journal_to_text,
-};
-
 /* BCH_SB_FIELD_members: */
 
 static int bch2_sb_members_validate(struct bch_sb *sb,
@@ -1130,6 +1052,11 @@ static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb,
                pr_buf(out, "%llu", BCH_MEMBER_DISCARD(m));
                pr_newline(out);
 
+               pr_buf(out, "Freespace initialized:");
+               pr_tab(out);
+               pr_buf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m));
+               pr_newline(out);
+
                pr_indent_pop(out, 2);
        }
 }
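
Note on the __copy_super() hunk above: the single `i == BCH_SB_FIELD_journal` check becomes a bitmask test, so every superblock field flagged as belonging to a single device is skipped when the rest of the superblock is copied. A minimal standalone sketch of that skip-by-mask pattern follows; the enum values and the SINGLE_DEVICE_SB_FIELDS mask here are illustrative stand-ins, not the real bcachefs definitions.

/*
 * Illustrative sketch only: gate a field-copy loop with a bitmask of
 * per-device field types. The field names and mask below are
 * hypothetical; in bcachefs the mask is BCH_SINGLE_DEVICE_SB_FIELDS.
 */
#include <stdio.h>

enum sb_field { FIELD_journal, FIELD_members, FIELD_crypt, FIELD_NR };

#define SINGLE_DEVICE_SB_FIELDS	(1U << FIELD_journal)

int main(void)
{
	for (unsigned i = 0; i < FIELD_NR; i++) {
		if ((1U << i) & SINGLE_DEVICE_SB_FIELDS)
			continue;	/* per-device field: leave the destination's copy alone */
		printf("would copy field %u\n", i);
	}
	return 0;
}
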
index 50f31a3b9b1845208a23921f32f2db9a71389006..14a25f6fe29a5756bd6dd218e113564afc3ac32f 100644 (file)
@@ -75,15 +75,6 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)
                __bch2_check_set_feature(c, feat);
 }
 
-/* BCH_SB_FIELD_journal: */
-
-static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j)
-{
-       return j
-               ? (__le64 *) vstruct_end(&j->field) - j->buckets
-               : 0;
-}
-
 /* BCH_SB_FIELD_members: */
 
 static inline bool bch2_member_exists(struct bch_member *m)
@@ -112,6 +103,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
                .durability     = BCH_MEMBER_DURABILITY(mi)
                        ? BCH_MEMBER_DURABILITY(mi) - 1
                        : 1,
+               .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
                .valid          = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)),
        };
 }
index 46947163a8dcead4ca6d189127d2929d7f16cbba..6464e8c08ebff1ec1d6045779f6359cd2c154e6e 100644 (file)
@@ -199,17 +199,9 @@ static void __bch2_fs_read_only(struct bch_fs *c)
         */
        bch2_journal_flush_all_pins(&c->journal);
 
-       /*
-        * If the allocator threads didn't all start up, the btree updates to
-        * write out alloc info aren't going to work:
-        */
-       if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags))
-               goto nowrote_alloc;
-
        bch_verbose(c, "flushing journal and stopping allocators");
 
        bch2_journal_flush_all_pins(&c->journal);
-       set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
 
        do {
                clean_passes++;
@@ -234,17 +226,11 @@ static void __bch2_fs_read_only(struct bch_fs *c)
        bch_verbose(c, "flushing journal and stopping allocators complete");
 
        set_bit(BCH_FS_ALLOC_CLEAN, &c->flags);
-nowrote_alloc:
+
        closure_wait_event(&c->btree_interior_update_wait,
                           !bch2_btree_interior_updates_nr_pending(c));
        flush_work(&c->btree_interior_update_work);
 
-       for_each_member_device(ca, c, i)
-               bch2_dev_allocator_stop(ca);
-
-       clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
-       clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags);
-
        bch2_fs_journal_stop(&c->journal);
 
        /*
@@ -280,10 +266,6 @@ void bch2_fs_read_only(struct bch_fs *c)
        /*
         * Block new foreground-end write operations from starting - any new
         * writes will return -EROFS:
-        *
-        * (This is really blocking new _allocations_, writes to previously
-        * allocated space can still happen until stopping the allocator in
-        * bch2_dev_allocator_stop()).
         */
        percpu_ref_kill(&c->writes);
 
@@ -412,19 +394,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
                bch2_dev_allocator_add(c, ca);
        bch2_recalc_capacity(c);
 
-       for_each_rw_member(ca, c, i) {
-               ret = bch2_dev_allocator_start(ca);
-               if (ret) {
-                       bch_err(c, "error starting allocator threads");
-                       percpu_ref_put(&ca->io_ref);
-                       goto err;
-               }
-       }
-
-       set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
-
-       for_each_rw_member(ca, c, i)
-               bch2_wake_allocator(ca);
+       bch2_do_discards(c);
 
        if (!early) {
                ret = bch2_fs_read_write_late(c);
@@ -941,20 +911,6 @@ int bch2_fs_start(struct bch_fs *c)
 
        set_bit(BCH_FS_STARTED, &c->flags);
 
-       /*
-        * Allocator threads don't start filling copygc reserve until after we
-        * set BCH_FS_STARTED - wake them now:
-        *
-        * XXX ugly hack:
-        * Need to set ca->allocator_state here instead of relying on the
-        * allocator threads to do it to avoid racing with the copygc threads
-        * checking it and thinking they have no alloc reserve:
-        */
-       for_each_online_member(ca, c, i) {
-               ca->allocator_state = ALLOCATOR_running;
-               bch2_wake_allocator(ca);
-       }
-
        if (c->opts.read_only || c->opts.nochanges) {
                bch2_fs_read_only(c);
        } else {
@@ -1046,8 +1002,6 @@ static void bch2_dev_release(struct kobject *kobj)
 
 static void bch2_dev_free(struct bch_dev *ca)
 {
-       bch2_dev_allocator_stop(ca);
-
        cancel_work_sync(&ca->io_error_work);
 
        if (ca->kobj.state_in_sysfs &&
@@ -1162,6 +1116,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
        ca->mi = bch2_mi_to_cpu(member);
        ca->uuid = member->uuid;
 
+       ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE,
+                            ca->mi.bucket_size / btree_sectors(c));
+
        if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
                            0, GFP_KERNEL) ||
            percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
@@ -1211,12 +1168,6 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx)
 
        ca->fs = c;
 
-       if (ca->mi.state == BCH_MEMBER_STATE_rw &&
-           bch2_dev_allocator_start(ca)) {
-               bch2_dev_free(ca);
-               goto err;
-       }
-
        bch2_dev_attach(c, ca, dev_idx);
 out:
        pr_verbose_init(c->opts, "ret %i", ret);
@@ -1402,14 +1353,13 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
        /*
         * The allocator thread itself allocates btree nodes, so stop it first:
         */
-       bch2_dev_allocator_stop(ca);
        bch2_dev_allocator_remove(c, ca);
        bch2_dev_journal_stop(&c->journal, ca);
 
        bch2_copygc_start(c);
 }
 
-static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
+static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
 {
        lockdep_assert_held(&c->state_lock);
 
@@ -1417,8 +1367,6 @@ static int __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
 
        bch2_dev_allocator_add(c, ca);
        bch2_recalc_capacity(c);
-
-       return bch2_dev_allocator_start(ca);
 }
 
 int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
@@ -1445,7 +1393,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
        mutex_unlock(&c->sb_lock);
 
        if (new_state == BCH_MEMBER_STATE_rw)
-               ret = __bch2_dev_read_write(c, ca);
+               __bch2_dev_read_write(c, ca);
 
        rebalance_wakeup(c);
 
@@ -1468,30 +1416,20 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
 
 static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
 {
-       struct btree_trans trans;
-       size_t i;
+       struct bpos start       = POS(ca->dev_idx, 0);
+       struct bpos end         = POS(ca->dev_idx, U64_MAX);
        int ret;
 
-       bch2_trans_init(&trans, c, 0, 0);
-
-       for (i = 0; i < ca->mi.nbuckets; i++) {
-               ret = lockrestart_do(&trans,
-                       bch2_btree_key_cache_flush(&trans,
-                               BTREE_ID_alloc, POS(ca->dev_idx, i)));
-               if (ret)
-                       break;
-       }
-       bch2_trans_exit(&trans);
-
-       if (ret) {
+       ret =   bch2_btree_delete_range(c, BTREE_ID_alloc, start, end,
+                                       BTREE_TRIGGER_NORUN, NULL) ?:
+               bch2_btree_delete_range(c, BTREE_ID_freespace, start, end,
+                                       BTREE_TRIGGER_NORUN, NULL) ?:
+               bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end,
+                                       BTREE_TRIGGER_NORUN, NULL);
+       if (ret)
                bch_err(c, "error %i removing dev alloc info", ret);
-               return ret;
-       }
 
-       return bch2_btree_delete_range(c, BTREE_ID_alloc,
-                                      POS(ca->dev_idx, 0),
-                                      POS(ca->dev_idx + 1, 0),
-                                      0, NULL);
+       return ret;
 }
 
 int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
@@ -1709,15 +1647,16 @@ have_slot:
                goto err_late;
        }
 
+       ret = bch2_fs_freespace_init(c);
+       if (ret) {
+               bch_err(c, "device add error: error initializing free space: %i", ret);
+               goto err_late;
+       }
+
        ca->new_fs_bucket_idx = 0;
 
-       if (ca->mi.state == BCH_MEMBER_STATE_rw) {
-               ret = __bch2_dev_read_write(c, ca);
-               if (ret) {
-                       bch_err(c, "device add error: error going RW on new device: %i", ret);
-                       goto err_late;
-               }
-       }
+       if (ca->mi.state == BCH_MEMBER_STATE_rw)
+               __bch2_dev_read_write(c, ca);
 
        up_write(&c->state_lock);
        return 0;
@@ -1777,11 +1716,8 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
                goto err;
        }
 
-       if (ca->mi.state == BCH_MEMBER_STATE_rw) {
-               ret = __bch2_dev_read_write(c, ca);
-               if (ret)
-                       goto err;
-       }
+       if (ca->mi.state == BCH_MEMBER_STATE_rw)
+               __bch2_dev_read_write(c, ca);
 
        mutex_lock(&c->sb_lock);
        mi = bch2_sb_get_members(c->disk_sb.sb);
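
Note on bch2_dev_remove_alloc() above: a device's alloc info is now removed by deleting its key ranges in the alloc, freespace and need_discard btrees, with the three bch2_btree_delete_range() calls chained through GCC's binary `?:` so the first nonzero (error) return short-circuits the rest. A self-contained sketch of that idiom follows; delete_range() is a hypothetical stand-in for bch2_btree_delete_range().

/*
 * Sketch of the error-chaining idiom used in bch2_dev_remove_alloc():
 * "a ?: b" is a GNU C extension that yields a if it is nonzero and
 * only evaluates b when a == 0, so the chain stops at the first error.
 */
#include <stdio.h>

static int delete_range(const char *btree)
{
	printf("deleting %s keys\n", btree);
	return 0;	/* 0 == success, nonzero == error */
}

int main(void)
{
	int ret = delete_range("alloc") ?:
		  delete_range("freespace") ?:
		  delete_range("need_discard");

	if (ret)
		fprintf(stderr, "error %i removing dev alloc info\n", ret);
	return ret;
}
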
index d8b159a5b7f78ccacdd87236ffc32224bdc1baee..89419fc7930d004f5b68cc80a53630ac625003d3 100644 (file)
@@ -32,6 +32,7 @@ struct bch_member_cpu {
        u8                      discard;
        u8                      data_allowed;
        u8                      durability;
+       u8                      freespace_initialized;
        u8                      valid;
 };
 
index 3d6ece515a886f486ba533166269af28766e904d..bed48afb4ac9f1d35477939232ed12d617fb6ff0 100644 (file)
@@ -170,7 +170,6 @@ read_attribute(congested);
 
 read_attribute(btree_avg_write_size);
 
-read_attribute(reserve_stats);
 read_attribute(btree_cache_size);
 read_attribute(compression_stats);
 read_attribute(journal_debug);
@@ -185,11 +184,11 @@ read_attribute(internal_uuid);
 
 read_attribute(has_data);
 read_attribute(alloc_debug);
-write_attribute(wake_allocator);
 
 read_attribute(read_realloc_races);
 read_attribute(extent_migrate_done);
 read_attribute(extent_migrate_raced);
+read_attribute(bucket_alloc_fail);
 
 rw_attribute(discard);
 rw_attribute(label);
@@ -376,6 +375,8 @@ SHOW(bch2_fs)
                    atomic_long_read(&c->extent_migrate_done));
        sysfs_print(extent_migrate_raced,
                    atomic_long_read(&c->extent_migrate_raced));
+       sysfs_print(bucket_alloc_fail,
+                   atomic_long_read(&c->bucket_alloc_fail));
 
        sysfs_printf(btree_gc_periodic, "%u",   (int) c->btree_gc_periodic);
 
@@ -572,6 +573,7 @@ struct attribute *bch2_fs_internal_files[] = {
        &sysfs_read_realloc_races,
        &sysfs_extent_migrate_done,
        &sysfs_extent_migrate_raced,
+       &sysfs_bucket_alloc_fail,
 
        &sysfs_gc_gens_pos,
 
@@ -698,24 +700,6 @@ struct attribute *bch2_fs_time_stats_files[] = {
        NULL
 };
 
-static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca)
-{
-       enum alloc_reserve i;
-
-       spin_lock(&ca->fs->freelist_lock);
-
-       pr_buf(out, "free_inc:\t%zu\t%zu\n",
-              fifo_used(&ca->free_inc),
-              ca->free_inc.size);
-
-       for (i = 0; i < RESERVE_NR; i++)
-               pr_buf(out, "free[%u]:\t%zu\t%zu\n", i,
-                      fifo_used(&ca->free[i]),
-                      ca->free[i].size);
-
-       spin_unlock(&ca->fs->freelist_lock);
-}
-
 static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
 {
        struct bch_fs *c = ca->fs;
@@ -741,9 +725,6 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
               "ec\t%16llu\n"
               "available%15llu\n"
               "\n"
-              "free_inc\t\t%zu/%zu\n"
-              "free[RESERVE_MOVINGGC]\t%zu/%zu\n"
-              "free[RESERVE_NONE]\t%zu/%zu\n"
               "freelist_wait\t\t%s\n"
               "open buckets allocated\t%u\n"
               "open buckets this dev\t%u\n"
@@ -751,13 +732,9 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
               "open_buckets_wait\t%s\n"
               "open_buckets_btree\t%u\n"
               "open_buckets_user\t%u\n"
-              "btree reserve cache\t%u\n"
-              "thread state:\t\t%s\n",
+              "btree reserve cache\t%u\n",
               stats.buckets_ec,
-              __dev_buckets_available(ca, stats),
-              fifo_used(&ca->free_inc),                ca->free_inc.size,
-              fifo_used(&ca->free[RESERVE_MOVINGGC]),  ca->free[RESERVE_MOVINGGC].size,
-              fifo_used(&ca->free[RESERVE_NONE]),      ca->free[RESERVE_NONE].size,
+              __dev_buckets_available(ca, stats, RESERVE_NONE),
               c->freelist_wait.list.first              ? "waiting" : "empty",
               OPEN_BUCKETS_COUNT - c->open_buckets_nr_free,
               ca->nr_open_buckets,
@@ -765,8 +742,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
               c->open_buckets_wait.list.first          ? "waiting" : "empty",
               nr[BCH_DATA_btree],
               nr[BCH_DATA_user],
-              c->btree_reserve_cache_nr,
-              bch2_allocator_states[ca->allocator_state]);
+              c->btree_reserve_cache_nr);
 }
 
 static const char * const bch2_rw[] = {
@@ -841,9 +817,6 @@ SHOW(bch2_dev)
                     clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
                     * 100 / CONGESTED_MAX);
 
-       if (attr == &sysfs_reserve_stats)
-               reserve_stats_to_text(out, ca);
-
        if (attr == &sysfs_alloc_debug)
                dev_alloc_debug_to_text(out, ca);
 
@@ -883,9 +856,6 @@ STORE(bch2_dev)
                        return ret;
        }
 
-       if (attr == &sysfs_wake_allocator)
-               bch2_wake_allocator(ca);
-
        return size;
 }
 SYSFS_OPS(bch2_dev);
@@ -911,11 +881,8 @@ struct attribute *bch2_dev_files[] = {
        &sysfs_io_latency_stats_write,
        &sysfs_congested,
 
-       &sysfs_reserve_stats,
-
        /* debug: */
        &sysfs_alloc_debug,
-       &sysfs_wake_allocator,
        NULL
 };
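
Note on the sysfs.c changes above: the allocator-thread attributes (reserve_stats, wake_allocator) go away and a bucket_alloc_fail counter is exposed via sysfs_print(). A rough, self-contained sketch of the underlying counter pattern follows; the increment helper here is hypothetical and, presumably, the real counter is bumped on the allocation failure path elsewhere in the tree rather than in sysfs.c.

/*
 * Minimal sketch, not the kernel implementation: an atomic long that a
 * failure path increments and a sysfs-style show hook prints.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_long bucket_alloc_fail;

static void bucket_alloc_failed(void)
{
	atomic_fetch_add(&bucket_alloc_fail, 1);	/* one more failed bucket allocation */
}

static void show_bucket_alloc_fail(void)
{
	printf("%ld\n", atomic_load(&bucket_alloc_fail));
}

int main(void)
{
	bucket_alloc_failed();
	show_bucket_alloc_fail();
	return 0;
}
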