git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to 75e8a078b8 bcachefs: improved flush_held_btree_writes()
author    Kent Overstreet <kent.overstreet@gmail.com>
          Fri, 1 Mar 2019 02:34:16 +0000 (21:34 -0500)
committer Kent Overstreet <kent.overstreet@gmail.com>
          Fri, 1 Mar 2019 03:33:41 +0000 (22:33 -0500)
31 files changed:
.bcachefs_revision
cmd_migrate.c
libbcachefs/alloc_background.c
libbcachefs/alloc_background.h
libbcachefs/alloc_foreground.c
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/btree_gc.c
libbcachefs/btree_types.h
libbcachefs/btree_update.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_leaf.c
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/buckets_types.h
libbcachefs/chardev.c
libbcachefs/extents.c
libbcachefs/fifo.h
libbcachefs/journal.c
libbcachefs/journal.h
libbcachefs/journal_io.c
libbcachefs/journal_io.h
libbcachefs/journal_reclaim.c
libbcachefs/journal_reclaim.h
libbcachefs/journal_types.h
libbcachefs/recovery.c
libbcachefs/replicas.c
libbcachefs/str_hash.h
libbcachefs/super-io.c
libbcachefs/super.c
libbcachefs/sysfs.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 39d11479be4ef89a31478bf8f534a4355e3a8415..6766622b3442de308ea79bdec5e741ed5b134c86 100644
@@ -1 +1 @@
-09a546543006b60d44c4c51e7b40cd3ec7837a5e
+75e8a078b85703322fcf558f75a6845c0ef5dbb0
diff --git a/cmd_migrate.c b/cmd_migrate.c
index e9594ab79bb869c2b16b986d7bf0368396b098d6..4b6ceaa7b4e909275287b2110a7f64c7c057facc 100644
@@ -319,6 +319,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
                struct bkey_i_extent *e;
                BKEY_PADDED(k) k;
                u64 b = sector_to_bucket(ca, physical);
+               struct bucket_mark m;
                struct disk_reservation res;
                unsigned sectors;
                int ret;
@@ -337,7 +338,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
                                        .gen = bucket(ca, b)->mark.gen,
                                  });
 
-               bucket_set_dirty(ca, b);
+               bucket_cmpxchg(bucket(ca, b), m, m.dirty = true);
 
                ret = bch2_disk_reservation_get(c, &res, sectors, 1,
                                                BCH_DISK_RESERVATION_NOFAIL);
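
A note on the change above: bucket_set_dirty() is replaced by an open-coded bucket_cmpxchg(), which retries an atomic compare-and-exchange on the packed bucket mark until the update lands. Below is a minimal user-space sketch of that idiom, assuming C11 atomics and GNU statement expressions; the mark layout and all names are invented for illustration and are not the bcachefs definitions.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for struct bucket_mark: several fields packed into one
 * u64 so the whole mark can be updated in a single atomic operation.
 * The layout here is invented for illustration. */
union mark {
	uint64_t v;
	struct {
		uint32_t dirty_sectors;
		uint16_t gen;
		bool	 dirty;
	};
};

/* cmpxchg loop in the style of bucket_cmpxchg(g, new, expr): read the
 * old mark, apply the update expression to a copy, and retry until no
 * other writer raced with us; evaluates to the old mark. */
#define mark_cmpxchg(atomic_p, new, expr)				\
({									\
	union mark _old;						\
	_old.v = atomic_load(atomic_p);					\
	do {								\
		(new) = _old;						\
		expr;							\
	} while (!atomic_compare_exchange_weak((atomic_p),		\
					       &_old.v, (new).v));	\
	_old;								\
})

int main(void)
{
	_Atomic uint64_t bucket_v = 0;
	union mark m;

	/* the shape of: bucket_cmpxchg(bucket(ca, b), m, m.dirty = true) */
	mark_cmpxchg(&bucket_v, m, m.dirty = true);

	m.v = atomic_load(&bucket_v);
	printf("dirty=%d gen=%u\n", (int) m.dirty, (unsigned) m.gen);
	return 0;
}
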
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index ce42202fdd14289dd9e73cd2f02e1520118a36a2..f246319b50e1f878b36a11f103d1798254790749 100644
@@ -128,6 +128,34 @@ static inline void put_alloc_field(struct bkey_i_alloc *a, void **p,
        *p += bytes;
 }
 
+struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *a)
+{
+       struct bkey_alloc_unpacked ret = { .gen = a->gen };
+       const void *d = a->data;
+       unsigned idx = 0;
+
+#define x(_name, _bits)        ret._name = get_alloc_field(a, &d, idx++);
+       BCH_ALLOC_FIELDS()
+#undef  x
+       return ret;
+}
+
+static void bch2_alloc_pack(struct bkey_i_alloc *dst,
+                           const struct bkey_alloc_unpacked src)
+{
+       unsigned idx = 0;
+       void *d = dst->v.data;
+
+       dst->v.fields   = 0;
+       dst->v.gen      = src.gen;
+
+#define x(_name, _bits)        put_alloc_field(dst, &d, idx++, src._name);
+       BCH_ALLOC_FIELDS()
+#undef  x
+
+       set_bkey_val_bytes(&dst->k, (void *) d - (void *) &dst->v);
+}
+
 static unsigned bch_alloc_val_u64s(const struct bch_alloc *a)
 {
        unsigned i, bytes = offsetof(struct bch_alloc, data);
@@ -173,15 +201,24 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
 static void __alloc_read_key(struct bucket *g, const struct bch_alloc *a)
 {
        const void *d = a->data;
-       unsigned idx = 0;
+       unsigned idx = 0, data_type, dirty_sectors, cached_sectors;
+       struct bucket_mark m;
 
-       g->_mark.gen            = a->gen;
-       g->gen_valid            = 1;
        g->io_time[READ]        = get_alloc_field(a, &d, idx++);
        g->io_time[WRITE]       = get_alloc_field(a, &d, idx++);
-       g->_mark.data_type      = get_alloc_field(a, &d, idx++);
-       g->_mark.dirty_sectors  = get_alloc_field(a, &d, idx++);
-       g->_mark.cached_sectors = get_alloc_field(a, &d, idx++);
+       data_type               = get_alloc_field(a, &d, idx++);
+       dirty_sectors           = get_alloc_field(a, &d, idx++);
+       cached_sectors          = get_alloc_field(a, &d, idx++);
+       g->oldest_gen           = get_alloc_field(a, &d, idx++);
+
+       bucket_cmpxchg(g, m, ({
+               m.gen                   = a->gen;
+               m.data_type             = data_type;
+               m.dirty_sectors         = dirty_sectors;
+               m.cached_sectors        = cached_sectors;
+       }));
+
+       g->gen_valid            = 1;
 }
 
 static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g,
@@ -199,6 +236,7 @@ static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g,
        put_alloc_field(a, &d, idx++, m.data_type);
        put_alloc_field(a, &d, idx++, m.dirty_sectors);
        put_alloc_field(a, &d, idx++, m.cached_sectors);
+       put_alloc_field(a, &d, idx++, g->oldest_gen);
 
        set_bkey_val_bytes(&a->k, (void *) d - (void *) &a->v);
 }
@@ -315,6 +353,7 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca,
                                   BTREE_INSERT_NOFAIL|
                                   BTREE_INSERT_USE_RESERVE|
                                   BTREE_INSERT_USE_ALLOC_RESERVE|
+                                  BTREE_INSERT_NOMARK|
                                   flags,
                                   BTREE_INSERT_ENTRY(iter, &a->k_i));
        if (ret)
@@ -358,7 +397,8 @@ int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
                ? 0
                : bch2_btree_insert_at(c, NULL, NULL,
                                       BTREE_INSERT_NOFAIL|
-                                      BTREE_INSERT_JOURNAL_REPLAY,
+                                      BTREE_INSERT_JOURNAL_REPLAY|
+                                      BTREE_INSERT_NOMARK,
                                       BTREE_INSERT_ENTRY(&iter, k));
 err:
        bch2_btree_iter_unlock(&iter);
@@ -824,6 +864,142 @@ static inline long next_alloc_bucket(struct bch_dev *ca)
        return -1;
 }
 
+/*
+ * returns sequence number of most recent journal entry that updated this
+ * bucket:
+ */
+static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m)
+{
+       if (m.journal_seq_valid) {
+               u64 journal_seq = atomic64_read(&c->journal.seq);
+               u64 bucket_seq  = journal_seq;
+
+               bucket_seq &= ~((u64) U16_MAX);
+               bucket_seq |= m.journal_seq;
+
+               if (bucket_seq > journal_seq)
+                       bucket_seq -= 1 << 16;
+
+               return bucket_seq;
+       } else {
+               return 0;
+       }
+}
+
+static int bch2_invalidate_one_bucket2(struct bch_fs *c, struct bch_dev *ca,
+                                      struct btree_iter *iter,
+                                      u64 *journal_seq, unsigned flags)
+{
+#if 0
+       __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key;
+#else
+       /* hack: */
+       __BKEY_PADDED(k, 8) alloc_key;
+#endif
+       struct bkey_i_alloc *a;
+       struct bkey_alloc_unpacked u;
+       struct bucket_mark m;
+       struct bkey_s_c k;
+       bool invalidating_cached_data;
+       size_t b;
+       int ret;
+
+       BUG_ON(!ca->alloc_heap.used ||
+              !ca->alloc_heap.data[0].nr);
+       b = ca->alloc_heap.data[0].bucket;
+
+       /* first, put on free_inc and mark as owned by allocator: */
+       percpu_down_read_preempt_disable(&c->mark_lock);
+       spin_lock(&c->freelist_lock);
+
+       verify_not_on_freelist(c, ca, b);
+
+       BUG_ON(!fifo_push(&ca->free_inc, b));
+
+       bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0);
+       m = bucket(ca, b)->mark;
+
+       spin_unlock(&c->freelist_lock);
+       percpu_up_read_preempt_enable(&c->mark_lock);
+
+       bch2_btree_iter_cond_resched(iter);
+
+       BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
+
+       bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
+retry:
+       k = bch2_btree_iter_peek_slot(iter);
+       ret = btree_iter_err(k);
+       if (ret)
+               return ret;
+
+       if (k.k && k.k->type == KEY_TYPE_alloc)
+               u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v);
+       else
+               memset(&u, 0, sizeof(u));
+
+       invalidating_cached_data = u.cached_sectors != 0;
+
+       //BUG_ON(u.dirty_sectors);
+       u.data_type     = 0;
+       u.dirty_sectors = 0;
+       u.cached_sectors = 0;
+       u.read_time     = c->bucket_clock[READ].hand;
+       u.write_time    = c->bucket_clock[WRITE].hand;
+       u.gen++;
+
+       a = bkey_alloc_init(&alloc_key.k);
+       a->k.p = iter->pos;
+       bch2_alloc_pack(a, u);
+
+       ret = bch2_btree_insert_at(c, NULL,
+                       invalidating_cached_data ? journal_seq : NULL,
+                       BTREE_INSERT_ATOMIC|
+                       BTREE_INSERT_NOCHECK_RW|
+                       BTREE_INSERT_NOFAIL|
+                       BTREE_INSERT_USE_RESERVE|
+                       BTREE_INSERT_USE_ALLOC_RESERVE|
+                       flags,
+                       BTREE_INSERT_ENTRY(iter, &a->k_i));
+       if (ret == -EINTR)
+               goto retry;
+
+       if (!ret) {
+               /* remove from alloc_heap: */
+               struct alloc_heap_entry e, *top = ca->alloc_heap.data;
+
+               top->bucket++;
+               top->nr--;
+
+               if (!top->nr)
+                       heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
+
+               /*
+                * Make sure we flush the last journal entry that updated this
+                * bucket (i.e. deleting the last reference) before writing to
+                * this bucket again:
+                */
+               *journal_seq = max(*journal_seq, bucket_journal_seq(c, m));
+       } else {
+               size_t b2;
+
+               /* remove from free_inc: */
+               percpu_down_read_preempt_disable(&c->mark_lock);
+               spin_lock(&c->freelist_lock);
+
+               bch2_mark_alloc_bucket(c, ca, b, false,
+                                      gc_pos_alloc(c, NULL), 0);
+
+               BUG_ON(!fifo_pop_back(&ca->free_inc, b2));
+               BUG_ON(b != b2);
+
+               spin_unlock(&c->freelist_lock);
+               percpu_up_read_preempt_enable(&c->mark_lock);
+       }
+
+       return ret;
+}
+
 static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
                                       size_t bucket, u64 *flush_seq)
 {
@@ -844,18 +1020,7 @@ static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
 
        percpu_up_read_preempt_enable(&c->mark_lock);
 
-       if (m.journal_seq_valid) {
-               u64 journal_seq = atomic64_read(&c->journal.seq);
-               u64 bucket_seq  = journal_seq;
-
-               bucket_seq &= ~((u64) U16_MAX);
-               bucket_seq |= m.journal_seq;
-
-               if (bucket_seq > journal_seq)
-                       bucket_seq -= 1 << 16;
-
-               *flush_seq = max(*flush_seq, bucket_seq);
-       }
+       *flush_seq = max(*flush_seq, bucket_journal_seq(c, m));
 
        return m.cached_sectors != 0;
 }
@@ -868,7 +1033,6 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
        struct btree_iter iter;
        u64 journal_seq = 0;
        int ret = 0;
-       long b;
 
        bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS(ca->dev_idx, 0),
                             BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
@@ -876,14 +1040,11 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
        /* Only use nowait if we've already invalidated at least one bucket: */
        while (!ret &&
               !fifo_full(&ca->free_inc) &&
-              (b = next_alloc_bucket(ca)) >= 0) {
-               bool must_flush =
-                       bch2_invalidate_one_bucket(c, ca, b, &journal_seq);
-
-               ret = __bch2_alloc_write_key(c, ca, b, &iter,
-                               must_flush ? &journal_seq : NULL,
-                               !fifo_empty(&ca->free_inc) ? BTREE_INSERT_NOWAIT : 0);
-       }
+              ca->alloc_heap.used)
+               ret = bch2_invalidate_one_bucket2(c, ca, &iter, &journal_seq,
+                               BTREE_INSERT_GC_LOCK_HELD|
+                               (!fifo_empty(&ca->free_inc)
+                                ? BTREE_INSERT_NOWAIT : 0));
 
        bch2_btree_iter_unlock(&iter);
 
@@ -1305,24 +1466,16 @@ int bch2_dev_allocator_start(struct bch_dev *ca)
        return 0;
 }
 
-static void flush_held_btree_writes(struct bch_fs *c)
+static bool flush_done(struct bch_fs *c)
 {
        struct bucket_table *tbl;
        struct rhash_head *pos;
        struct btree *b;
-       bool nodes_blocked;
+       bool nodes_unwritten;
        size_t i;
-       struct closure cl;
-
-       closure_init_stack(&cl);
-
-       clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
 again:
-       pr_debug("flushing dirty btree nodes");
        cond_resched();
-       closure_wait(&c->btree_interior_update_wait, &cl);
-
-       nodes_blocked = false;
+       nodes_unwritten = false;
 
        rcu_read_lock();
        for_each_cached_btree(b, c, tbl, i, pos)
@@ -1334,24 +1487,25 @@ again:
                                six_unlock_read(&b->lock);
                                goto again;
                        } else {
-                               nodes_blocked = true;
+                               nodes_unwritten = true;
                        }
                }
        rcu_read_unlock();
 
-       if (c->btree_roots_dirty)
+       if (c->btree_roots_dirty) {
                bch2_journal_meta(&c->journal);
-
-       if (nodes_blocked) {
-               closure_sync(&cl);
                goto again;
        }
 
-       closure_wake_up(&c->btree_interior_update_wait);
-       closure_sync(&cl);
+       return !nodes_unwritten &&
+               !bch2_btree_interior_updates_nr_pending(c);
+}
 
-       closure_wait_event(&c->btree_interior_update_wait,
-                          !bch2_btree_interior_updates_nr_pending(c));
+static void flush_held_btree_writes(struct bch_fs *c)
+{
+       clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags);
+
+       closure_wait_event(&c->btree_interior_update_wait, flush_done(c));
 }
 
 static void allocator_start_issue_discards(struct bch_fs *c)
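
The headline change of this commit: flush_held_btree_writes() used to interleave waiting and rechecking by hand; above, the recheck logic becomes a pure predicate, flush_done(), handed to closure_wait_event(). A sketch of that predicate-plus-wait-event shape, using pthreads as a stand-in for the kernel closure API (all names illustrative):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int nodes_unwritten = 1;

/* the predicate: the real flush_done() scans cached btree nodes, kicks
 * off writes for held ones, flushes dirty roots, and returns true only
 * once nothing is pending */
static bool flush_done(void)
{
	return nodes_unwritten == 0;
}

/* wait_event-style helper: recheck the predicate under the lock so a
 * wakeup between the check and the wait can't be lost */
static void wait_event(bool (*done)(void))
{
	pthread_mutex_lock(&lock);
	while (!done())
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}

static void *writer(void *arg)
{
	(void) arg;
	pthread_mutex_lock(&lock);
	nodes_unwritten = 0;	/* simulate the last btree write completing */
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, writer, NULL);
	wait_event(flush_done);		/* as in flush_held_btree_writes() */
	pthread_join(t, NULL);
	printf("flushed\n");
	return 0;
}
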
@@ -1470,7 +1624,6 @@ not_enough:
                                                           &journal_seq);
 
                                fifo_push(&ca->free[RESERVE_BTREE], bu);
-                               bucket_set_dirty(ca, bu);
                        }
                }
 
@@ -1517,7 +1670,6 @@ int bch2_fs_allocator_start(struct bch_fs *c)
 {
        struct bch_dev *ca;
        unsigned i;
-       bool wrote;
        int ret;
 
        down_read(&c->gc_lock);
@@ -1536,8 +1688,7 @@ int bch2_fs_allocator_start(struct bch_fs *c)
        }
 
        set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags);
-
-       return bch2_alloc_write(c, false, &wrote);
+       return 0;
 }
 
 void bch2_fs_allocator_background_init(struct bch_fs *c)
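
bucket_journal_seq(), factored out above, reconstructs a full 64-bit journal sequence number from the 16 bits kept in the bucket mark: splice the stored low bits into the current sequence number, and if the result lands in the future, the low bits wrapped since the bucket was written, so back off one 2^16 epoch. A standalone restatement with two worked cases (standard C; names are illustrative):

#include <stdint.h>
#include <stdio.h>

static uint64_t unpack_seq16(uint64_t cur_seq, uint16_t low16)
{
	/* splice the stored low 16 bits into the current seq */
	uint64_t seq = (cur_seq & ~(uint64_t) UINT16_MAX) | low16;

	/* the bucket can't have been updated after cur_seq: a larger
	 * result means the low bits wrapped, so back off one epoch */
	if (seq > cur_seq)
		seq -= 1 << 16;

	return seq;
}

int main(void)
{
	/* no wrap: current seq 0x10042, bucket wrote at 0x10010 */
	printf("%llx\n", (unsigned long long) unpack_seq16(0x10042, 0x0010));
	/* wrap: current seq 0x20001, bucket wrote at 0x1fff0 */
	printf("%llx\n", (unsigned long long) unpack_seq16(0x20001, 0xfff0));
	return 0;
}
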
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
index 26561b3bafb8336cfde59bcea506d77795064923..65e9b373a35033700ae4c72d985333c29b6ef1a8 100644
@@ -5,6 +5,15 @@
 #include "alloc_types.h"
 #include "debug.h"
 
+struct bkey_alloc_unpacked {
+       u8              gen;
+#define x(_name, _bits)        u##_bits _name;
+       BCH_ALLOC_FIELDS()
+#undef  x
+};
+
+struct bkey_alloc_unpacked bch2_alloc_unpack(const struct bch_alloc *);
+
 #define ALLOC_SCAN_BATCH(ca)           max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
 
 const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
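
struct bkey_alloc_unpacked above is generated by the BCH_ALLOC_FIELDS() x-macro, so the field list in bcachefs_format.h is the single source of truth for the struct layout and for the pack/unpack loops in alloc_background.c. A self-contained sketch of the pattern, using standard uintN_t types in place of the kernel's u8/u16 and a field subset chosen for illustration:

#include <stdint.h>
#include <stdio.h>

/* one field list drives every expansion below */
#define DEMO_FIELDS()			\
	x(read_time,	16)		\
	x(write_time,	16)		\
	x(oldest_gen,	8)

/* expansion 1: the unpacked struct */
struct demo_unpacked {
#define x(_name, _bits)	uint##_bits##_t _name;
	DEMO_FIELDS()
#undef x
};

/* expansion 2: per-field code (here just printing) */
static void demo_print(const struct demo_unpacked *u)
{
#define x(_name, _bits)	printf(#_name " = %llu\n", (unsigned long long) u->_name);
	DEMO_FIELDS()
#undef x
}

int main(void)
{
	struct demo_unpacked u = {
		.read_time = 1, .write_time = 2, .oldest_gen = 3,
	};

	demo_print(&u);
	return 0;
}
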
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index f2f9015dbb00ca144fe0eb2cf31d2f4e53f2b909..6568e8ac200324eda64ac70c180ee1b18cb405af 100644
@@ -723,7 +723,7 @@ static struct write_point *__writepoint_find(struct hlist_head *head,
 static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor)
 {
        u64 stranded    = c->write_points_nr * c->bucket_size_max;
-       u64 free        = bch2_fs_sectors_free(c);
+       u64 free        = bch2_fs_usage_read_short(c).free;
 
        return stranded * factor > free;
 }
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 245d832218d216f91a56613877a8dd6017b0ceb2..052ec263618e42706066f5ee0a088ad97086f91a 100644
@@ -396,8 +396,6 @@ struct bch_dev {
        struct bucket_array __rcu *buckets[2];
        unsigned long           *buckets_nouse;
        unsigned long           *buckets_written;
-       /* most out of date gen in the btree */
-       u8                      *oldest_gens;
        struct rw_semaphore     bucket_lock;
 
        struct bch_dev_usage __percpu *usage[2];
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index d020cf74e9e975f8ee88f04652e011aeb1df7c38..56bf69eb66d45cd34616832f3a700f575574cdd6 100644
@@ -821,11 +821,12 @@ struct bch_alloc {
 } __attribute__((packed, aligned(8)));
 
 #define BCH_ALLOC_FIELDS()                     \
-       x(read_time, 2)                         \
-       x(write_time, 2)                        \
-       x(data_type, 1)                         \
-       x(dirty_sectors, 2)                     \
-       x(cached_sectors, 2)
+       x(read_time,            16)             \
+       x(write_time,           16)             \
+       x(data_type,            8)              \
+       x(dirty_sectors,        16)             \
+       x(cached_sectors,       16)             \
+       x(oldest_gen,           8)
 
 enum {
 #define x(name, bytes) BCH_ALLOC_FIELD_##name,
@@ -835,12 +836,12 @@ enum {
 };
 
 static const unsigned BCH_ALLOC_FIELD_BYTES[] = {
-#define x(name, bytes) [BCH_ALLOC_FIELD_##name] = bytes,
+#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8,
        BCH_ALLOC_FIELDS()
 #undef x
 };
 
-#define x(name, bytes) + bytes
+#define x(name, bits) + (bits / 8)
 static const unsigned BKEY_ALLOC_VAL_U64s_MAX =
        DIV_ROUND_UP(offsetof(struct bch_alloc, data)
                     BCH_ALLOC_FIELDS(), sizeof(u64));
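
The constant at the end of this hunk relies on a second use of the same x-macro: with x(name, bits) defined as "+ (bits / 8)", BCH_ALLOC_FIELDS() expands into a chain of additions after the header size, making BKEY_ALLOC_VAL_U64s_MAX a compile-time constant. A standalone sketch of the trick (standard C, illustrative names):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_FIELDS()			\
	x(read_time,		16)	\
	x(write_time,		16)	\
	x(data_type,		8)	\
	x(dirty_sectors,	16)	\
	x(cached_sectors,	16)	\
	x(oldest_gen,		8)

struct demo_val {
	uint8_t gen;
	uint8_t data[];		/* variable-length packed fields */
};

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* each field contributes "+ (bits / 8)" bytes to the maximum size */
#define x(name, bits)	+ ((bits) / 8)
static const unsigned DEMO_VAL_U64S_MAX =
	DIV_ROUND_UP(offsetof(struct demo_val, data)
		     DEMO_FIELDS(), sizeof(uint64_t));
#undef x

int main(void)
{
	/* 1 header byte + 10 field bytes = 11 bytes -> 2 u64s */
	printf("%u\n", DEMO_VAL_U64S_MAX);
	return 0;
}
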
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index b1f5e8b1071e5e2f65e98f270c49d16573b56979..5d6f63646d9f82f1c521c1e89b29c1ba08de875f 100644
@@ -138,24 +138,24 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
 
                bkey_for_each_ptr(ptrs, ptr) {
                        struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-                       size_t b = PTR_BUCKET_NR(ca, ptr);
-                       struct bucket *g = PTR_BUCKET(ca, ptr);
+                       struct bucket *g = PTR_BUCKET(ca, ptr, true);
+                       struct bucket *g2 = PTR_BUCKET(ca, ptr, false);
 
                        if (mustfix_fsck_err_on(!g->gen_valid, c,
                                        "found ptr with missing gen in alloc btree,\n"
                                        "type %u gen %u",
                                        k.k->type, ptr->gen)) {
-                               g->_mark.gen = ptr->gen;
-                               g->gen_valid = 1;
-                               bucket_set_dirty(ca, b);
+                               g2->_mark.gen   = g->_mark.gen          = ptr->gen;
+                               g2->_mark.dirty = g->_mark.dirty        = true;
+                               g2->gen_valid   = g->gen_valid          = true;
                        }
 
                        if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
                                        "%u ptr gen in the future: %u > %u",
                                        k.k->type, ptr->gen, g->mark.gen)) {
-                               g->_mark.gen = ptr->gen;
-                               g->gen_valid = 1;
-                               bucket_set_dirty(ca, b);
+                               g2->_mark.gen   = g->_mark.gen          = ptr->gen;
+                               g2->_mark.dirty = g->_mark.dirty        = true;
+                               g2->gen_valid   = g->gen_valid          = true;
                                set_bit(BCH_FS_FIXED_GENS, &c->flags);
                        }
                }
@@ -163,10 +163,10 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
 
        bkey_for_each_ptr(ptrs, ptr) {
                struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-               size_t b = PTR_BUCKET_NR(ca, ptr);
+               struct bucket *g = PTR_BUCKET(ca, ptr, true);
 
-               if (gen_after(ca->oldest_gens[b], ptr->gen))
-                       ca->oldest_gens[b] = ptr->gen;
+               if (gen_after(g->oldest_gen, ptr->gen))
+                       g->oldest_gen = ptr->gen;
 
                *max_stale = max(*max_stale, ptr_stale(ca, ptr));
        }
@@ -230,12 +230,12 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
 
                bch2_verify_btree_nr_keys(b);
 
+               gc_pos_set(c, gc_pos_btree_node(b));
+
                ret = btree_gc_mark_node(c, b, &max_stale, initial);
                if (ret)
                        break;
 
-               gc_pos_set(c, gc_pos_btree_node(b));
-
                if (!initial) {
                        if (max_stale > 64)
                                bch2_btree_node_rewrite(c, &iter,
@@ -483,88 +483,38 @@ static void bch2_gc_free(struct bch_fs *c)
        percpu_up_write(&c->mark_lock);
 }
 
-static void bch2_gc_done_nocheck(struct bch_fs *c)
-{
-       struct bch_dev *ca;
-       unsigned i;
-
-       {
-               struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
-               struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
-               struct stripe *dst, *src;
-
-               c->ec_stripes_heap.used = 0;
-
-               while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) &&
-                      (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) {
-                       *dst = *src;
-
-                       if (dst->alive)
-                               bch2_stripes_heap_insert(c, dst, dst_iter.pos);
-
-                       genradix_iter_advance(&dst_iter, &c->stripes[0]);
-                       genradix_iter_advance(&src_iter, &c->stripes[1]);
-               }
-       }
-
-       for_each_member_device(ca, c, i) {
-               struct bucket_array *src = __bucket_array(ca, 1);
-
-               memcpy(__bucket_array(ca, 0), src,
-                      sizeof(struct bucket_array) +
-                      sizeof(struct bucket) * src->nbuckets);
-       };
-
-       for_each_member_device(ca, c, i) {
-               unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64);
-               struct bch_dev_usage *dst = (void *)
-                       bch2_acc_percpu_u64s((void *) ca->usage[0], nr);
-               struct bch_dev_usage *src = (void *)
-                       bch2_acc_percpu_u64s((void *) ca->usage[1], nr);
-
-               *dst = *src;
-       }
-
-       {
-               unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
-                       c->replicas.nr;
-               struct bch_fs_usage *dst = (void *)
-                       bch2_acc_percpu_u64s((void *) c->usage[0], nr);
-               struct bch_fs_usage *src = (void *)
-                       bch2_acc_percpu_u64s((void *) c->usage[1], nr);
-
-               memcpy(&dst->s.gc_start[0],
-                      &src->s.gc_start[0],
-                      nr * sizeof(u64) - offsetof(typeof(*dst), s.gc_start));
-       }
-}
-
 static void bch2_gc_done(struct bch_fs *c, bool initial)
 {
        struct bch_dev *ca;
+       bool verify = !initial ||
+               (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO));
        unsigned i;
 
 #define copy_field(_f, _msg, ...)                                      \
        if (dst->_f != src->_f) {                                       \
-               bch_err(c, _msg ": got %llu, should be %llu, fixing"    \
-                       , ##__VA_ARGS__, dst->_f, src->_f);             \
+               if (verify)                                             \
+                       bch_err(c, _msg ": got %llu, should be %llu, fixing"\
+                               , ##__VA_ARGS__, dst->_f, src->_f);     \
                dst->_f = src->_f;                                      \
        }
 #define copy_stripe_field(_f, _msg, ...)                               \
        if (dst->_f != src->_f) {                                       \
-               bch_err_ratelimited(c, "stripe %zu has wrong "_msg      \
-                       ": got %u, should be %u, fixing",               \
-                       dst_iter.pos, ##__VA_ARGS__,                    \
-                       dst->_f, src->_f);                              \
+               if (verify)                                             \
+                       bch_err_ratelimited(c, "stripe %zu has wrong "_msg\
+                               ": got %u, should be %u, fixing",       \
+                               dst_iter.pos, ##__VA_ARGS__,            \
+                               dst->_f, src->_f);                      \
                dst->_f = src->_f;                                      \
                dst->dirty = true;                                      \
        }
 #define copy_bucket_field(_f)                                          \
        if (dst->b[b].mark._f != src->b[b].mark._f) {                   \
-               bch_err_ratelimited(c, "dev %u bucket %zu has wrong " #_f\
-                       ": got %u, should be %u, fixing",               \
-                       i, b, dst->b[b].mark._f, src->b[b].mark._f);    \
+               if (verify)                                             \
+                       bch_err_ratelimited(c, "dev %u bucket %zu has wrong " #_f\
+                               ": got %u, should be %u, fixing", i, b, \
+                               dst->b[b].mark._f, src->b[b].mark._f);  \
                dst->b[b]._mark._f = src->b[b].mark._f;                 \
+               dst->b[b]._mark.dirty = true;                           \
        }
 #define copy_dev_field(_f, _msg, ...)                                  \
        copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
@@ -573,12 +523,6 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
 
        percpu_down_write(&c->mark_lock);
 
-       if (initial &&
-           !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))) {
-               bch2_gc_done_nocheck(c);
-               goto out;
-       }
-
        {
                struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0);
                struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0);
@@ -629,6 +573,11 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
                        copy_bucket_field(stripe);
                        copy_bucket_field(dirty_sectors);
                        copy_bucket_field(cached_sectors);
+
+                       if (dst->b[b].oldest_gen != src->b[b].oldest_gen) {
+                               dst->b[b].oldest_gen = src->b[b].oldest_gen;
+                               dst->b[b]._mark.dirty = true;
+                       }
                }
        };
 
@@ -641,44 +590,46 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
                unsigned b;
 
                for (b = 0; b < BCH_DATA_NR; b++)
-                       copy_dev_field(buckets[b],
-                                      "buckets[%s]", bch2_data_types[b]);
-               copy_dev_field(buckets_alloc, "buckets_alloc");
-               copy_dev_field(buckets_ec, "buckets_ec");
+                       copy_dev_field(buckets[b],      "buckets[%s]",
+                                      bch2_data_types[b]);
+               copy_dev_field(buckets_alloc,           "buckets_alloc");
+               copy_dev_field(buckets_ec,              "buckets_ec");
+               copy_dev_field(buckets_unavailable,     "buckets_unavailable");
 
                for (b = 0; b < BCH_DATA_NR; b++)
-                       copy_dev_field(sectors[b],
-                                      "sectors[%s]", bch2_data_types[b]);
-               copy_dev_field(sectors_fragmented,
-                              "sectors_fragmented");
+                       copy_dev_field(sectors[b],      "sectors[%s]",
+                                      bch2_data_types[b]);
+               copy_dev_field(sectors_fragmented,      "sectors_fragmented");
        }
 
        {
-               unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
-                       c->replicas.nr;
+               unsigned nr = fs_usage_u64s(c);
                struct bch_fs_usage *dst = (void *)
                        bch2_acc_percpu_u64s((void *) c->usage[0], nr);
                struct bch_fs_usage *src = (void *)
                        bch2_acc_percpu_u64s((void *) c->usage[1], nr);
 
-               copy_fs_field(s.hidden,         "hidden");
-               copy_fs_field(s.data,           "data");
-               copy_fs_field(s.cached,         "cached");
-               copy_fs_field(s.reserved,       "reserved");
-               copy_fs_field(s.nr_inodes,      "nr_inodes");
+               copy_fs_field(hidden,           "hidden");
+               copy_fs_field(data,             "data");
+               copy_fs_field(cached,           "cached");
+               copy_fs_field(reserved,         "reserved");
+               copy_fs_field(nr_inodes,        "nr_inodes");
 
                for (i = 0; i < BCH_REPLICAS_MAX; i++)
                        copy_fs_field(persistent_reserved[i],
                                      "persistent_reserved[%i]", i);
 
                for (i = 0; i < c->replicas.nr; i++) {
-                       /*
-                        * XXX: print out replicas entry
-                        */
-                       copy_fs_field(data[i], "data[%i]", i);
+                       struct bch_replicas_entry *e =
+                               cpu_replicas_entry(&c->replicas, i);
+                       char buf[80];
+
+                       bch2_replicas_entry_to_text(&PBUF(buf), e);
+
+                       copy_fs_field(replicas[i], "%s", buf);
                }
        }
-out:
+
        percpu_up_write(&c->mark_lock);
 
 #undef copy_fs_field
@@ -693,19 +644,18 @@ static int bch2_gc_start(struct bch_fs *c)
        struct bch_dev *ca;
        unsigned i;
 
+       percpu_down_write(&c->mark_lock);
+
        /*
         * indicate to stripe code that we need to allocate for the gc stripes
         * radix tree, too
         */
        gc_pos_set(c, gc_phase(GC_PHASE_START));
 
-       percpu_down_write(&c->mark_lock);
        BUG_ON(c->usage[1]);
 
-       c->usage[1] = __alloc_percpu_gfp(sizeof(struct bch_fs_usage) +
-                                        sizeof(u64) * c->replicas.nr,
-                                        sizeof(u64),
-                                        GFP_KERNEL);
+       c->usage[1] = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
+                                        sizeof(u64), GFP_KERNEL);
        percpu_up_write(&c->mark_lock);
 
        if (!c->usage[1])
@@ -740,8 +690,12 @@ static int bch2_gc_start(struct bch_fs *c)
                dst->first_bucket       = src->first_bucket;
                dst->nbuckets           = src->nbuckets;
 
-               for (b = 0; b < src->nbuckets; b++)
-                       dst->b[b]._mark.gen = src->b[b].mark.gen;
+               for (b = 0; b < src->nbuckets; b++) {
+                       dst->b[b]._mark.gen =
+                               dst->b[b].oldest_gen =
+                               src->b[b].mark.gen;
+                       dst->b[b].gen_valid = src->b[b].gen_valid;
+               }
        };
 
        percpu_up_write(&c->mark_lock);
@@ -800,6 +754,8 @@ out:
                if (iter++ <= 2) {
                        bch_info(c, "Fixed gens, restarting mark and sweep:");
                        clear_bit(BCH_FS_FIXED_GENS, &c->flags);
+                       __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
+                       bch2_gc_free(c);
                        goto again;
                }
 
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 18596dc8d7ba9b7f946566d8c935facbe29c8ac1..b38722da18ebe45c0e03357934c2184a644603c7 100644
@@ -455,6 +455,7 @@ static inline bool btree_node_is_extents(struct btree *b)
 static inline bool btree_node_type_needs_gc(enum btree_node_type type)
 {
        switch (type) {
+       case BKEY_TYPE_ALLOC:
        case BKEY_TYPE_BTREE:
        case BKEY_TYPE_EXTENTS:
        case BKEY_TYPE_INODES:
@@ -489,7 +490,6 @@ enum btree_insert_ret {
        /* leaf node needs to be split */
        BTREE_INSERT_BTREE_NODE_FULL,
        BTREE_INSERT_ENOSPC,
-       BTREE_INSERT_NEED_GC_LOCK,
        BTREE_INSERT_NEED_MARK_REPLICAS,
 };
 
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index 4bd0725846d002229748fecd387577a0d02b209e..faacde9ae4bb46e9e4e92a4a027320681536ac8e 100644
@@ -81,6 +81,7 @@ enum {
        __BTREE_INSERT_USE_RESERVE,
        __BTREE_INSERT_USE_ALLOC_RESERVE,
        __BTREE_INSERT_JOURNAL_REPLAY,
+       __BTREE_INSERT_NOMARK,
        __BTREE_INSERT_NOWAIT,
        __BTREE_INSERT_GC_LOCK_HELD,
        __BCH_HASH_SET_MUST_CREATE,
@@ -107,12 +108,12 @@ enum {
 #define BTREE_INSERT_USE_RESERVE       (1 << __BTREE_INSERT_USE_RESERVE)
 #define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE)
 
-/*
- * Insert is for journal replay: don't get journal reservations, or mark extents
- * (bch_mark_key)
- */
+/* Insert is for journal replay - don't get journal reservations: */
 #define BTREE_INSERT_JOURNAL_REPLAY    (1 << __BTREE_INSERT_JOURNAL_REPLAY)
 
+/* Don't call bch2_mark_key: */
+#define BTREE_INSERT_NOMARK            (1 << __BTREE_INSERT_NOMARK)
+
 /* Don't block on allocation failure (for new btree nodes: */
 #define BTREE_INSERT_NOWAIT            (1 << __BTREE_INSERT_NOWAIT)
 #define BTREE_INSERT_GC_LOCK_HELD      (1 << __BTREE_INSERT_GC_LOCK_HELD)
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 33b5cf40a5f48377f2a5fe3a6357fc77cba02eba..b1b858dedaf0d3a6d0ccf83a7150711386f0a872 100644
@@ -483,7 +483,7 @@ static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c,
        struct btree *b;
        struct disk_reservation disk_res = { 0, 0 };
        unsigned sectors = nr_nodes * c->opts.btree_node_size;
-       int ret, disk_res_flags = BCH_DISK_RESERVATION_GC_LOCK_HELD;
+       int ret, disk_res_flags = 0;
 
        if (flags & BTREE_INSERT_NOFAIL)
                disk_res_flags |= BCH_DISK_RESERVATION_NOFAIL;
@@ -1086,8 +1086,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
                bch2_btree_node_free_index(as, NULL,
                                           bkey_i_to_s_c(&old->key),
                                           fs_usage);
-       bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
-                           gc_pos_btree_root(b->btree_id));
+       bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res);
 
        percpu_up_read_preempt_enable(&c->mark_lock);
        mutex_unlock(&c->btree_interior_update_lock);
@@ -1188,8 +1187,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
                                           bkey_disassemble(b, k, &tmp),
                                           fs_usage);
 
-       bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
-                           gc_pos_btree_node(b));
+       bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res);
 
        percpu_up_read_preempt_enable(&c->mark_lock);
        mutex_unlock(&c->btree_interior_update_lock);
@@ -1564,7 +1562,8 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
        closure_init_stack(&cl);
 
        /* Hack, because gc and splitting nodes doesn't mix yet: */
-       if (!down_read_trylock(&c->gc_lock)) {
+       if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
+           !down_read_trylock(&c->gc_lock)) {
                if (flags & BTREE_INSERT_NOUNLOCK)
                        return -EINTR;
 
@@ -1607,7 +1606,8 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
         */
        __bch2_btree_iter_downgrade(iter, 1);
 out:
-       up_read(&c->gc_lock);
+       if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
+               up_read(&c->gc_lock);
        closure_sync(&cl);
        return ret;
 }
@@ -1685,7 +1685,8 @@ retry:
        }
 
        /* We're changing btree topology, doesn't mix with gc: */
-       if (!down_read_trylock(&c->gc_lock))
+       if (!(flags & BTREE_INSERT_GC_LOCK_HELD) &&
+           !down_read_trylock(&c->gc_lock))
                goto err_cycle_gc_lock;
 
        if (!bch2_btree_iter_upgrade(iter, U8_MAX,
@@ -1745,7 +1746,8 @@ retry:
 
        bch2_btree_update_done(as);
 
-       up_read(&c->gc_lock);
+       if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
+               up_read(&c->gc_lock);
 out:
        bch2_btree_iter_verify_locks(iter);
 
@@ -1776,7 +1778,8 @@ err_cycle_gc_lock:
 
 err_unlock:
        six_unlock_intent(&m->lock);
-       up_read(&c->gc_lock);
+       if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
+               up_read(&c->gc_lock);
 err:
        BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK));
 
@@ -1942,8 +1945,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
        ret = bch2_disk_reservation_add(c, &as->reserve->disk_res,
                        c->opts.btree_node_size *
                        bch2_bkey_nr_ptrs(bkey_i_to_s_c(&new_key->k_i)),
-                       BCH_DISK_RESERVATION_NOFAIL|
-                       BCH_DISK_RESERVATION_GC_LOCK_HELD);
+                       BCH_DISK_RESERVATION_NOFAIL);
        BUG_ON(ret);
 
        parent = btree_node_parent(iter, b);
@@ -1989,8 +1991,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
                bch2_btree_node_free_index(as, NULL,
                                           bkey_i_to_s_c(&b->key),
                                           fs_usage);
-               bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
-                                   gc_pos_btree_root(b->btree_id));
+               bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res);
 
                percpu_up_read_preempt_enable(&c->mark_lock);
                mutex_unlock(&c->btree_interior_update_lock);
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 0df894fcf1ae67682760a7948c9175f0cefefad7..da8c69871c7663a8f15e45d85f73f1255ee4aeb0 100644
@@ -415,6 +415,25 @@ static inline int btree_trans_cmp(struct btree_insert_entry l,
                btree_iter_cmp(l.iter, r.iter);
 }
 
+static bool btree_trans_relock(struct btree_insert *trans)
+{
+       struct btree_insert_entry *i;
+
+       trans_for_each_iter(trans, i)
+               return bch2_btree_iter_relock(i->iter);
+       return true;
+}
+
+static void btree_trans_unlock(struct btree_insert *trans)
+{
+       struct btree_insert_entry *i;
+
+       trans_for_each_iter(trans, i) {
+               bch2_btree_iter_unlock(i->iter);
+               break;
+       }
+}
+
 /* Normal update interface: */
 
 static enum btree_insert_ret
@@ -466,49 +485,12 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
        struct btree_iter *linked;
        unsigned u64s;
        int ret;
-
+retry:
        trans_for_each_iter(trans, i)
                BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK);
 
-       /* reserve space for deferred updates */
-       __trans_for_each_entry(trans, i, i->deferred) {
-
-       }
-
        memset(&trans->journal_res, 0, sizeof(trans->journal_res));
 
-       if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
-               u64s = 0;
-               trans_for_each_entry(trans, i)
-                       u64s += jset_u64s(i->k->k.u64s);
-
-               while ((ret = bch2_journal_res_get(&c->journal,
-                                       &trans->journal_res, u64s,
-                                       JOURNAL_RES_GET_NONBLOCK)) == -EAGAIN) {
-                       struct btree_iter *iter = NULL;
-
-                       trans_for_each_iter(trans, i)
-                               iter = i->iter;
-
-                       if (iter)
-                               bch2_btree_iter_unlock(iter);
-
-                       ret = bch2_journal_res_get(&c->journal,
-                                       &trans->journal_res, u64s,
-                                       JOURNAL_RES_GET_CHECK);
-                       if (ret)
-                               return ret;
-
-                       if (iter && !bch2_btree_iter_relock(iter)) {
-                               trans_restart(" (iter relock after journal res get blocked)");
-                               return -EINTR;
-                       }
-               }
-
-               if (ret)
-                       return ret;
-       }
-
        multi_lock_write(c, trans);
 
        if (race_fault()) {
@@ -536,6 +518,36 @@ static inline int do_btree_insert_at(struct btree_insert *trans,
                }
        }
 
+       if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+               u64s = 0;
+               trans_for_each_entry(trans, i)
+                       u64s += jset_u64s(i->k->k.u64s);
+
+               ret = bch2_journal_res_get(&c->journal,
+                               &trans->journal_res, u64s,
+                               JOURNAL_RES_GET_NONBLOCK);
+               if (likely(!ret))
+                       goto got_journal_res;
+               if (ret != -EAGAIN)
+                       goto out;
+
+               multi_unlock_write(trans);
+               btree_trans_unlock(trans);
+
+               ret = bch2_journal_res_get(&c->journal,
+                               &trans->journal_res, u64s,
+                               JOURNAL_RES_GET_CHECK);
+               if (ret)
+                       return ret;
+
+               if (!btree_trans_relock(trans)) {
+                       trans_restart(" (iter relock after journal res get blocked)");
+                       return -EINTR;
+               }
+
+               goto retry;
+       }
+got_journal_res:
        if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
                if (journal_seq_verify(c))
                        trans_for_each_entry(trans, i)
@@ -623,6 +635,9 @@ int __bch2_btree_insert_at(struct btree_insert *trans)
        /* for the sake of sanity: */
        BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC));
 
+       if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
+               lockdep_assert_held(&c->gc_lock);
+
        bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
 
        trans_for_each_entry(trans, i)
@@ -715,18 +730,6 @@ err:
                        ret = -EINTR;
                }
                break;
-       case BTREE_INSERT_NEED_GC_LOCK:
-               ret = -EINTR;
-
-               if (!down_read_trylock(&c->gc_lock)) {
-                       if (flags & BTREE_INSERT_NOUNLOCK)
-                               goto out;
-
-                       bch2_btree_iter_unlock(trans->entries[0].iter);
-                       down_read(&c->gc_lock);
-               }
-               up_read(&c->gc_lock);
-               break;
        case BTREE_INSERT_ENOSPC:
                ret = -ENOSPC;
                break;
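
The do_btree_insert_at() rework above changes the ordering: node write locks are taken first, then a non-blocking journal reservation is attempted; only if that would block are all locks dropped to wait for journal space, after which the iterators are relocked and the whole step retried. A sketch of that retry shape, with stub functions standing in for the bcachefs calls (one simulated -EAGAIN, then success):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

static int attempts;

/* stand-ins for the bcachefs calls; only the control flow matters */
static int journal_res_get_nonblock(void)
{
	return attempts++ == 0 ? -EAGAIN : 0;	/* journal full once */
}
static int  journal_res_wait(void)   { return 0; }	/* may sleep */
static void multi_lock_write(void)   { }
static void multi_unlock_write(void) { }
static void trans_unlock(void)       { }
static bool trans_relock(void)       { return true; }	/* false => restart */

static int do_insert(void)
{
	int ret;
retry:
	multi_lock_write();

	ret = journal_res_get_nonblock();
	if (ret == -EAGAIN) {
		multi_unlock_write();
		trans_unlock();

		ret = journal_res_wait();
		if (ret)
			return ret;

		if (!trans_relock())
			return -EINTR;	/* caller restarts the transaction */
		goto retry;
	}
	if (ret) {
		multi_unlock_write();
		return ret;
	}

	/* ... do the actual insert under the write locks ... */
	multi_unlock_write();
	return 0;
}

int main(void)
{
	int ret = do_insert();

	printf("do_insert() = %d after %d reservation attempts\n",
	       ret, attempts);
	return 0;
}
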
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 9f4872a9be18d67e72d5dece13b8d6bc94383ed7..377a8b0f7f7dab34d91e5ea6290c7251f7d48a44 100644
@@ -116,14 +116,14 @@ void bch2_bucket_seq_cleanup(struct bch_fs *c)
 void bch2_fs_usage_initialize(struct bch_fs *c)
 {
        struct bch_fs_usage *usage;
-       unsigned i, nr;
+       unsigned i;
 
        percpu_down_write(&c->mark_lock);
-       nr = sizeof(struct bch_fs_usage) / sizeof(u64) + c->replicas.nr;
-       usage = (void *) bch2_acc_percpu_u64s((void *) c->usage[0], nr);
+       usage = (void *) bch2_acc_percpu_u64s((void *) c->usage[0],
+                                             fs_usage_u64s(c));
 
        for (i = 0; i < BCH_REPLICAS_MAX; i++)
-               usage->s.reserved += usage->persistent_reserved[i];
+               usage->reserved += usage->persistent_reserved[i];
 
        for (i = 0; i < c->replicas.nr; i++) {
                struct bch_replicas_entry *e =
@@ -132,10 +132,10 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
                switch (e->data_type) {
                case BCH_DATA_BTREE:
                case BCH_DATA_USER:
-                       usage->s.data   += usage->data[i];
+                       usage->data     += usage->replicas[i];
                        break;
                case BCH_DATA_CACHED:
-                       usage->s.cached += usage->data[i];
+                       usage->cached   += usage->replicas[i];
                        break;
                }
        }
@@ -143,44 +143,38 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
        percpu_up_write(&c->mark_lock);
 }
 
-#define bch2_usage_read_raw(_stats)                                    \
-({                                                                     \
-       typeof(*this_cpu_ptr(_stats)) _acc;                             \
-                                                                       \
-       memset(&_acc, 0, sizeof(_acc));                                 \
-       acc_u64s_percpu((u64 *) &_acc,                                  \
-                       (u64 __percpu *) _stats,                        \
-                       sizeof(_acc) / sizeof(u64));                    \
-                                                                       \
-       _acc;                                                           \
-})
-
 struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
 {
-       return bch2_usage_read_raw(ca->usage[0]);
+       struct bch_dev_usage ret;
+
+       memset(&ret, 0, sizeof(ret));
+       acc_u64s_percpu((u64 *) &ret,
+                       (u64 __percpu *) ca->usage[0],
+                       sizeof(ret) / sizeof(u64));
+
+       return ret;
 }
 
 struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c)
 {
        struct bch_fs_usage *ret;
-       unsigned nr = READ_ONCE(c->replicas.nr);
+       unsigned v, u64s = fs_usage_u64s(c);
 retry:
-       ret = kzalloc(sizeof(*ret) + nr * sizeof(u64), GFP_NOFS);
+       ret = kzalloc(u64s * sizeof(u64), GFP_NOFS);
        if (unlikely(!ret))
                return NULL;
 
        percpu_down_read_preempt_disable(&c->mark_lock);
 
-       if (unlikely(nr < c->replicas.nr)) {
-               nr = c->replicas.nr;
+       v = fs_usage_u64s(c);
+       if (unlikely(u64s != v)) {
+               u64s = v;
                percpu_up_read_preempt_enable(&c->mark_lock);
                kfree(ret);
                goto retry;
        }
 
-       acc_u64s_percpu((u64 *) ret,
-                       (u64 __percpu *) c->usage[0],
-                       sizeof(*ret) / sizeof(u64) + nr);
+       acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s);
 
        return ret;
 }
@@ -197,27 +191,44 @@ static u64 avail_factor(u64 r)
        return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
 }
 
-u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage)
+u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage)
 {
-       return min(fs_usage.s.hidden +
-                  fs_usage.s.data +
-                  reserve_factor(fs_usage.s.reserved +
-                                 fs_usage.s.online_reserved),
+       return min(fs_usage->hidden +
+                  fs_usage->data +
+                  reserve_factor(fs_usage->reserved +
+                                 fs_usage->online_reserved),
                   c->capacity);
 }
 
+static struct bch_fs_usage_short
+__bch2_fs_usage_read_short(struct bch_fs *c)
+{
+       struct bch_fs_usage_short ret;
+       u64 data, reserved;
+
+       ret.capacity = c->capacity -
+               percpu_u64_get(&c->usage[0]->hidden);
+
+       data            = percpu_u64_get(&c->usage[0]->data);
+       reserved        = percpu_u64_get(&c->usage[0]->reserved) +
+               percpu_u64_get(&c->usage[0]->online_reserved);
+
+       ret.used        = min(ret.capacity, data + reserve_factor(reserved));
+       ret.free        = ret.capacity - ret.used;
+
+       ret.nr_inodes   = percpu_u64_get(&c->usage[0]->nr_inodes);
+
+       return ret;
+}
+
 struct bch_fs_usage_short
 bch2_fs_usage_read_short(struct bch_fs *c)
 {
-       struct bch_fs_usage_summarized usage =
-               bch2_usage_read_raw(&c->usage[0]->s);
        struct bch_fs_usage_short ret;
 
-       ret.capacity    = READ_ONCE(c->capacity) - usage.hidden;
-       ret.used        = min(ret.capacity, usage.data +
-                             reserve_factor(usage.reserved +
-                                            usage.online_reserved));
-       ret.nr_inodes   = usage.nr_inodes;
+       percpu_down_read_preempt_disable(&c->mark_lock);
+       ret = __bch2_fs_usage_read_short(c);
+       percpu_up_read_preempt_enable(&c->mark_lock);
 
        return ret;
 }
@@ -254,10 +265,9 @@ static bool bucket_became_unavailable(struct bucket_mark old,
 
 int bch2_fs_usage_apply(struct bch_fs *c,
                        struct bch_fs_usage *fs_usage,
-                       struct disk_reservation *disk_res,
-                       struct gc_pos gc_pos)
+                       struct disk_reservation *disk_res)
 {
-       s64 added = fs_usage->s.data + fs_usage->s.reserved;
+       s64 added = fs_usage->data + fs_usage->reserved;
        s64 should_not_have_added;
        int ret = 0;
 
@@ -277,19 +287,11 @@ int bch2_fs_usage_apply(struct bch_fs *c,
 
        if (added > 0) {
                disk_res->sectors               -= added;
-               fs_usage->s.online_reserved     -= added;
+               fs_usage->online_reserved       -= added;
        }
 
        acc_u64s((u64 *) this_cpu_ptr(c->usage[0]),
-                (u64 *) fs_usage,
-                sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
-
-       if (gc_visited(c, gc_pos)) {
-               BUG_ON(!c->usage[1]);
-               acc_u64s((u64 *) this_cpu_ptr(c->usage[1]),
-                        (u64 *) fs_usage,
-                        sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
-       }
+                (u64 *) fs_usage, fs_usage_u64s(c));
 
        return ret;
 }
@@ -300,7 +302,7 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage,
                                  int nr, s64 size)
 {
        if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL)
-               fs_usage->s.hidden      += size;
+               fs_usage->hidden        += size;
 
        dev_usage->buckets[type]        += nr;
 }
@@ -384,10 +386,10 @@ static inline void update_replicas(struct bch_fs *c,
        BUG_ON(!sectors);
 
        if (r->data_type == BCH_DATA_CACHED)
-               fs_usage->s.cached      += sectors;
+               fs_usage->cached        += sectors;
        else
-               fs_usage->s.data        += sectors;
-       fs_usage->data[idx]             += sectors;
+               fs_usage->data          += sectors;
+       fs_usage->replicas[idx]         += sectors;
 }
 
 static inline void update_cached_sectors(struct bch_fs *c,
@@ -401,15 +403,28 @@ static inline void update_cached_sectors(struct bch_fs *c,
        update_replicas(c, fs_usage, &r.e, sectors);
 }
 
-static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
-                                    size_t b, struct bucket_mark *old,
-                                    bool gc)
+#define do_mark_fn(fn, c, pos, flags, ...)                             \
+({                                                                     \
+       int gc, ret = 0;                                                \
+                                                                       \
+       percpu_rwsem_assert_held(&c->mark_lock);                        \
+                                                                       \
+       for (gc = 0; gc < 2 && !ret; gc++)                              \
+               if (!gc == !(flags & BCH_BUCKET_MARK_GC) ||             \
+                   (gc && gc_visited(c, pos)))                         \
+                       ret = fn(c, __VA_ARGS__, gc);                   \
+       ret;                                                            \
+})
+
+static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
+                                   size_t b, struct bucket_mark *ret,
+                                   bool gc)
 {
        struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
        struct bucket *g = __bucket(ca, b, gc);
-       struct bucket_mark new;
+       struct bucket_mark old, new;
 
-       *old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
+       old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
                BUG_ON(!is_available_bucket(new));
 
                new.owned_by_allocator  = true;
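
do_mark_fn(), introduced in the hunk above, runs a mark function against the live bucket state when called from the normal path, and additionally against the GC copy when GC is running and has already visited this position. A user-space sketch of just that dispatch (GNU statement expression, as in the original; all names illustrative):

#include <stdbool.h>
#include <stdio.h>

#define MARK_GC	(1 << 0)

/* the two copies of bucket state: index 0 live, index 1 GC */
static int buckets[2];
static bool gc_running = true;	/* assume GC has reached this position */

static int mark_one(int delta, bool gc)
{
	buckets[gc] += delta;
	return 0;
}

/* run fn against the copy the caller asked for, and against the GC
 * copy as well when GC would otherwise miss this update */
#define do_mark(fn, flags, ...)						\
({									\
	int _gc, _ret = 0;						\
	for (_gc = 0; _gc < 2 && !_ret; _gc++)				\
		if (!_gc == !((flags) & MARK_GC) ||			\
		    (_gc && gc_running))				\
			_ret = fn(__VA_ARGS__, _gc);			\
	_ret;								\
})

int main(void)
{
	do_mark(mark_one, 0, 10);	/* normal path: live copy + GC copy */
	printf("live=%d gc=%d\n", buckets[0], buckets[1]);
	return 0;
}
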
@@ -420,26 +435,29 @@ static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
                new.gen++;
        }));
 
-       if (old->cached_sectors)
+       if (old.cached_sectors)
                update_cached_sectors(c, fs_usage, ca->dev_idx,
-                                     -old->cached_sectors);
+                                     -((s64) old.cached_sectors));
+
+       if (!gc)
+               *ret = old;
+       return 0;
 }
 
 void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
                            size_t b, struct bucket_mark *old)
 {
-       percpu_rwsem_assert_held(&c->mark_lock);
-
-       __bch2_invalidate_bucket(c, ca, b, old, false);
+       do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0,
+                  ca, b, old);
 
        if (!old->owned_by_allocator && old->cached_sectors)
                trace_invalidate(ca, bucket_to_sector(ca, b),
                                 old->cached_sectors);
 }
 
-static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
-                                    size_t b, bool owned_by_allocator,
-                                    bool gc)
+static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
+                                   size_t b, bool owned_by_allocator,
+                                   bool gc)
 {
        struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
        struct bucket *g = __bucket(ca, b, gc);
@@ -451,20 +469,70 @@ static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
 
        BUG_ON(!gc &&
               !owned_by_allocator && !old.owned_by_allocator);
+
+       return 0;
 }
 
 void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
                            size_t b, bool owned_by_allocator,
                            struct gc_pos pos, unsigned flags)
 {
-       percpu_rwsem_assert_held(&c->mark_lock);
+       do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags,
+                  ca, b, owned_by_allocator);
+}
 
-       if (!(flags & BCH_BUCKET_MARK_GC))
-               __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, false);
+static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
+                          bool inserting,
+                          struct bch_fs_usage *fs_usage,
+                          unsigned journal_seq, unsigned flags,
+                          bool gc)
+{
+       struct bkey_alloc_unpacked u;
+       struct bch_dev *ca;
+       struct bucket *g;
+       struct bucket_mark old, m;
+
+       if (!inserting)
+               return 0;
+
+       /*
+        * alloc btree is read in by bch2_alloc_read, not gc:
+        */
+       if (flags & BCH_BUCKET_MARK_GC)
+               return 0;
+
+       u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v);
+       ca = bch_dev_bkey_exists(c, k.k->p.inode);
+       g = __bucket(ca, k.k->p.offset, gc);
+
+       /*
+        * this should currently only be getting called from the bucket
+        * invalidate path:
+        */
+       BUG_ON(u.dirty_sectors);
+       BUG_ON(u.cached_sectors);
+       BUG_ON(!g->mark.owned_by_allocator);
+
+       old = bucket_data_cmpxchg(c, ca, fs_usage, g, m, ({
+               m.gen                   = u.gen;
+               m.data_type             = u.data_type;
+               m.dirty_sectors         = u.dirty_sectors;
+               m.cached_sectors        = u.cached_sectors;
+       }));
 
-       if ((flags & BCH_BUCKET_MARK_GC) ||
-           gc_visited(c, pos))
-               __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, true);
+       g->io_time[READ]        = u.read_time;
+       g->io_time[WRITE]       = u.write_time;
+       g->oldest_gen           = u.oldest_gen;
+       g->gen_valid            = 1;
+
+       if (old.cached_sectors) {
+               update_cached_sectors(c, fs_usage, ca->dev_idx,
+                                     -old.cached_sectors);
+               trace_invalidate(ca, bucket_to_sector(ca, k.k->p.offset),
+                                old.cached_sectors);
+       }
+
+       return 0;
 }
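
The do_mark_fn() helper that the converted wrappers above now call is defined earlier in buckets.c and never appears in this diff. Judging from its call sites and from the open-coded dispatch it replaces (still visible below in the deleted bodies of bch2_mark_metadata_bucket() and bch2_mark_key_locked()), it is roughly the following macro; this is a sketch reconstructed from those call sites, not the verbatim definition:

    #define do_mark_fn(fn, c, pos, flags, ...)                          \
    ({                                                                  \
            int gc, ret = 0;                                            \
                                                                        \
            percpu_rwsem_assert_held(&c->mark_lock);                    \
                                                                        \
            /* pass 0 updates the live counters, pass 1 gc's copy */    \
            for (gc = 0; gc < 2 && !ret; gc++)                          \
                    if (!gc == !(flags & BCH_BUCKET_MARK_GC) ||         \
                        (gc && gc_visited(c, pos)))                     \
                            ret = fn(c, __VA_ARGS__, !!gc);             \
            ret;                                                        \
    })

This would also explain why the per-wrapper percpu_rwsem_assert_held() calls are deleted: the assertion moves into the one shared dispatcher.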
 
 #define checked_add(a, b)                                      \
@@ -474,9 +542,9 @@ do {                                                                \
        BUG_ON((a) != _res);                                    \
 } while (0)
 
-static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
-                                       size_t b, enum bch_data_type type,
-                                       unsigned sectors, bool gc)
+static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
+                                      size_t b, enum bch_data_type type,
+                                      unsigned sectors, bool gc)
 {
        struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
        struct bucket *g = __bucket(ca, b, gc);
@@ -490,6 +558,8 @@ static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
                new.data_type   = type;
                checked_add(new.dirty_sectors, sectors);
        }));
+
+       return 0;
 }
 
 void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
@@ -501,15 +571,8 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
               type != BCH_DATA_JOURNAL);
 
        if (likely(c)) {
-               percpu_rwsem_assert_held(&c->mark_lock);
-
-               if (!(flags & BCH_BUCKET_MARK_GC))
-                       __bch2_mark_metadata_bucket(c, ca, b, type, sectors,
-                                                   false);
-               if ((flags & BCH_BUCKET_MARK_GC) ||
-                   gc_visited(c, pos))
-                       __bch2_mark_metadata_bucket(c, ca, b, type, sectors,
-                                                   true);
+               do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags,
+                          ca, b, type, sectors);
        } else {
                struct bucket *g;
                struct bucket_mark new;
@@ -553,7 +616,7 @@ static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p,
  * loop, to avoid racing with the start of gc clearing all the marks - GC does
  * that with the gc pos seqlock held.
  */
-static void bch2_mark_pointer(struct bch_fs *c,
+static bool bch2_mark_pointer(struct bch_fs *c,
                              struct extent_ptr_decoded p,
                              s64 sectors, enum bch_data_type data_type,
                              struct bch_fs_usage *fs_usage,
@@ -581,7 +644,7 @@ static void bch2_mark_pointer(struct bch_fs *c,
                        BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags));
                        EBUG_ON(!p.ptr.cached &&
                                test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
-                       return;
+                       return true;
                }
 
                if (!p.ptr.cached)
@@ -612,6 +675,8 @@ static void bch2_mark_pointer(struct bch_fs *c,
        bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);
 
        BUG_ON(!gc && bucket_became_unavailable(old, new));
+
+       return false;
 }
 
 static int bch2_mark_stripe_ptr(struct bch_fs *c,
@@ -694,13 +759,13 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
                s64 disk_sectors = data_type == BCH_DATA_BTREE
                        ? sectors
                        : ptr_disk_sectors_delta(p, sectors);
-
-               bch2_mark_pointer(c, p, disk_sectors, data_type,
-                                 fs_usage, journal_seq, flags, gc);
+               bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type,
+                                       fs_usage, journal_seq, flags, gc);
 
                if (p.ptr.cached) {
-                       update_cached_sectors(c, fs_usage, p.ptr.dev,
-                                             disk_sectors);
+                       if (disk_sectors && !stale)
+                               update_cached_sectors(c, fs_usage, p.ptr.dev,
+                                                     disk_sectors);
                } else if (!p.ec_nr) {
                        dirty_sectors          += disk_sectors;
                        r.e.devs[r.e.nr_devs++] = p.ptr.dev;
@@ -826,30 +891,31 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
                           unsigned journal_seq, unsigned flags,
                           bool gc)
 {
-       int ret = 0;
+       if (!fs_usage || gc)
+               fs_usage = this_cpu_ptr(c->usage[gc]);
 
        switch (k.k->type) {
+       case KEY_TYPE_alloc:
+               return bch2_mark_alloc(c, k, inserting,
+                               fs_usage, journal_seq, flags, gc);
        case KEY_TYPE_btree_ptr:
-               ret = bch2_mark_extent(c, k, inserting
-                                      ?  c->opts.btree_node_size
-                                      : -c->opts.btree_node_size,
-                                      BCH_DATA_BTREE,
-                                      fs_usage, journal_seq, flags, gc);
-               break;
+               return bch2_mark_extent(c, k, inserting
+                               ?  c->opts.btree_node_size
+                               : -c->opts.btree_node_size,
+                               BCH_DATA_BTREE,
+                               fs_usage, journal_seq, flags, gc);
        case KEY_TYPE_extent:
-               ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
-                                      fs_usage, journal_seq, flags, gc);
-               break;
+               return bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
+                               fs_usage, journal_seq, flags, gc);
        case KEY_TYPE_stripe:
-               ret = bch2_mark_stripe(c, k, inserting,
-                                      fs_usage, journal_seq, flags, gc);
-               break;
+               return bch2_mark_stripe(c, k, inserting,
+                               fs_usage, journal_seq, flags, gc);
        case KEY_TYPE_inode:
                if (inserting)
-                       fs_usage->s.nr_inodes++;
+                       fs_usage->nr_inodes++;
                else
-                       fs_usage->s.nr_inodes--;
-               break;
+                       fs_usage->nr_inodes--;
+               return 0;
        case KEY_TYPE_reservation: {
                unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
 
@@ -857,15 +923,13 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
                replicas = clamp_t(unsigned, replicas, 1,
                                   ARRAY_SIZE(fs_usage->persistent_reserved));
 
-               fs_usage->s.reserved                            += sectors;
+               fs_usage->reserved                              += sectors;
                fs_usage->persistent_reserved[replicas - 1]     += sectors;
-               break;
+               return 0;
        }
        default:
-               break;
+               return 0;
        }
-
-       return ret;
 }
 
 int bch2_mark_key_locked(struct bch_fs *c,
@@ -875,26 +939,9 @@ int bch2_mark_key_locked(struct bch_fs *c,
                   struct bch_fs_usage *fs_usage,
                   u64 journal_seq, unsigned flags)
 {
-       int ret;
-
-       if (!(flags & BCH_BUCKET_MARK_GC)) {
-               ret = __bch2_mark_key(c, k, inserting, sectors,
-                                     fs_usage ?: this_cpu_ptr(c->usage[0]),
-                                     journal_seq, flags, false);
-               if (ret)
-                       return ret;
-       }
-
-       if ((flags & BCH_BUCKET_MARK_GC) ||
-           gc_visited(c, pos)) {
-               ret = __bch2_mark_key(c, k, inserting, sectors,
-                                     this_cpu_ptr(c->usage[1]),
-                                     journal_seq, flags, true);
-               if (ret)
-                       return ret;
-       }
-
-       return 0;
+       return do_mark_fn(__bch2_mark_key, c, pos, flags,
+                         k, inserting, sectors, fs_usage,
+                         journal_seq, flags);
 }
 
 int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
@@ -932,7 +979,7 @@ void bch2_mark_update(struct btree_insert *trans,
        percpu_down_read_preempt_disable(&c->mark_lock);
        fs_usage = bch2_fs_usage_get_scratch(c);
 
-       if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+       if (!(trans->flags & BTREE_INSERT_NOMARK))
                bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
                        bpos_min(insert->k->k.p, b->key.k.p).offset -
                        bkey_start_offset(&insert->k->k),
@@ -985,7 +1032,7 @@ void bch2_mark_update(struct btree_insert *trans,
                bch2_btree_node_iter_advance(&node_iter, b);
        }
 
-       if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos) &&
+       if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res) &&
            !warned_disk_usage &&
            !xchg(&warned_disk_usage, 1)) {
                char buf[200];
@@ -1026,13 +1073,13 @@ static u64 bch2_recalc_sectors_available(struct bch_fs *c)
 {
        percpu_u64_set(&c->pcpu->sectors_available, 0);
 
-       return avail_factor(bch2_fs_sectors_free(c));
+       return avail_factor(__bch2_fs_usage_read_short(c).free);
 }
 
 void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
 {
        percpu_down_read_preempt_disable(&c->mark_lock);
-       this_cpu_sub(c->usage[0]->s.online_reserved,
+       this_cpu_sub(c->usage[0]->online_reserved,
                     res->sectors);
        percpu_up_read_preempt_enable(&c->mark_lock);
 
@@ -1071,38 +1118,22 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
 
 out:
        pcpu->sectors_available         -= sectors;
-       this_cpu_add(c->usage[0]->s.online_reserved, sectors);
+       this_cpu_add(c->usage[0]->online_reserved, sectors);
        res->sectors                    += sectors;
 
        percpu_up_read_preempt_enable(&c->mark_lock);
        return 0;
 
 recalculate:
-       /*
-        * GC recalculates sectors_available when it starts, so that hopefully
-        * we don't normally end up blocking here:
-        */
-
-       /*
-        * Piss fuck, we can be called from extent_insert_fixup() with btree
-        * locks held:
-        */
-
-       if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) {
-               if (!(flags & BCH_DISK_RESERVATION_BTREE_LOCKS_HELD))
-                       down_read(&c->gc_lock);
-               else if (!down_read_trylock(&c->gc_lock))
-                       return -EINTR;
-       }
-
        percpu_down_write(&c->mark_lock);
+
        sectors_available = bch2_recalc_sectors_available(c);
 
        if (sectors <= sectors_available ||
            (flags & BCH_DISK_RESERVATION_NOFAIL)) {
                atomic64_set(&c->sectors_available,
                             max_t(s64, 0, sectors_available - sectors));
-               this_cpu_add(c->usage[0]->s.online_reserved, sectors);
+               this_cpu_add(c->usage[0]->online_reserved, sectors);
                res->sectors                    += sectors;
                ret = 0;
        } else {
@@ -1112,9 +1143,6 @@ recalculate:
 
        percpu_up_write(&c->mark_lock);
 
-       if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD))
-               up_read(&c->gc_lock);
-
        return ret;
 }
 
@@ -1135,7 +1163,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
        struct bucket_array *buckets = NULL, *old_buckets = NULL;
        unsigned long *buckets_nouse = NULL;
        unsigned long *buckets_written = NULL;
-       u8 *oldest_gens = NULL;
        alloc_fifo      free[RESERVE_NR];
        alloc_fifo      free_inc;
        alloc_heap      alloc_heap;
@@ -1161,8 +1188,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
        if (!(buckets           = kvpmalloc(sizeof(struct bucket_array) +
                                            nbuckets * sizeof(struct bucket),
                                            GFP_KERNEL|__GFP_ZERO)) ||
-           !(oldest_gens       = kvpmalloc(nbuckets * sizeof(u8),
-                                           GFP_KERNEL|__GFP_ZERO)) ||
            !(buckets_nouse     = kvpmalloc(BITS_TO_LONGS(nbuckets) *
                                            sizeof(unsigned long),
                                            GFP_KERNEL|__GFP_ZERO)) ||
@@ -1197,9 +1222,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
                memcpy(buckets->b,
                       old_buckets->b,
                       n * sizeof(struct bucket));
-               memcpy(oldest_gens,
-                      ca->oldest_gens,
-                      n * sizeof(u8));
                memcpy(buckets_nouse,
                       ca->buckets_nouse,
                       BITS_TO_LONGS(n) * sizeof(unsigned long));
@@ -1211,7 +1233,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
        rcu_assign_pointer(ca->buckets[0], buckets);
        buckets = old_buckets;
 
-       swap(ca->oldest_gens, oldest_gens);
        swap(ca->buckets_nouse, buckets_nouse);
        swap(ca->buckets_written, buckets_written);
 
@@ -1255,8 +1276,6 @@ err:
                BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
        kvpfree(buckets_written,
                BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
-       kvpfree(oldest_gens,
-               nbuckets * sizeof(u8));
        if (buckets)
                call_rcu(&old_buckets->rcu, buckets_free_rcu);
 
@@ -1276,7 +1295,6 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
                BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
        kvpfree(ca->buckets_nouse,
                BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
-       kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8));
        kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
                sizeof(struct bucket_array) +
                ca->mi.nbuckets * sizeof(struct bucket));
index 19cf652570a2d093dbc82c9b2728442a71fc51b4..0725aa94428b005413406d671ac506fa6471e596 100644 (file)
 
 #define bucket_cmpxchg(g, new, expr)                           \
 ({                                                             \
+       struct bucket *_g = g;                                  \
        u64 _v = atomic64_read(&(g)->_mark.v);                  \
        struct bucket_mark _old;                                \
                                                                \
        do {                                                    \
                (new).v.counter = _old.v.counter = _v;          \
                expr;                                           \
-       } while ((_v = atomic64_cmpxchg(&(g)->_mark.v,          \
+       } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v,         \
                               _old.v.counter,                  \
                               (new).v.counter)) != _old.v.counter);\
        _old;                                                   \
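
The new _g temporary makes bucket_cmpxchg() evaluate its bucket argument only once in the cmpxchg itself (the initial atomic64_read() still expands (g) a second time, which is harmless for side-effect-free arguments). The loop is the standard compare-and-swap retry pattern; here is a runnable user-space analogue using C11 atomics, with an invented 64-bit mark layout standing in for struct bucket_mark:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* Invented stand-in for struct bucket_mark; must fit in 64 bits. */
    union mark {
            struct { uint8_t gen; bool dirty; } m;
            uint64_t v;
    };

    /* Atomically set m.dirty, retrying until no update races with ours. */
    static union mark mark_set_dirty(_Atomic uint64_t *p)
    {
            union mark old, new;

            old.v = atomic_load(p);
            do {
                    new = old;
                    new.m.dirty = true;
            } while (!atomic_compare_exchange_weak(p, &old.v, new.v));

            return old;     /* pre-update value, as bucket_cmpxchg() returns _old */
    }

On failure, atomic_compare_exchange_weak() reloads old.v with the current value, so the loop always retries against fresh state, exactly like the kernel macro.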
@@ -56,18 +57,6 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b)
        return __bucket(ca, b, false);
 }
 
-static inline void bucket_set_dirty(struct bch_dev *ca, size_t b)
-{
-       struct bucket *g;
-       struct bucket_mark m;
-
-       rcu_read_lock();
-       g = bucket(ca, b);
-       bucket_cmpxchg(g, m, m.dirty = true);
-       rcu_read_unlock();
-
-}
-
 static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca,
                                         size_t b, int rw)
 {
@@ -86,7 +75,9 @@ static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw)
 
 static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b)
 {
-       return bucket(ca, b)->mark.gen - ca->oldest_gens[b];
+       struct bucket *g = bucket(ca, b);
+
+       return g->mark.gen - g->oldest_gen;
 }
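
With oldest_gen folded into struct bucket (see the buckets_types.h hunk below), the separate ca->oldest_gens array and all its bookkeeping in bch2_dev_buckets_resize()/bch2_dev_buckets_free() above can go. Note the subtraction stays in u8: generation counters wrap, and unsigned subtraction remains correct modulo 256, as this self-contained check illustrates:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
            uint8_t mark_gen = 2, oldest_gen = 250; /* counter wrapped past 255 */

            /* 2 - 250 == -248 == 8 (mod 256): the gens are 8 apart */
            assert((uint8_t)(mark_gen - oldest_gen) == 8);
            return 0;
    }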
 
 static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
@@ -96,9 +87,10 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
 }
 
 static inline struct bucket *PTR_BUCKET(struct bch_dev *ca,
-                                       const struct bch_extent_ptr *ptr)
+                                       const struct bch_extent_ptr *ptr,
+                                       bool gc)
 {
-       return bucket(ca, PTR_BUCKET_NR(ca, ptr));
+       return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc);
 }
 
 static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca,
@@ -219,31 +211,28 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
 
 /* Filesystem usage: */
 
-static inline struct bch_fs_usage *bch2_fs_usage_get_scratch(struct bch_fs *c)
+static inline unsigned fs_usage_u64s(struct bch_fs *c)
 {
-       struct bch_fs_usage *ret;
 
-       ret = this_cpu_ptr(c->usage_scratch);
+       return sizeof(struct bch_fs_usage) / sizeof(u64) +
+               READ_ONCE(c->replicas.nr);
+}
 
-       memset(ret, 0, sizeof(*ret) + c->replicas.nr * sizeof(u64));
+static inline struct bch_fs_usage *bch2_fs_usage_get_scratch(struct bch_fs *c)
+{
+       struct bch_fs_usage *ret = this_cpu_ptr(c->usage_scratch);
 
+       memset(ret, 0, fs_usage_u64s(c) * sizeof(u64));
        return ret;
 }
 
 struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *);
 
-u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
+u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage *);
 
 struct bch_fs_usage_short
 bch2_fs_usage_read_short(struct bch_fs *);
 
-static inline u64 bch2_fs_sectors_free(struct bch_fs *c)
-{
-       struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c);
-
-       return usage.capacity - usage.used;
-}
-
 /* key/bucket marking: */
 
 void bch2_bucket_seq_cleanup(struct bch_fs *);
@@ -257,8 +246,8 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
                               size_t, enum bch_data_type, unsigned,
                               struct gc_pos, unsigned);
 
-#define BCH_BUCKET_MARK_NOATOMIC               (1 << 0)
-#define BCH_BUCKET_MARK_GC                     (1 << 1)
+#define BCH_BUCKET_MARK_GC                     (1 << 0)
+#define BCH_BUCKET_MARK_NOATOMIC               (1 << 1)
 
 int bch2_mark_key_locked(struct bch_fs *, struct bkey_s_c,
                  bool, s64, struct gc_pos,
@@ -268,7 +257,7 @@ int bch2_mark_key(struct bch_fs *, struct bkey_s_c,
                  struct bch_fs_usage *, u64, unsigned);
 void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *);
 int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
-                       struct disk_reservation *, struct gc_pos);
+                       struct disk_reservation *);
 
 /* disk reservations: */
 
@@ -282,8 +271,6 @@ static inline void bch2_disk_reservation_put(struct bch_fs *c,
 }
 
 #define BCH_DISK_RESERVATION_NOFAIL            (1 << 0)
-#define BCH_DISK_RESERVATION_GC_LOCK_HELD      (1 << 1)
-#define BCH_DISK_RESERVATION_BTREE_LOCKS_HELD  (1 << 2)
 
 int bch2_disk_reservation_add(struct bch_fs *,
                             struct disk_reservation *,
index 56863c2371265603a2ae8e3e55a66c5475c1b8cf..869a13146c29b0a58dc57ee3fed05121289694b4 100644 (file)
@@ -38,6 +38,7 @@ struct bucket {
        };
 
        u16                             io_time[2];
+       u8                              oldest_gen;
        unsigned                        gen_valid:1;
 };
 
@@ -62,35 +63,33 @@ struct bch_dev_usage {
 struct bch_fs_usage {
        /* all fields are in units of 512 byte sectors: */
 
-       /* summarized: */
-       struct bch_fs_usage_summarized {
-               u64             online_reserved;
+       u64                     online_reserved;
 
-               /* fields after online_reserved are cleared/recalculated by gc: */
-               u64             gc_start[0];
+       /* fields after online_reserved are cleared/recalculated by gc: */
+       u64                     gc_start[0];
 
-               u64             hidden;
-               u64             data;
-               u64             cached;
-               u64             reserved;
-               u64             nr_inodes;
+       u64                     hidden;
+       u64                     data;
+       u64                     cached;
+       u64                     reserved;
+       u64                     nr_inodes;
 
-               /* XXX: add stats for compression ratio */
+       /* XXX: add stats for compression ratio */
 #if 0
-               u64             uncompressed;
-               u64             compressed;
+       u64                     uncompressed;
+       u64                     compressed;
 #endif
-       } s;
 
        /* broken out: */
 
        u64                     persistent_reserved[BCH_REPLICAS_MAX];
-       u64                     data[];
+       u64                     replicas[];
 };
 
 struct bch_fs_usage_short {
        u64                     capacity;
        u64                     used;
+       u64                     free;
        u64                     nr_inodes;
 };
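
The zero-length gc_start[0] marker survives the flattening of the old summarized sub-struct: everything from it to the end of the (now variable-length) struct is what GC recalculates, so it can all be cleared with a single memset. A sketch of that idiom, assuming the fs_usage_u64s() helper from the buckets.h hunk above:

    /* Sketch: clear every counter gc recomputes, replicas[] included. */
    static void fs_usage_clear_gc_fields(struct bch_fs *c,
                                         struct bch_fs_usage *u)
    {
            u64 *start = u->gc_start;
            u64 *end   = (u64 *) u + fs_usage_u64s(c);

            memset(start, 0, (end - start) * sizeof(u64));
    }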
 
index b84ae5c937e5ebae81c769279adb2d311d25efd3..4e33e7b8187228ede85e76793a2e88d97dd13060 100644 (file)
@@ -402,10 +402,10 @@ static long bch2_ioctl_usage(struct bch_fs *c,
                if (!src)
                        return -ENOMEM;
 
-               percpu_up_read_preempt_enable(&c->mark_lock);
+               dst.used                = bch2_fs_sectors_used(c, src);
+               dst.online_reserved     = src->online_reserved;
 
-               dst.used                = bch2_fs_sectors_used(c, *src);
-               dst.online_reserved     = src->s.online_reserved;
+               percpu_up_read_preempt_enable(&c->mark_lock);
 
                for (i = 0; i < BCH_REPLICAS_MAX; i++) {
                        dst.persistent_reserved[i] =
index 0f075fa1d3600b7cb4153c0c3198b1de3325dcc1..369b100a0a587fa317727e08099703b6eac9addc 100644 (file)
@@ -979,10 +979,8 @@ bch2_extent_can_insert(struct btree_insert *trans,
 
        if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
            (sectors = bch2_extent_is_compressed(k))) {
-               int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
-
-               if (trans->flags & BTREE_INSERT_NOFAIL)
-                       flags |= BCH_DISK_RESERVATION_NOFAIL;
+               int flags = trans->flags & BTREE_INSERT_NOFAIL
+                       ? BCH_DISK_RESERVATION_NOFAIL : 0;
 
                switch (bch2_disk_reservation_add(trans->c,
                                trans->disk_res,
@@ -991,8 +989,6 @@ bch2_extent_can_insert(struct btree_insert *trans,
                        break;
                case -ENOSPC:
                        return BTREE_INSERT_ENOSPC;
-               case -EINTR:
-                       return BTREE_INSERT_NEED_GC_LOCK;
                default:
                        BUG();
                }
index 9715ddbdae562a8365a28f7d46a9b2bd81054f12..0982af022ff9e07c5123bda6292cc8d1164f56d5 100644 (file)
@@ -100,7 +100,7 @@ do {                                                                        \
 ({                                                                     \
        bool _r = !fifo_empty((fifo));                                  \
        if (_r)                                                         \
-               (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]       \
+               (i) = (fifo)->data[--(fifo)->back & (fifo)->mask];      \
        _r;                                                             \
 })
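
The fifo.h hunk is a pure syntax fix: the statement expression was missing a semicolon after the element assignment, so this macro (fifo_pop_back(), judging by the --back index) could not have compiled anywhere it was actually expanded, which suggests it gains its first in-tree user around this commit. Usage follows the usual fifo pattern, something like this hypothetical caller:

    /* Hypothetical: drain a fifo from the back end. */
    struct elem e;

    while (fifo_pop_back(&my_fifo, e))
            process(&e);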
 
index 8ff8cfa8bf7628b6d97dda7ee1367a67a8f68791..f108a28233c779e25011ac289af22832aca045e1 100644 (file)
 
 #include <trace/events/bcachefs.h>
 
-static bool journal_entry_is_open(struct journal *j)
+static bool __journal_entry_is_open(union journal_res_state state)
 {
-       return j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
+       return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
 }
 
-void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set)
+static bool journal_entry_is_open(struct journal *j)
 {
-       struct journal_buf *w = journal_prev_buf(j);
-
-       atomic_dec_bug(&journal_seq_pin(j, le64_to_cpu(w->data->seq))->count);
-
-       if (!need_write_just_set &&
-           test_bit(JOURNAL_NEED_WRITE, &j->flags))
-               bch2_time_stats_update(j->delay_time,
-                                      j->need_write_time);
-
-       closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
+       return __journal_entry_is_open(j->reservations);
 }
 
 static void journal_pin_new_entry(struct journal *j, int count)
@@ -77,39 +68,71 @@ static inline bool journal_entry_empty(struct jset *j)
        return true;
 }
 
-static enum {
-       JOURNAL_ENTRY_ERROR,
-       JOURNAL_ENTRY_INUSE,
-       JOURNAL_ENTRY_CLOSED,
-       JOURNAL_UNLOCKED,
-} journal_buf_switch(struct journal *j, bool need_write_just_set)
+void bch2_journal_halt(struct journal *j)
+{
+       union journal_res_state old, new;
+       u64 v = atomic64_read(&j->reservations.counter);
+
+       do {
+               old.v = new.v = v;
+               if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
+                       return;
+
+               new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL;
+       } while ((v = atomic64_cmpxchg(&j->reservations.counter,
+                                      old.v, new.v)) != old.v);
+
+       journal_wake(j);
+       closure_wake_up(&journal_cur_buf(j)->wait);
+}
+
+/* journal entry close/open: */
+
+void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set)
+{
+       if (!need_write_just_set &&
+           test_bit(JOURNAL_NEED_WRITE, &j->flags))
+               bch2_time_stats_update(j->delay_time,
+                                      j->need_write_time);
+
+       clear_bit(JOURNAL_NEED_WRITE, &j->flags);
+
+       closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
+}
+
+/*
+ * Returns true if journal entry is now closed:
+ */
+static bool __journal_entry_close(struct journal *j)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_buf *buf = journal_cur_buf(j);
        union journal_res_state old, new;
        u64 v = atomic64_read(&j->reservations.counter);
+       bool set_need_write = false;
+       unsigned sectors;
 
        lockdep_assert_held(&j->lock);
 
        do {
                old.v = new.v = v;
                if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL)
-                       return JOURNAL_ENTRY_CLOSED;
+                       return true;
 
                if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) {
                        /* this entry will never be written: */
                        closure_wake_up(&buf->wait);
-                       return JOURNAL_ENTRY_ERROR;
+                       return true;
                }
 
-               if (new.prev_buf_unwritten)
-                       return JOURNAL_ENTRY_INUSE;
+               if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) {
+                       set_bit(JOURNAL_NEED_WRITE, &j->flags);
+                       j->need_write_time = local_clock();
+                       set_need_write = true;
+               }
 
-               /*
-                * avoid race between setting buf->data->u64s and
-                * journal_res_put starting write:
-                */
-               journal_state_inc(&new);
+               if (new.prev_buf_unwritten)
+                       return false;
 
                new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL;
                new.idx++;
@@ -119,15 +142,12 @@ static enum {
        } while ((v = atomic64_cmpxchg(&j->reservations.counter,
                                       old.v, new.v)) != old.v);
 
-       clear_bit(JOURNAL_NEED_WRITE, &j->flags);
-
        buf->data->u64s         = cpu_to_le32(old.cur_entry_offset);
 
-       j->prev_buf_sectors =
-               vstruct_blocks_plus(buf->data, c->block_bits,
-                                   buf->u64s_reserved) *
-               c->opts.block_size;
-       BUG_ON(j->prev_buf_sectors > j->cur_buf_sectors);
+       sectors = vstruct_blocks_plus(buf->data, c->block_bits,
+                                     buf->u64s_reserved) << c->block_bits;
+       BUG_ON(sectors > buf->sectors);
+       buf->sectors = sectors;
 
        bkey_extent_init(&buf->key);
 
@@ -150,7 +170,6 @@ static enum {
         * Hence, we want to update/set last_seq on the current journal entry right
         * before we open a new one:
         */
-       bch2_journal_reclaim_fast(j);
        buf->data->last_seq     = cpu_to_le64(journal_last_seq(j));
 
        if (journal_entry_empty(buf->data))
@@ -163,32 +182,22 @@ static enum {
        bch2_journal_buf_init(j);
 
        cancel_delayed_work(&j->write_work);
-       spin_unlock(&j->lock);
 
-       /* ugh - might be called from __journal_res_get() under wait_event() */
-       __set_current_state(TASK_RUNNING);
-       bch2_journal_buf_put(j, old.idx, need_write_just_set);
+       bch2_journal_space_available(j);
 
-       return JOURNAL_UNLOCKED;
+       bch2_journal_buf_put(j, old.idx, set_need_write);
+       return true;
 }
 
-void bch2_journal_halt(struct journal *j)
+static bool journal_entry_close(struct journal *j)
 {
-       union journal_res_state old, new;
-       u64 v = atomic64_read(&j->reservations.counter);
-
-       do {
-               old.v = new.v = v;
-               if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
-                       return;
+       bool ret;
 
-               new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL;
-       } while ((v = atomic64_cmpxchg(&j->reservations.counter,
-                                      old.v, new.v)) != old.v);
+       spin_lock(&j->lock);
+       ret = __journal_entry_close(j);
+       spin_unlock(&j->lock);
 
-       journal_wake(j);
-       closure_wake_up(&journal_cur_buf(j)->wait);
-       closure_wake_up(&journal_prev_buf(j)->wait);
+       return ret;
 }
 
 /*
@@ -196,46 +205,39 @@ void bch2_journal_halt(struct journal *j)
  * journal reservation - journal entry is open means journal is dirty:
  *
  * returns:
- * 1:          success
- * 0:          journal currently full (must wait)
- * -EROFS:     insufficient rw devices
- * -EIO:       journal error
+ * 0:          success
+ * -ENOSPC:    journal currently full, must invoke reclaim
+ * -EAGAIN:    journal blocked, must wait
+ * -EROFS:     insufficient rw devices or journal error
  */
 static int journal_entry_open(struct journal *j)
 {
        struct journal_buf *buf = journal_cur_buf(j);
        union journal_res_state old, new;
-       ssize_t u64s;
-       int sectors;
+       int u64s;
        u64 v;
 
        lockdep_assert_held(&j->lock);
        BUG_ON(journal_entry_is_open(j));
 
-       if (!fifo_free(&j->pin))
-               return 0;
+       if (j->blocked)
+               return -EAGAIN;
 
-       sectors = bch2_journal_entry_sectors(j);
-       if (sectors <= 0)
-               return sectors;
+       if (j->cur_entry_error)
+               return j->cur_entry_error;
 
-       buf->disk_sectors       = sectors;
-       buf->u64s_reserved      = j->entry_u64s_reserved;
+       BUG_ON(!j->cur_entry_sectors);
 
-       sectors = min_t(unsigned, sectors, buf->size >> 9);
-       j->cur_buf_sectors      = sectors;
-
-       u64s = (sectors << 9) / sizeof(u64);
-
-       /* Subtract the journal header */
-       u64s -= sizeof(struct jset) / sizeof(u64);
-       u64s -= buf->u64s_reserved;
-       u64s  = max_t(ssize_t, 0L, u64s);
+       buf->u64s_reserved      = j->entry_u64s_reserved;
+       buf->disk_sectors       = j->cur_entry_sectors;
+       buf->sectors            = min(buf->disk_sectors, buf->buf_size >> 9);
 
-       BUG_ON(u64s >= JOURNAL_ENTRY_CLOSED_VAL);
+       u64s = (int) (buf->sectors << 9) / sizeof(u64) -
+               journal_entry_overhead(j);
+       u64s  = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1);
 
        if (u64s <= le32_to_cpu(buf->data->u64s))
-               return 0;
+               return -ENOSPC;
 
        /*
         * Must be set before marking the journal entry as open:
@@ -246,11 +248,14 @@ static int journal_entry_open(struct journal *j)
        do {
                old.v = new.v = v;
 
+               EBUG_ON(journal_state_count(new, new.idx));
+
                if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL)
-                       return -EIO;
+                       return -EROFS;
 
                /* Handle any already added entries */
                new.cur_entry_offset = le32_to_cpu(buf->data->u64s);
+               journal_state_inc(&new);
        } while ((v = atomic64_cmpxchg(&j->reservations.counter,
                                       old.v, new.v)) != old.v);
 
@@ -263,37 +268,22 @@ static int journal_entry_open(struct journal *j)
                         &j->write_work,
                         msecs_to_jiffies(j->write_delay_ms));
        journal_wake(j);
-       return 1;
+       return 0;
 }
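
All of the open/close logic above hinges on the cur_entry_offset field of journal_res_state: any ordinary value means the entry is open at that offset, and two sentinel values just past the largest representable offset mean closed or errored, which is exactly what __journal_entry_is_open() tests. Schematically (the real constants live in journal_types.h and are derived from the bitfield width; the values below are illustrative):

    /* Illustrative only; mirrors the journal_types.h convention. */
    #define JOURNAL_ENTRY_OFFSET_MAX        ((1U << 20) - 1)
    #define JOURNAL_ENTRY_CLOSED_VAL        (JOURNAL_ENTRY_OFFSET_MAX - 1)
    #define JOURNAL_ENTRY_ERROR_VAL         JOURNAL_ENTRY_OFFSET_MAX

    static inline bool entry_is_open(unsigned cur_entry_offset)
    {
            return cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
    }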
 
-static bool __journal_entry_close(struct journal *j)
+static bool journal_quiesced(struct journal *j)
 {
-       bool set_need_write;
-
-       if (!journal_entry_is_open(j)) {
-               spin_unlock(&j->lock);
-               return true;
-       }
-
-       set_need_write = !test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags);
-       if (set_need_write)
-               j->need_write_time = local_clock();
+       union journal_res_state state = READ_ONCE(j->reservations);
+       bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state);
 
-       switch (journal_buf_switch(j, set_need_write)) {
-       case JOURNAL_ENTRY_INUSE:
-               spin_unlock(&j->lock);
-               return false;
-       default:
-               spin_unlock(&j->lock);
-       case JOURNAL_UNLOCKED:
-               return true;
-       }
+       if (!ret)
+               journal_entry_close(j);
+       return ret;
 }
 
-static bool journal_entry_close(struct journal *j)
+static void journal_quiesce(struct journal *j)
 {
-       spin_lock(&j->lock);
-       return __journal_entry_close(j);
+       wait_event(j->wait, journal_quiesced(j));
 }
 
 static void journal_write_work(struct work_struct *work)
@@ -337,7 +327,11 @@ retry:
        if (journal_res_get_fast(j, res, flags))
                return 0;
 
+       if (bch2_journal_error(j))
+               return -EROFS;
+
        spin_lock(&j->lock);
+
        /*
         * Recheck after taking the lock, so we don't race with another thread
         * that just did journal_entry_open() and call journal_entry_close()
@@ -355,56 +349,43 @@ retry:
         */
        buf = journal_cur_buf(j);
        if (journal_entry_is_open(j) &&
-           buf->size >> 9 < buf->disk_sectors &&
-           buf->size < JOURNAL_ENTRY_SIZE_MAX)
-               j->buf_size_want = max(j->buf_size_want, buf->size << 1);
+           buf->buf_size >> 9 < buf->disk_sectors &&
+           buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
+               j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
 
-       /*
-        * Close the current journal entry if necessary, then try to start a new
-        * one:
-        */
-       switch (journal_buf_switch(j, false)) {
-       case JOURNAL_ENTRY_ERROR:
-               spin_unlock(&j->lock);
-               return -EROFS;
-       case JOURNAL_ENTRY_INUSE:
+       if (journal_entry_is_open(j) &&
+           !__journal_entry_close(j)) {
                /*
-                * The current journal entry is still open, but we failed to get
-                * a journal reservation because there's not enough space in it,
-                * and we can't close it and start another because we haven't
-                * finished writing out the previous entry:
+                * We failed to get a reservation on the current open journal
+                * entry because it's full, and we can't close it because
+                * there's still a previous one in flight:
                 */
-               spin_unlock(&j->lock);
                trace_journal_entry_full(c);
-               goto blocked;
-       case JOURNAL_ENTRY_CLOSED:
-               break;
-       case JOURNAL_UNLOCKED:
-               goto retry;
+               ret = -EAGAIN;
+       } else {
+               ret = journal_entry_open(j);
        }
 
-       /* We now have a new, closed journal buf - see if we can open it: */
-       ret = journal_entry_open(j);
+       if ((ret == -EAGAIN || ret == -ENOSPC) &&
+           !j->res_get_blocked_start)
+               j->res_get_blocked_start = local_clock() ?: 1;
+
        spin_unlock(&j->lock);
 
-       if (ret < 0)
-               return ret;
-       if (ret)
+       if (!ret)
                goto retry;
+       if (ret == -ENOSPC) {
+               /*
+                * Journal is full - can't rely on reclaim from work item due to
+                * freezing:
+                */
+               trace_journal_full(c);
+               if (!(flags & JOURNAL_RES_GET_NONBLOCK))
+                       bch2_journal_reclaim_work(&j->reclaim_work.work);
+               ret = -EAGAIN;
+       }
 
-       /* Journal's full, we have to wait */
-
-       /*
-        * Direct reclaim - can't rely on reclaim from work item
-        * due to freezing..
-        */
-       bch2_journal_reclaim_work(&j->reclaim_work.work);
-
-       trace_journal_full(c);
-blocked:
-       if (!j->res_get_blocked_start)
-               j->res_get_blocked_start = local_clock() ?: 1;
-       return -EAGAIN;
+       return ret;
 }
 
 /*
@@ -422,7 +403,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
 {
        int ret;
 
-       wait_event(j->wait,
+       closure_wait_event(&j->async_wait,
                   (ret = __journal_res_get(j, res, flags)) != -EAGAIN ||
                   (flags & JOURNAL_RES_GET_NONBLOCK));
        return ret;
@@ -441,9 +422,9 @@ void bch2_journal_entry_res_resize(struct journal *j,
 
        j->entry_u64s_reserved += d;
        if (d <= 0)
-               goto out_unlock;
+               goto out;
 
-       j->cur_entry_u64s -= d;
+       j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d);
        smp_mb();
        state = READ_ONCE(j->reservations);
 
@@ -454,15 +435,12 @@ void bch2_journal_entry_res_resize(struct journal *j,
                 * Not enough room in current journal entry, have to flush it:
                 */
                __journal_entry_close(j);
-               goto out;
+       } else {
+               journal_cur_buf(j)->u64s_reserved += d;
        }
-
-       journal_cur_buf(j)->u64s_reserved += d;
-out_unlock:
-       spin_unlock(&j->lock);
 out:
+       spin_unlock(&j->lock);
        res->u64s += d;
-       return;
 }
 
 /* journal flushing: */
@@ -492,47 +470,47 @@ int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        int ret;
-retry:
+
        spin_lock(&j->lock);
 
-       if (seq < journal_cur_seq(j) ||
+       /*
+        * Can't try to open more than one sequence number ahead:
+        */
+       BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j));
+
+       if (journal_cur_seq(j) > seq ||
            journal_entry_is_open(j)) {
                spin_unlock(&j->lock);
                return 0;
        }
 
-       if (journal_cur_seq(j) < seq) {
-               switch (journal_buf_switch(j, false)) {
-               case JOURNAL_ENTRY_ERROR:
-                       spin_unlock(&j->lock);
-                       return -EROFS;
-               case JOURNAL_ENTRY_INUSE:
-                       /* haven't finished writing out the previous one: */
-                       trace_journal_entry_full(c);
-                       goto blocked;
-               case JOURNAL_ENTRY_CLOSED:
-                       break;
-               case JOURNAL_UNLOCKED:
-                       goto retry;
-               }
-       }
-
-       BUG_ON(journal_cur_seq(j) < seq);
+       if (journal_cur_seq(j) < seq &&
+           !__journal_entry_close(j)) {
+               /* haven't finished writing out the previous one: */
+               trace_journal_entry_full(c);
+               ret = -EAGAIN;
+       } else {
+               BUG_ON(journal_cur_seq(j) != seq);
 
-       ret = journal_entry_open(j);
-       if (ret) {
-               spin_unlock(&j->lock);
-               return ret < 0 ? ret : 0;
+               ret = journal_entry_open(j);
        }
-blocked:
-       if (!j->res_get_blocked_start)
+
+       if ((ret == -EAGAIN || ret == -ENOSPC) &&
+           !j->res_get_blocked_start)
                j->res_get_blocked_start = local_clock() ?: 1;
 
-       closure_wait(&j->async_wait, cl);
+       if (ret == -EAGAIN || ret == -ENOSPC)
+               closure_wait(&j->async_wait, cl);
+
        spin_unlock(&j->lock);
 
-       bch2_journal_reclaim_work(&j->reclaim_work.work);
-       return -EAGAIN;
+       if (ret == -ENOSPC) {
+               trace_journal_full(c);
+               bch2_journal_reclaim_work(&j->reclaim_work.work);
+               ret = -EAGAIN;
+       }
+
+       return ret;
 }
 
 static int journal_seq_error(struct journal *j, u64 seq)
@@ -615,8 +593,7 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq,
 
        if (seq == journal_cur_seq(j))
                __journal_entry_close(j);
-       else
-               spin_unlock(&j->lock);
+       spin_unlock(&j->lock);
 }
 
 static int journal_seq_flushed(struct journal *j, u64 seq)
@@ -628,8 +605,7 @@ static int journal_seq_flushed(struct journal *j, u64 seq)
 
        if (seq == journal_cur_seq(j))
                __journal_entry_close(j);
-       else
-               spin_unlock(&j->lock);
+       spin_unlock(&j->lock);
 
        return ret;
 }
@@ -721,6 +697,26 @@ int bch2_journal_flush(struct journal *j)
        return bch2_journal_flush_seq(j, seq);
 }
 
+/* block/unlock the journal: */
+
+void bch2_journal_unblock(struct journal *j)
+{
+       spin_lock(&j->lock);
+       j->blocked--;
+       spin_unlock(&j->lock);
+
+       journal_wake(j);
+}
+
+void bch2_journal_block(struct journal *j)
+{
+       spin_lock(&j->lock);
+       j->blocked++;
+       spin_unlock(&j->lock);
+
+       journal_quiesce(j);
+}
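
bch2_journal_block()/bch2_journal_unblock() let a caller fence the journal: while j->blocked is nonzero, journal_entry_open() above refuses new entries with -EAGAIN, and bch2_journal_block() additionally waits, via journal_quiesce(), until the current and previous entries have drained. A hypothetical caller, to show the intended pairing:

    /* Hypothetical: run fn with no journal entry open or in flight. */
    static void with_journal_blocked(struct journal *j,
                                     void (*fn)(struct journal *))
    {
            bch2_journal_block(j);
            fn(j);
            bch2_journal_unblock(j);
    }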
+
 /* allocate journal on a device: */
 
 static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
@@ -743,7 +739,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                goto err;
 
        journal_buckets = bch2_sb_resize_journal(&ca->disk_sb,
-                               nr + sizeof(*journal_buckets) / sizeof(u64));
+                                                nr + sizeof(*journal_buckets) / sizeof(u64));
        if (!journal_buckets)
                goto err;
 
@@ -806,9 +802,9 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                ja->nr++;
 
                bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
-                               ca->mi.bucket_size,
-                               gc_phase(GC_PHASE_SB),
-                               0);
+                                         ca->mi.bucket_size,
+                                         gc_phase(GC_PHASE_SB),
+                                         0);
 
                if (c) {
                        spin_unlock(&c->journal.lock);
@@ -859,7 +855,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
                 */
 
                if (bch2_disk_reservation_get(c, &disk_res,
-                               bucket_to_sector(ca, nr - ja->nr), 1, 0)) {
+                                             bucket_to_sector(ca, nr - ja->nr), 1, 0)) {
                        mutex_unlock(&c->sb_lock);
                        return -ENOSPC;
                }
@@ -930,8 +926,7 @@ void bch2_fs_journal_stop(struct journal *j)
            c->btree_roots_dirty)
                bch2_journal_meta(j);
 
-       BUG_ON(journal_entry_is_open(j) ||
-              j->reservations.prev_buf_unwritten);
+       journal_quiesce(j);
 
        BUG_ON(!bch2_journal_error(j) &&
               test_bit(JOURNAL_NOT_EMPTY, &j->flags));
@@ -957,7 +952,7 @@ void bch2_fs_journal_start(struct journal *j)
                journal_pin_new_entry(j, 0);
 
        /*
-        * journal_buf_switch() only inits the next journal entry when it
+        * __journal_entry_close() only inits the next journal entry when it
         * closes an open journal entry - the very first journal entry gets
         * initialized here:
         */
@@ -966,6 +961,7 @@ void bch2_fs_journal_start(struct journal *j)
 
        c->last_bucket_seq_cleanup = journal_cur_seq(j);
 
+       bch2_journal_space_available(j);
        spin_unlock(&j->lock);
 
        /*
@@ -975,7 +971,7 @@ void bch2_fs_journal_start(struct journal *j)
         */
        bch2_journal_seq_blacklist_write(j);
 
-       queue_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
+       queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0);
 }
 
 /* init/exit: */
@@ -1021,8 +1017,8 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 
 void bch2_fs_journal_exit(struct journal *j)
 {
-       kvpfree(j->buf[1].data, j->buf[1].size);
-       kvpfree(j->buf[0].data, j->buf[0].size);
+       kvpfree(j->buf[1].data, j->buf[1].buf_size);
+       kvpfree(j->buf[0].data, j->buf[0].buf_size);
        free_fifo(&j->pin);
 }
 
@@ -1046,8 +1042,8 @@ int bch2_fs_journal_init(struct journal *j)
 
        lockdep_init_map(&j->res_map, "journal res", &res_key, 0);
 
-       j->buf[0].size          = JOURNAL_ENTRY_SIZE_MIN;
-       j->buf[1].size          = JOURNAL_ENTRY_SIZE_MIN;
+       j->buf[0].buf_size      = JOURNAL_ENTRY_SIZE_MIN;
+       j->buf[1].buf_size      = JOURNAL_ENTRY_SIZE_MIN;
        j->write_delay_ms       = 1000;
        j->reclaim_delay_ms     = 100;
 
@@ -1060,8 +1056,8 @@ int bch2_fs_journal_init(struct journal *j)
                 { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v);
 
        if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) ||
-           !(j->buf[0].data = kvpmalloc(j->buf[0].size, GFP_KERNEL)) ||
-           !(j->buf[1].data = kvpmalloc(j->buf[1].size, GFP_KERNEL))) {
+           !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) ||
+           !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) {
                ret = -ENOMEM;
                goto out;
        }
@@ -1078,35 +1074,54 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
 {
        struct printbuf out = _PBUF(buf, PAGE_SIZE);
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       union journal_res_state *s = &j->reservations;
+       union journal_res_state s;
        struct bch_dev *ca;
        unsigned iter;
 
        rcu_read_lock();
        spin_lock(&j->lock);
+       s = READ_ONCE(j->reservations);
 
        pr_buf(&out,
               "active journal entries:\t%llu\n"
               "seq:\t\t\t%llu\n"
               "last_seq:\t\t%llu\n"
               "last_seq_ondisk:\t%llu\n"
-              "reservation count:\t%u\n"
-              "reservation offset:\t%u\n"
-              "current entry u64s:\t%u\n"
-              "io in flight:\t\t%i\n"
-              "need write:\t\t%i\n"
-              "dirty:\t\t\t%i\n"
-              "replay done:\t\t%i\n",
+              "current entry:\t\t",
               fifo_used(&j->pin),
               journal_cur_seq(j),
               journal_last_seq(j),
-              j->last_seq_ondisk,
-              journal_state_count(*s, s->idx),
-              s->cur_entry_offset,
-              j->cur_entry_u64s,
-              s->prev_buf_unwritten,
+              j->last_seq_ondisk);
+
+       switch (s.cur_entry_offset) {
+       case JOURNAL_ENTRY_ERROR_VAL:
+               pr_buf(&out, "error\n");
+               break;
+       case JOURNAL_ENTRY_CLOSED_VAL:
+               pr_buf(&out, "closed\n");
+               break;
+       default:
+               pr_buf(&out, "%u/%u\n",
+                      s.cur_entry_offset,
+                      j->cur_entry_u64s);
+               break;
+       }
+
+       pr_buf(&out,
+              "current entry refs:\t%u\n"
+              "prev entry unwritten:\t",
+              journal_state_count(s, s.idx));
+
+       if (s.prev_buf_unwritten)
+               pr_buf(&out, "yes, ref %u\n",
+                      journal_state_count(s, !s.idx));
+       else
+               pr_buf(&out, "no\n");
+
+       pr_buf(&out,
+              "need write:\t\t%i\n"
+              "replay done:\t\t%i\n",
               test_bit(JOURNAL_NEED_WRITE,     &j->flags),
-              journal_entry_is_open(j),
               test_bit(JOURNAL_REPLAY_DONE,    &j->flags));
 
        for_each_member_device_rcu(ca, c, iter,
@@ -1119,9 +1134,12 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
                pr_buf(&out,
                       "dev %u:\n"
                       "\tnr\t\t%u\n"
+                      "\tavailable\t%u:%u\n"
                       "\tcur_idx\t\t%u (seq %llu)\n"
                       "\tlast_idx\t%u (seq %llu)\n",
                       iter, ja->nr,
+                      bch2_journal_dev_buckets_available(j, ja),
+                      ja->sectors_free,
                       ja->cur_idx,     ja->bucket_seq[ja->cur_idx],
                       ja->last_idx,    ja->bucket_seq[ja->last_idx]);
        }
index 50d864a3cae3566bcb31da609ccd48fe6043db6c..71929bd67a7c9472b2acb4a7761f8dd4ff60224d 100644 (file)
@@ -178,6 +178,11 @@ static inline unsigned jset_u64s(unsigned u64s)
        return u64s + sizeof(struct jset_entry) / sizeof(u64);
 }
 
+static inline int journal_entry_overhead(struct journal *j)
+{
+       return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved;
+}
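
journal_entry_overhead() is the fixed per-entry cost in u64s: the jset header plus whatever callers have reserved through bch2_journal_entry_res_resize(). journal_entry_open() in journal.c subtracts it when converting a buffer's size into usable payload; restated on its own for clarity:

    /* Restates the sizing arithmetic in journal_entry_open(): */
    static inline int journal_buf_payload_u64s(struct journal *j,
                                               struct journal_buf *buf)
    {
            return (int) (buf->sectors << 9) / sizeof(u64) -
                    journal_entry_overhead(j);
    }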
+
 static inline struct jset_entry *
 bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s)
 {
@@ -222,7 +227,7 @@ static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *
                               id, 0, k, k->k.u64s);
 }
 
-void bch2_journal_buf_put_slowpath(struct journal *, bool);
+void __bch2_journal_buf_put(struct journal *, bool);
 
 static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
                                       bool need_write_just_set)
@@ -233,17 +238,10 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx,
                                    .buf0_count = idx == 0,
                                    .buf1_count = idx == 1,
                                    }).v, &j->reservations.counter);
-
-       EBUG_ON(s.idx != idx && !s.prev_buf_unwritten);
-
-       /*
-        * Do not initiate a journal write if the journal is in an error state
-        * (previous journal entry write may have failed)
-        */
-       if (s.idx != idx &&
-           !journal_state_count(s, idx) &&
-           s.cur_entry_offset != JOURNAL_ENTRY_ERROR_VAL)
-               bch2_journal_buf_put_slowpath(j, need_write_just_set);
+       if (!journal_state_count(s, idx)) {
+               EBUG_ON(s.idx == idx || !s.prev_buf_unwritten);
+               __bch2_journal_buf_put(j, need_write_just_set);
+       }
 }
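
This simplification works because a journal entry now holds a buf reference from the moment it is opened: journal_state_inc() moved from the close path into journal_entry_open() (see the journal.c hunks above), and journal_res_get_fast() below gains an EBUG_ON that the open buf's count is nonzero. When the count for idx does reach zero, that buf can therefore only be the previous, still-unwritten one, and the error-state check that used to guard the slowpath call is dropped along with the rest.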
 
 /*
@@ -291,6 +289,8 @@ static inline int journal_res_get_fast(struct journal *j,
                if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s)
                        return 0;
 
+               EBUG_ON(!journal_state_count(new, new.idx));
+
                if (flags & JOURNAL_RES_GET_CHECK)
                        return 1;
 
@@ -330,6 +330,8 @@ out:
        return 0;
 }
 
+/* journal_entry_res: */
+
 void bch2_journal_entry_res_resize(struct journal *,
                                   struct journal_entry_res *,
                                   unsigned);
@@ -367,6 +369,9 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
        set_bit(JOURNAL_REPLAY_DONE, &j->flags);
 }
 
+void bch2_journal_unblock(struct journal *);
+void bch2_journal_block(struct journal *);
+
 ssize_t bch2_journal_print_debug(struct journal *, char *);
 ssize_t bch2_journal_print_pins(struct journal *, char *);
 
index 0f1f8e1507c40b3515f98698de458042d1a976aa..16cb6be87cbf17b07cc5e8d878fd94b69ea9e67f 100644 (file)
@@ -825,7 +825,6 @@ fsck_err:
 int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
 {
        struct journal *j = &c->journal;
-       struct journal_entry_pin_list *pin_list;
        struct bkey_i *k, *_n;
        struct jset_entry *entry;
        struct journal_replay *i, *n;
@@ -854,7 +853,8 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
                                ret = bch2_btree_insert(c, entry->btree_id, k,
                                                &disk_res, NULL,
                                                BTREE_INSERT_NOFAIL|
-                                               BTREE_INSERT_JOURNAL_REPLAY);
+                                               BTREE_INSERT_JOURNAL_REPLAY|
+                                               BTREE_INSERT_NOMARK);
                        }
 
                        if (ret) {
@@ -866,10 +866,7 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
                        cond_resched();
                }
 
-               pin_list = journal_seq_pin(j, j->replay_journal_seq);
-
-               if (atomic_dec_and_test(&pin_list->count))
-                       journal_wake(j);
+               bch2_journal_pin_put(j, j->replay_journal_seq);
        }
 
        j->replay_journal_seq = 0;
@@ -884,82 +881,6 @@ err:
 
 /* journal write: */
 
-static unsigned journal_dev_buckets_available(struct journal *j,
-                                             struct journal_device *ja)
-{
-       unsigned next = (ja->cur_idx + 1) % ja->nr;
-       unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
-
-       /*
-        * Don't use the last bucket unless writing the new last_seq
-        * will make another bucket available:
-        */
-       if (available &&
-           journal_last_seq(j) <= ja->bucket_seq[ja->last_idx])
-               --available;
-
-       return available;
-}
-
-/* returns number of sectors available for next journal entry: */
-int bch2_journal_entry_sectors(struct journal *j)
-{
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       struct bch_dev *ca;
-       unsigned sectors_available = UINT_MAX;
-       unsigned i, nr_online = 0, nr_devs = 0;
-
-       lockdep_assert_held(&j->lock);
-
-       rcu_read_lock();
-       for_each_member_device_rcu(ca, c, i,
-                                  &c->rw_devs[BCH_DATA_JOURNAL]) {
-               struct journal_device *ja = &ca->journal;
-               unsigned buckets_this_device, sectors_this_device;
-
-               if (!ja->nr)
-                       continue;
-
-               buckets_this_device = journal_dev_buckets_available(j, ja);
-               sectors_this_device = ja->sectors_free;
-
-               nr_online++;
-
-                * Note that we don't allocate the space for a journal entry
-                * We that we don't allocate the space for a journal entry
-                * until we write it out - thus, account for it here:
-                */
-               if (j->prev_buf_sectors >= sectors_this_device) {
-                       if (!buckets_this_device)
-                               continue;
-
-                       buckets_this_device--;
-                       sectors_this_device = ca->mi.bucket_size;
-               }
-
-               sectors_this_device -= j->prev_buf_sectors;
-
-               if (buckets_this_device)
-                       sectors_this_device = ca->mi.bucket_size;
-
-               if (!sectors_this_device)
-                       continue;
-
-               sectors_available = min(sectors_available,
-                                       sectors_this_device);
-               nr_devs++;
-       }
-       rcu_read_unlock();
-
-       if (nr_online < c->opts.metadata_replicas_required)
-               return -EROFS;
-
-       if (nr_devs < min_t(unsigned, nr_online, c->opts.metadata_replicas))
-               return 0;
-
-       return sectors_available;
-}
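
Neither deleted helper is truly gone: journal_dev_buckets_available() resurfaces with a bch2_ prefix (it is called below in journal_write_alloc() and in the new debug output), presumably now exported from journal_reclaim.c, and the work of bch2_journal_entry_sectors() passes to the new bch2_journal_space_available(), which computes j->cur_entry_sectors once, whenever space changes, instead of on every reservation.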
-
 static void __journal_write_alloc(struct journal *j,
                                  struct journal_buf *w,
                                  struct dev_alloc_list *devs_sorted,
@@ -1033,7 +954,6 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
        devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
                                          &c->rw_devs[BCH_DATA_JOURNAL]);
 
-       spin_lock(&j->lock);
        __journal_write_alloc(j, w, &devs_sorted,
                              sectors, &replicas, replicas_want);
 
@@ -1049,7 +969,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
 
                if (sectors > ja->sectors_free &&
                    sectors <= ca->mi.bucket_size &&
-                   journal_dev_buckets_available(j, ja)) {
+                   bch2_journal_dev_buckets_available(j, ja)) {
                        ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
                        ja->sectors_free = ca->mi.bucket_size;
                }
@@ -1058,10 +978,6 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
        __journal_write_alloc(j, w, &devs_sorted,
                              sectors, &replicas, replicas_want);
 done:
-       if (replicas >= replicas_want)
-               j->prev_buf_sectors = 0;
-
-       spin_unlock(&j->lock);
        rcu_read_unlock();
 
        return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
@@ -1116,17 +1032,17 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
        unsigned new_size = READ_ONCE(j->buf_size_want);
        void *new_buf;
 
-       if (buf->size >= new_size)
+       if (buf->buf_size >= new_size)
                return;
 
        new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
        if (!new_buf)
                return;
 
-       memcpy(new_buf, buf->data, buf->size);
-       kvpfree(buf->data, buf->size);
+       memcpy(new_buf, buf->data, buf->buf_size);
+       kvpfree(buf->data, buf->buf_size);
        buf->data       = new_buf;
-       buf->size       = new_size;
+       buf->buf_size   = new_size;
 }
 
 static void journal_write_done(struct closure *cl)
@@ -1166,7 +1082,7 @@ static void journal_write_done(struct closure *cl)
         * Must come before signaling write completion, for
         * bch2_fs_journal_stop():
         */
-       mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0);
+       mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0);
 out:
        /* also must come before signalling write completion: */
        closure_debug_destroy(cl);
@@ -1220,20 +1136,22 @@ void bch2_journal_write(struct closure *cl)
        struct bch_extent_ptr *ptr;
        bool validate_before_checksum = false;
        unsigned i, sectors, bytes, u64s;
+       int ret;
+
+       bch2_journal_pin_put(j, le64_to_cpu(w->data->seq));
 
        journal_buf_realloc(j, w);
        jset = w->data;
 
        j->write_start_time = local_clock();
 
-       start   = vstruct_last(w->data);
+       start   = vstruct_last(jset);
        end     = bch2_journal_super_entries_add_common(c, start);
        u64s    = (u64 *) end - (u64 *) start;
        BUG_ON(u64s > j->entry_u64s_reserved);
 
-       le32_add_cpu(&w->data->u64s, u64s);
-       BUG_ON(vstruct_sectors(jset, c->block_bits) >
-              w->disk_sectors);
+       le32_add_cpu(&jset->u64s, u64s);
+       BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors);
 
        journal_write_compact(jset);
 
@@ -1271,12 +1189,28 @@ void bch2_journal_write(struct closure *cl)
                goto err;
 
        sectors = vstruct_sectors(jset, c->block_bits);
-       BUG_ON(sectors > j->prev_buf_sectors);
+       BUG_ON(sectors > w->sectors);
+
+       bytes = vstruct_bytes(jset);
+       memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
+
+       spin_lock(&j->lock);
+       ret = journal_write_alloc(j, w, sectors);
 
-       bytes = vstruct_bytes(w->data);
-       memset((void *) w->data + bytes, 0, (sectors << 9) - bytes);
+       /*
+        * write is allocated, no longer need to account for it in
+        * bch2_journal_space_available():
+        */
+       w->sectors = 0;
+
+       /*
+        * journal entry has been compacted and allocated, recalculate space
+        * available:
+        */
+       bch2_journal_space_available(j);
+       spin_unlock(&j->lock);
 
-       if (journal_write_alloc(j, w, sectors)) {
+       if (ret) {
                bch2_journal_halt(j);
                bch_err(c, "Unable to allocate journal write");
                bch2_fatal_error(c);
@@ -1316,7 +1250,7 @@ void bch2_journal_write(struct closure *cl)
                trace_journal_write(bio);
                closure_bio_submit(bio, cl);
 
-               ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(w->data->seq);
+               ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq);
        }
 
        for_each_rw_member(ca, c, i)
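
The reworked write path above rounds the finished jset up to a whole filesystem block and zeroes the slack before the buffers are submitted. A minimal standalone sketch of that padding arithmetic, in plain C with invented sizes standing in for the kernel's vstruct helpers:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	unsigned block_bits = 3;      /* 512-byte sectors, 4096-byte blocks */
	size_t bytes = 5000;          /* stand-in for vstruct_bytes(jset) */

	/* vstruct_sectors(): round up to a whole block, counted in sectors */
	size_t sectors = ((bytes + (512u << block_bits) - 1)
			  >> (9 + block_bits)) << block_bits;

	unsigned char *buf = malloc(sectors << 9);
	if (!buf)
		return 1;

	memset(buf + bytes, 0, (sectors << 9) - bytes);  /* zero the slack */

	printf("%zu bytes -> %zu sectors (%zu bytes submitted)\n",
	       bytes, sectors, sectors << 9);
	free(buf);
	return 0;
}
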
diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h
index d0a652cf93564d9c7725946d2e34e80094517091..ec7b49b8f85ca3a6046abeb6929740b6ea0bef8c 100644 (file)
@@ -39,7 +39,6 @@ int bch2_journal_read(struct bch_fs *, struct list_head *);
 void bch2_journal_entries_free(struct list_head *);
 int bch2_journal_replay(struct bch_fs *, struct list_head *);
 
-int bch2_journal_entry_sectors(struct journal *);
 void bch2_journal_write(struct closure *);
 
 #endif /* _BCACHEFS_JOURNAL_IO_H */
diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c
index a795e888c56b4d699641db79efea1019ca9855f5..b928b8c8705f11f2a39c931179d416be34353cd8 100644 (file)
 
 #include "bcachefs.h"
 #include "journal.h"
+#include "journal_io.h"
 #include "journal_reclaim.h"
 #include "replicas.h"
 #include "super.h"
 
+/* Free space calculations: */
+
+unsigned bch2_journal_dev_buckets_available(struct journal *j,
+                                           struct journal_device *ja)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       unsigned next = (ja->cur_idx + 1) % ja->nr;
+       unsigned available = (ja->last_idx + ja->nr - next) % ja->nr;
+
+       /*
+        * Allocator startup needs some journal space before we can do journal
+        * replay:
+        */
+       if (available &&
+           test_bit(BCH_FS_ALLOCATOR_STARTED, &c->flags))
+               available--;
+
+       /*
+        * Don't use the last bucket unless writing the new last_seq
+        * will make another bucket available:
+        */
+       if (available &&
+           journal_last_seq(j) <= ja->bucket_seq[ja->last_idx])
+               --available;
+
+       return available;
+}
+
+void bch2_journal_space_available(struct journal *j)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct bch_dev *ca;
+       unsigned sectors_next_entry     = UINT_MAX;
+       unsigned sectors_total          = UINT_MAX;
+       unsigned max_entry_size         = min(j->buf[0].buf_size >> 9,
+                                             j->buf[1].buf_size >> 9);
+       unsigned i, nr_online = 0, nr_devs = 0;
+       unsigned unwritten_sectors = j->reservations.prev_buf_unwritten
+               ? journal_prev_buf(j)->sectors
+               : 0;
+       int ret = 0;
+
+       lockdep_assert_held(&j->lock);
+
+       rcu_read_lock();
+       for_each_member_device_rcu(ca, c, i,
+                                  &c->rw_devs[BCH_DATA_JOURNAL]) {
+               struct journal_device *ja = &ca->journal;
+               unsigned buckets_this_device, sectors_this_device;
+
+               if (!ja->nr)
+                       continue;
+
+               nr_online++;
+
+               buckets_this_device = bch2_journal_dev_buckets_available(j, ja);
+               sectors_this_device = ja->sectors_free;
+
+               /*
+                * Note that we don't allocate the space for a journal entry
+                * until we write it out - thus, account for it here:
+                */
+               if (unwritten_sectors >= sectors_this_device) {
+                       if (!buckets_this_device)
+                               continue;
+
+                       buckets_this_device--;
+                       sectors_this_device = ca->mi.bucket_size;
+               }
+
+               sectors_this_device -= unwritten_sectors;
+
+               if (sectors_this_device < ca->mi.bucket_size &&
+                   buckets_this_device) {
+                       buckets_this_device--;
+                       sectors_this_device = ca->mi.bucket_size;
+               }
+
+               if (!sectors_this_device)
+                       continue;
+
+               sectors_next_entry = min(sectors_next_entry,
+                                        sectors_this_device);
+
+               sectors_total = min(sectors_total,
+                       buckets_this_device * ca->mi.bucket_size +
+                       sectors_this_device);
+
+               max_entry_size = min_t(unsigned, max_entry_size,
+                                      ca->mi.bucket_size);
+
+               nr_devs++;
+       }
+       rcu_read_unlock();
+
+       if (nr_online < c->opts.metadata_replicas_required) {
+               ret = -EROFS;
+               sectors_next_entry = 0;
+       } else if (!sectors_next_entry ||
+                  nr_devs < min_t(unsigned, nr_online,
+                                  c->opts.metadata_replicas)) {
+               ret = -ENOSPC;
+               sectors_next_entry = 0;
+       } else if (!fifo_free(&j->pin)) {
+               ret = -ENOSPC;
+               sectors_next_entry = 0;
+       }
+
+       j->cur_entry_sectors    = sectors_next_entry;
+       j->cur_entry_error      = ret;
+
+       if (!ret)
+               journal_wake(j);
+}
+
+/* Discards - last part of journal reclaim: */
+
+static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+{
+       bool ret;
+
+       spin_lock(&j->lock);
+       ret = ja->nr &&
+               ja->last_idx != ja->cur_idx &&
+               ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk;
+       spin_unlock(&j->lock);
+
+       return ret;
+}
+
+/*
+ * Advance ja->last_idx as long as it points to buckets that are no longer
+ * dirty, issuing discards if necessary:
+ */
+static void journal_do_discards(struct journal *j)
+{
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+       struct bch_dev *ca;
+       unsigned iter;
+
+       mutex_lock(&j->reclaim_lock);
+
+       for_each_rw_member(ca, c, iter) {
+               struct journal_device *ja = &ca->journal;
+
+               while (should_discard_bucket(j, ja)) {
+                       if (ca->mi.discard &&
+                           blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
+                               blkdev_issue_discard(ca->disk_sb.bdev,
+                                       bucket_to_sector(ca,
+                                               ja->buckets[ja->last_idx]),
+                                       ca->mi.bucket_size, GFP_NOIO, 0);
+
+                       spin_lock(&j->lock);
+                       ja->last_idx = (ja->last_idx + 1) % ja->nr;
+
+                       bch2_journal_space_available(j);
+                       spin_unlock(&j->lock);
+               }
+       }
+
+       mutex_unlock(&j->reclaim_lock);
+}
+
 /*
  * Journal entry pinning - machinery for holding a reference on a given journal
  * entry, holding it open to ensure it gets replayed during recovery:
  */
 
+static void bch2_journal_reclaim_fast(struct journal *j)
+{
+       struct journal_entry_pin_list temp;
+       bool popped = false;
+
+       lockdep_assert_held(&j->lock);
+
+       /*
+        * Unpin journal entries whose reference counts reached zero, meaning
+        * all btree nodes got written out
+        */
+       while (!fifo_empty(&j->pin) &&
+              !atomic_read(&fifo_peek_front(&j->pin).count)) {
+               BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
+               BUG_ON(!fifo_pop(&j->pin, temp));
+               popped = true;
+       }
+
+       if (popped)
+               bch2_journal_space_available(j);
+}
+
+void bch2_journal_pin_put(struct journal *j, u64 seq)
+{
+       struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
+       if (atomic_dec_and_test(&pin_list->count)) {
+               spin_lock(&j->lock);
+               bch2_journal_reclaim_fast(j);
+               spin_unlock(&j->lock);
+       }
+}
+
 static inline void __journal_pin_add(struct journal *j,
                                     u64 seq,
                                     struct journal_entry_pin *pin,
@@ -24,10 +222,7 @@ static inline void __journal_pin_add(struct journal *j,
        pin->seq        = seq;
        pin->flush      = flush_fn;
 
-       if (flush_fn)
-               list_add(&pin->list, &pin_list->list);
-       else
-               INIT_LIST_HEAD(&pin->list);
+       list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed);
 
        /*
         * If the journal is currently full,  we might want to call flush_fn
@@ -129,86 +324,53 @@ void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin)
  * data off of a specific device:
  */
 
-/**
- * bch2_journal_reclaim_fast - do the fast part of journal reclaim
- *
- * Called from IO submission context, does not block. Cleans up after btree
- * write completions by advancing the journal pin and each cache's last_idx,
- * kicking off discards and background reclaim as necessary.
- */
-void bch2_journal_reclaim_fast(struct journal *j)
-{
-       struct journal_entry_pin_list temp;
-       bool popped = false;
-
-       lockdep_assert_held(&j->lock);
-
-       /*
-        * Unpin journal entries whose reference counts reached zero, meaning
-        * all btree nodes got written out
-        */
-       while (!fifo_empty(&j->pin) &&
-              !atomic_read(&fifo_peek_front(&j->pin).count)) {
-               BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list));
-               BUG_ON(!fifo_pop(&j->pin, temp));
-               popped = true;
-       }
-
-       if (popped)
-               journal_wake(j);
-}
-
-static void journal_pin_mark_flushing(struct journal *j,
-                                     struct journal_entry_pin *pin,
-                                     u64 seq)
-{
-       lockdep_assert_held(&j->reclaim_lock);
-
-       list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
-       BUG_ON(j->flush_in_progress);
-       j->flush_in_progress = pin;
-}
-
-static void journal_pin_flush(struct journal *j,
-                             struct journal_entry_pin *pin,
-                             u64 seq)
-{
-       pin->flush(j, pin, seq);
-
-       BUG_ON(j->flush_in_progress != pin);
-       j->flush_in_progress = NULL;
-       wake_up(&j->pin_flush_wait);
-}
-
 static struct journal_entry_pin *
-journal_get_next_pin(struct journal *j, u64 seq_to_flush, u64 *seq)
+journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
 {
        struct journal_entry_pin_list *pin_list;
        struct journal_entry_pin *ret = NULL;
 
-       /* no need to iterate over empty fifo entries: */
-       bch2_journal_reclaim_fast(j);
+       spin_lock(&j->lock);
+
+       BUG_ON(!atomic_read(&fifo_peek_front(&j->pin).count));
 
        fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
-               if (*seq > seq_to_flush ||
+               if (*seq > max_seq ||
                    (ret = list_first_entry_or_null(&pin_list->list,
                                struct journal_entry_pin, list)))
                        break;
 
+       if (ret) {
+               list_move(&ret->list, &pin_list->flushed);
+               BUG_ON(j->flush_in_progress);
+               j->flush_in_progress = ret;
+               j->last_flushed = jiffies;
+       }
+
+       spin_unlock(&j->lock);
+
        return ret;
 }
 
-static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
+static void journal_flush_pins(struct journal *j, u64 seq_to_flush,
+                              unsigned min_nr)
 {
-       bool ret;
+       struct journal_entry_pin *pin;
+       u64 seq;
 
-       spin_lock(&j->lock);
-       ret = ja->nr &&
-               (ja->last_idx != ja->cur_idx &&
-                ja->bucket_seq[ja->last_idx] < j->last_seq_ondisk);
-       spin_unlock(&j->lock);
+       lockdep_assert_held(&j->reclaim_lock);
 
-       return ret;
+       while ((pin = journal_get_next_pin(j, min_nr
+                               ? U64_MAX : seq_to_flush, &seq))) {
+               if (min_nr)
+                       min_nr--;
+
+               pin->flush(j, pin, seq);
+
+               BUG_ON(j->flush_in_progress != pin);
+               j->flush_in_progress = NULL;
+               wake_up(&j->pin_flush_wait);
+       }
 }
 
 /**
@@ -235,104 +397,44 @@ void bch2_journal_reclaim_work(struct work_struct *work)
                                struct bch_fs, journal.reclaim_work);
        struct journal *j = &c->journal;
        struct bch_dev *ca;
-       struct journal_entry_pin *pin;
-       u64 seq, seq_to_flush = 0;
-       unsigned iter, bucket_to_flush;
-       unsigned long next_flush;
-       bool reclaim_lock_held = false, need_flush;
+       unsigned iter, bucket_to_flush, min_nr = 0;
+       u64 seq_to_flush = 0;
+
+       journal_do_discards(j);
+
+       mutex_lock(&j->reclaim_lock);
+       spin_lock(&j->lock);
 
-       /*
-        * Advance last_idx to point to the oldest journal entry containing
-        * btree node updates that have not yet been written out
-        */
        for_each_rw_member(ca, c, iter) {
                struct journal_device *ja = &ca->journal;
 
                if (!ja->nr)
                        continue;
 
-               while (should_discard_bucket(j, ja)) {
-                       if (!reclaim_lock_held) {
-                               /*
-                                * ugh:
-                                * might be called from __journal_res_get()
-                                * under wait_event() - have to go back to
-                                * TASK_RUNNING before doing something that
-                                * would block, but only if we're doing work:
-                                */
-                               __set_current_state(TASK_RUNNING);
-
-                               mutex_lock(&j->reclaim_lock);
-                               reclaim_lock_held = true;
-                               /* recheck under reclaim_lock: */
-                               continue;
-                       }
 
-                       if (ca->mi.discard &&
-                           blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
-                               blkdev_issue_discard(ca->disk_sb.bdev,
-                                       bucket_to_sector(ca,
-                                               ja->buckets[ja->last_idx]),
-                                       ca->mi.bucket_size, GFP_NOIO, 0);
-
-                       spin_lock(&j->lock);
-                       ja->last_idx = (ja->last_idx + 1) % ja->nr;
-                       spin_unlock(&j->lock);
-
-                       journal_wake(j);
-               }
-
-               /*
-                * Write out enough btree nodes to free up 50% journal
-                * buckets
-                */
-               spin_lock(&j->lock);
+               /* Try to keep the journal at most half full: */
                bucket_to_flush = (ja->cur_idx + (ja->nr >> 1)) % ja->nr;
                seq_to_flush = max_t(u64, seq_to_flush,
                                     ja->bucket_seq[bucket_to_flush]);
-               spin_unlock(&j->lock);
        }
 
        /* Also flush if the pin fifo is more than half full */
-       spin_lock(&j->lock);
        seq_to_flush = max_t(s64, seq_to_flush,
                             (s64) journal_cur_seq(j) -
                             (j->pin.size >> 1));
+       spin_unlock(&j->lock);
 
        /*
         * If it's been longer than j->reclaim_delay_ms since we last flushed,
         * make sure to flush at least one journal pin:
         */
-       next_flush = j->last_flushed + msecs_to_jiffies(j->reclaim_delay_ms);
-       need_flush = time_after(jiffies, next_flush);
-
-       while ((pin = journal_get_next_pin(j, need_flush
-                                          ? U64_MAX
-                                          : seq_to_flush, &seq))) {
-               if (!reclaim_lock_held) {
-                       spin_unlock(&j->lock);
-                       __set_current_state(TASK_RUNNING);
-                       mutex_lock(&j->reclaim_lock);
-                       reclaim_lock_held = true;
-                       spin_lock(&j->lock);
-                       continue;
-               }
+       if (time_after(jiffies, j->last_flushed +
+                      msecs_to_jiffies(j->reclaim_delay_ms)))
+               min_nr = 1;
 
-               journal_pin_mark_flushing(j, pin, seq);
-               spin_unlock(&j->lock);
-
-               journal_pin_flush(j, pin, seq);
-
-               need_flush = false;
-               j->last_flushed = jiffies;
+       journal_flush_pins(j, seq_to_flush, min_nr);
 
-               spin_lock(&j->lock);
-       }
-
-       spin_unlock(&j->lock);
-
-       if (reclaim_lock_held)
-               mutex_unlock(&j->reclaim_lock);
+       mutex_unlock(&j->reclaim_lock);
 
        if (!test_bit(BCH_FS_RO, &c->flags))
                queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work,
@@ -341,8 +443,6 @@ void bch2_journal_reclaim_work(struct work_struct *work)
 
 static int journal_flush_done(struct journal *j, u64 seq_to_flush)
 {
-       struct journal_entry_pin *pin;
-       u64 pin_seq;
        int ret;
 
        ret = bch2_journal_error(j);
@@ -350,16 +450,10 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush)
                return ret;
 
        mutex_lock(&j->reclaim_lock);
-       spin_lock(&j->lock);
-
-       while ((pin = journal_get_next_pin(j, seq_to_flush, &pin_seq))) {
-               journal_pin_mark_flushing(j, pin, pin_seq);
-               spin_unlock(&j->lock);
 
-               journal_pin_flush(j, pin, pin_seq);
+       journal_flush_pins(j, seq_to_flush, 0);
 
-               spin_lock(&j->lock);
-       }
+       spin_lock(&j->lock);
        /*
         * If journal replay hasn't completed, the unreplayed journal entries
         * hold refs on their corresponding sequence numbers
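
bch2_journal_space_available() now computes, once per state change, what bch2_journal_entry_sectors() used to recompute on every reservation: for each journal device, how many sectors the next entry could use once the still-unwritten previous buffer is charged against it, minimized across devices. A standalone model of that per-device loop, with invented numbers; the kernel version additionally tracks sectors_total and max_entry_size:

#include <stdio.h>

struct jdev {
	unsigned bucket_size;     /* sectors per journal bucket */
	unsigned buckets_free;    /* from the ring-buffer bucket math */
	unsigned sectors_free;    /* left in the bucket being written */
};

static unsigned space_available(const struct jdev *devs, unsigned nr,
				unsigned unwritten_sectors)
{
	unsigned sectors_next_entry = ~0U;

	for (unsigned i = 0; i < nr; i++) {
		unsigned buckets = devs[i].buckets_free;
		unsigned sectors = devs[i].sectors_free;

		/* the previous entry isn't allocated until written: charge it */
		if (unwritten_sectors >= sectors) {
			if (!buckets)
				continue;
			buckets--;
			sectors = devs[i].bucket_size;
		}
		sectors -= unwritten_sectors;

		/* prefer opening a fresh bucket over a nearly-empty tail */
		if (sectors < devs[i].bucket_size && buckets) {
			buckets--;
			sectors = devs[i].bucket_size;
		}

		if (sectors && sectors < sectors_next_entry)
			sectors_next_entry = sectors;
	}

	return sectors_next_entry == ~0U ? 0 : sectors_next_entry;
}

int main(void)
{
	const struct jdev devs[] = {
		{ .bucket_size = 256, .buckets_free = 3, .sectors_free = 100 },
		{ .bucket_size = 256, .buckets_free = 0, .sectors_free = 200 },
	};

	/* 64 sectors of the previous buffer are still unwritten: */
	printf("%u sectors available for the next entry\n",
	       space_available(devs, 2, 64));
	return 0;
}
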
diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h
index 287590cd37ee134060a7927e54122f3e95e5b3a8..1d688d6fa2cce091603c882d1684a027c0c6d596 100644 (file)
@@ -3,6 +3,10 @@
 
 #define JOURNAL_PIN    (32 * 1024)
 
+unsigned bch2_journal_dev_buckets_available(struct journal *,
+                                           struct journal_device *);
+void bch2_journal_space_available(struct journal *);
+
 static inline bool journal_pin_active(struct journal_entry_pin *pin)
 {
        return pin->seq != 0;
@@ -16,6 +20,8 @@ journal_seq_pin(struct journal *j, u64 seq)
        return &j->pin.data[seq & j->pin.mask];
 }
 
+void bch2_journal_pin_put(struct journal *, u64);
+
 void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *,
                          journal_pin_flush_fn);
 void bch2_journal_pin_update(struct journal *, u64, struct journal_entry_pin *,
@@ -27,7 +33,6 @@ void bch2_journal_pin_add_if_older(struct journal *,
                                  journal_pin_flush_fn);
 void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *);
 
-void bch2_journal_reclaim_fast(struct journal *);
 void bch2_journal_reclaim_work(struct work_struct *);
 
 void bch2_journal_flush_pins(struct journal *, u64);
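
journal_reclaim.h now exports bch2_journal_pin_put(): dropping the last reference on the oldest pinned sequence number is what lets bch2_journal_reclaim_fast() advance the pin FIFO and free journal space. A toy single-threaded model of that protocol, with plain C arrays standing in for the kernel FIFO:

#include <stdio.h>

#define NR_PINS 8

static int count[NR_PINS];        /* refcount per sequence number */
static unsigned front, back;      /* FIFO of pinned seqs: [front, back) */

/* models bch2_journal_pin_put() + bch2_journal_reclaim_fast(): */
static void pin_put(unsigned seq)
{
	if (--count[seq % NR_PINS] == 0) {
		while (front != back && !count[front % NR_PINS])
			front++;  /* pop fully unpinned entries */
		printf("put seq %u, oldest pinned seq now %u\n", seq, front);
	}
}

int main(void)
{
	for (back = 0; back < 3; back++)
		count[back % NR_PINS] = 1;   /* pin seqs 0..2 */

	pin_put(1);   /* out of order: seq 0 still pins the front */
	pin_put(0);   /* now seqs 0 and 1 both fall off */
	pin_put(2);
	return 0;
}
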
diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h
index a91662f6a61badc865336adf7bad48b665fd76ae..8772e53fb64cf1251c2e19b2a7190ed41017290d 100644 (file)
@@ -21,8 +21,10 @@ struct journal_buf {
 
        struct closure_waitlist wait;
 
-       unsigned                size;
-       unsigned                disk_sectors;
+       unsigned                buf_size;       /* size in bytes of @data */
+       unsigned                sectors;        /* maximum size for current entry */
+       unsigned                disk_sectors;   /* maximum size entry could have been, if
+                                                  buf_size was bigger */
        unsigned                u64s_reserved;
        /* bloom filter: */
        unsigned long           has_inode[1024 / sizeof(unsigned long)];
@@ -128,9 +130,20 @@ struct journal {
        unsigned long           flags;
 
        union journal_res_state reservations;
+
+       /* Max size of current journal entry */
        unsigned                cur_entry_u64s;
-       unsigned                prev_buf_sectors;
-       unsigned                cur_buf_sectors;
+       unsigned                cur_entry_sectors;
+
+       /*
+        * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if
+        * insufficient devices:
+        */
+       int                     cur_entry_error;
+
+       /* Reserved space in journal entry to be used just prior to write */
+       unsigned                entry_u64s_reserved;
+
        unsigned                buf_size_want;
 
        /*
@@ -141,6 +154,9 @@ struct journal {
 
        spinlock_t              lock;
 
+       /* if nonzero, we may not open a new journal entry: */
+       unsigned                blocked;
+
        /* Used when waiting because the journal was full */
        wait_queue_head_t       wait;
        struct closure_waitlist async_wait;
@@ -155,9 +171,6 @@ struct journal {
        u64                     seq_ondisk;
        u64                     last_seq_ondisk;
 
-       /* Reserved space in journal entry to be used just prior to write */
-       unsigned                entry_u64s_reserved;
-
        /*
         * FIFO of journal entries whose btree updates have not yet been
         * written out.
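
The new cur_entry_error field caches the verdict of the space calculation so reservation callers can fail fast without redoing it. A simplified single-threaded sketch; res_get() here is a hypothetical stand-in for the reservation path, not the real API:

#include <stdio.h>
#include <errno.h>

static int cur_entry_error;          /* 0, -ENOSPC or -EROFS */
static unsigned cur_entry_sectors;

/* slow path: recompute and cache (the kernel does this under j->lock) */
static void space_available(unsigned sectors, int enough_devices)
{
	if (!enough_devices) {
		cur_entry_error   = -EROFS;
		cur_entry_sectors = 0;
	} else if (!sectors) {
		cur_entry_error   = -ENOSPC;
		cur_entry_sectors = 0;
	} else {
		cur_entry_error   = 0;
		cur_entry_sectors = sectors;
	}
}

/* fast path: a reservation just reads the cached verdict */
static int res_get(unsigned sectors_needed)
{
	if (cur_entry_error)
		return cur_entry_error;
	return sectors_needed <= cur_entry_sectors ? 0 : -EAGAIN;
}

int main(void)
{
	space_available(128, 1);
	printf("res_get(64): %d\n", res_get(64));   /* 0: fits */
	space_available(0, 1);
	printf("res_get(64): %d\n", res_get(64));   /* -ENOSPC */
	return 0;
}
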
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 7e50547cc51f6b0b31ced5c9bf6142eebb291440..77ab464a8242df51526f6ba436d4a0b34ae76544 100644 (file)
@@ -82,7 +82,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
                                               le64_to_cpu(u->v));
                        break;
                case FS_USAGE_INODES:
-                       percpu_u64_set(&c->usage[0]->s.nr_inodes,
+                       percpu_u64_set(&c->usage[0]->nr_inodes,
                                       le64_to_cpu(u->v));
                        break;
                case FS_USAGE_KEY_VERSION:
@@ -406,22 +406,19 @@ int bch2_fs_initialize(struct bch_fs *c)
        mutex_unlock(&c->sb_lock);
 
        set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
+       set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
 
        for (i = 0; i < BTREE_ID_NR; i++)
                bch2_btree_root_alloc(c, i);
 
-       ret = bch2_gc(c, &journal, true);
-       if (ret)
-               goto err;
-
-       set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
-
        err = "unable to allocate journal buckets";
-       for_each_online_member(ca, c, i)
-               if (bch2_dev_journal_alloc(ca)) {
+       for_each_online_member(ca, c, i) {
+               ret = bch2_dev_journal_alloc(ca);
+               if (ret) {
                        percpu_ref_put(&ca->io_ref);
                        goto err;
                }
+       }
 
        /*
         * journal_res_get() will crash if called before this has
diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c
index 4d0c9718c109360af32b318ef42ca81a87b933ef..99283b1047fffa77792a7c454ad778a2db079ab6 100644 (file)
@@ -244,14 +244,14 @@ static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p,
        *dst = *src;
 
        for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
-               if (!src->data[src_idx])
+               if (!src->replicas[src_idx])
                        continue;
 
                dst_idx = __replicas_entry_idx(dst_r,
                                cpu_replicas_entry(src_r, src_idx));
                BUG_ON(dst_idx < 0);
 
-               dst->data[dst_idx] = src->data[src_idx];
+               dst->replicas[dst_idx] = src->replicas[src_idx];
        }
 }
 
@@ -261,39 +261,37 @@ static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p,
 static int replicas_table_update(struct bch_fs *c,
                                 struct bch_replicas_cpu *new_r)
 {
-       struct bch_fs_usage __percpu *new_usage[3] = { NULL, NULL, NULL };
+       struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL };
+       struct bch_fs_usage __percpu *new_scratch = NULL;
        unsigned bytes = sizeof(struct bch_fs_usage) +
                sizeof(u64) * new_r->nr;
-       unsigned i;
        int ret = -ENOMEM;
 
-       for (i = 0; i < 3; i++) {
-               if (i < 2 && !c->usage[i])
-                       continue;
-
-               new_usage[i] = __alloc_percpu_gfp(bytes, sizeof(u64),
-                                                 GFP_NOIO);
-               if (!new_usage[i])
-                       goto err;
-       }
-
-       for (i = 0; i < 2; i++) {
-               if (!c->usage[i])
-                       continue;
-
-               __replicas_table_update(new_usage[i],   new_r,
-                                       c->usage[i],    &c->replicas);
-
-               swap(c->usage[i], new_usage[i]);
-       }
-
-       swap(c->usage_scratch, new_usage[2]);
+       if (!(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64),
+                                               GFP_NOIO)) ||
+           (c->usage[1] &&
+            !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64),
+                                                GFP_NOIO))) ||
+           !(new_scratch  = __alloc_percpu_gfp(bytes, sizeof(u64),
+                                               GFP_NOIO)))
+               goto err;
 
-       swap(c->replicas, *new_r);
+       if (c->usage[0])
+               __replicas_table_update(new_usage[0],   new_r,
+                                       c->usage[0],    &c->replicas);
+       if (c->usage[1])
+               __replicas_table_update(new_usage[1],   new_r,
+                                       c->usage[1],    &c->replicas);
+
+       swap(c->usage[0],       new_usage[0]);
+       swap(c->usage[1],       new_usage[1]);
+       swap(c->usage_scratch,  new_scratch);
+       swap(c->replicas,       *new_r);
        ret = 0;
 err:
-       for (i = 0; i < 3; i++)
-               free_percpu(new_usage[i]);
+       free_percpu(new_scratch);
+       free_percpu(new_usage[1]);
+       free_percpu(new_usage[0]);
        return ret;
 }
 
@@ -456,7 +454,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
                if (__replicas_has_entry(&c->replicas_gc, e))
                        continue;
 
-               v = percpu_u64_get(&c->usage[0]->data[i]);
+               v = percpu_u64_get(&c->usage[0]->replicas[i]);
                if (!v)
                        continue;
 
@@ -557,7 +555,7 @@ int bch2_replicas_set_usage(struct bch_fs *c,
                BUG_ON(ret < 0);
        }
 
-       percpu_u64_set(&c->usage[0]->data[idx], sectors);
+       percpu_u64_set(&c->usage[0]->replicas[idx], sectors);
 
        return 0;
 }
@@ -974,5 +972,6 @@ int bch2_fs_replicas_init(struct bch_fs *c)
 {
        c->journal.entry_u64s_reserved +=
                reserve_journal_replicas(c, &c->replicas);
-       return 0;
+
+       return replicas_table_update(c, &c->replicas);
 }
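
replicas_table_update() now also owns reallocating the percpu usage arrays, and __replicas_table_update() copies each counter to its entry's index in the rebuilt table rather than copying by position. A standalone model with plain arrays and string keys standing in for bch_replicas_entry:

#include <stdio.h>
#include <string.h>

#define OLD_NR 2
#define NEW_NR 3

/* string keys stand in for struct bch_replicas_entry: */
static const char *old_e[OLD_NR] = { "btree: 1 2", "user: 1 2" };
static const char *new_e[NEW_NR] = { "btree: 1 2", "journal: 1 2",
				     "user: 1 2" };

static int entry_idx(const char *e)   /* models __replicas_entry_idx() */
{
	for (int i = 0; i < NEW_NR; i++)
		if (!strcmp(new_e[i], e))
			return i;
	return -1;   /* the kernel BUG_ON()s this case */
}

int main(void)
{
	unsigned long old_usage[OLD_NR] = { 100, 200 };
	unsigned long new_usage[NEW_NR] = { 0 };

	/* models __replicas_table_update(): copy by entry, not by index */
	for (int i = 0; i < OLD_NR; i++)
		new_usage[entry_idx(old_e[i])] = old_usage[i];

	for (int i = 0; i < NEW_NR; i++)
		printf("%-14s %lu\n", new_e[i], new_usage[i]);
	return 0;
}
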
diff --git a/libbcachefs/str_hash.h b/libbcachefs/str_hash.h
index 1f343e64ca283d14fd3429856ca70be095130f3a..a1ca837b302147581c0e1282fa70ec77c5aa0883 100644 (file)
@@ -125,7 +125,7 @@ struct bch_hash_desc {
        bool            (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c);
 };
 
-static inline struct btree_iter *
+static __always_inline struct btree_iter *
 bch2_hash_lookup(struct btree_trans *trans,
                 const struct bch_hash_desc desc,
                 const struct bch_hash_info *info,
@@ -159,7 +159,7 @@ bch2_hash_lookup(struct btree_trans *trans,
        return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOENT);
 }
 
-static inline struct btree_iter *
+static __always_inline struct btree_iter *
 bch2_hash_hole(struct btree_trans *trans,
               const struct bch_hash_desc desc,
               const struct bch_hash_info *info,
@@ -185,10 +185,11 @@ bch2_hash_hole(struct btree_trans *trans,
        return IS_ERR(k.k) ? ERR_CAST(k.k) : ERR_PTR(-ENOSPC);
 }
 
-static inline int bch2_hash_needs_whiteout(struct btree_trans *trans,
-                                          const struct bch_hash_desc desc,
-                                          const struct bch_hash_info *info,
-                                          struct btree_iter *start)
+static __always_inline
+int bch2_hash_needs_whiteout(struct btree_trans *trans,
+                            const struct bch_hash_desc desc,
+                            const struct bch_hash_info *info,
+                            struct btree_iter *start)
 {
        struct btree_iter *iter;
        struct bkey_s_c k;
@@ -211,10 +212,11 @@ static inline int bch2_hash_needs_whiteout(struct btree_trans *trans,
        return btree_iter_err(k);
 }
 
-static inline int __bch2_hash_set(struct btree_trans *trans,
-                                 const struct bch_hash_desc desc,
-                                 const struct bch_hash_info *info,
-                                 u64 inode, struct bkey_i *insert, int flags)
+static __always_inline
+int __bch2_hash_set(struct btree_trans *trans,
+                   const struct bch_hash_desc desc,
+                   const struct bch_hash_info *info,
+                   u64 inode, struct bkey_i *insert, int flags)
 {
        struct btree_iter *iter, *slot = NULL;
        struct bkey_s_c k;
@@ -276,10 +278,11 @@ static inline int bch2_hash_set(const struct bch_hash_desc desc,
                                        inode, insert, flags));
 }
 
-static inline int bch2_hash_delete_at(struct btree_trans *trans,
-                                     const struct bch_hash_desc desc,
-                                     const struct bch_hash_info *info,
-                                     struct btree_iter *iter)
+static __always_inline
+int bch2_hash_delete_at(struct btree_trans *trans,
+                       const struct bch_hash_desc desc,
+                       const struct bch_hash_info *info,
+                       struct btree_iter *iter)
 {
        struct bkey_i *delete;
        int ret;
@@ -300,10 +303,11 @@ static inline int bch2_hash_delete_at(struct btree_trans *trans,
        return 0;
 }
 
-static inline int bch2_hash_delete(struct btree_trans *trans,
-                                  const struct bch_hash_desc desc,
-                                  const struct bch_hash_info *info,
-                                  u64 inode, const void *key)
+static __always_inline
+int bch2_hash_delete(struct btree_trans *trans,
+                    const struct bch_hash_desc desc,
+                    const struct bch_hash_info *info,
+                    u64 inode, const void *key)
 {
        struct btree_iter *iter;
 
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index b88750ff1bb77ad81ba226ffc402df79d4f7d004..71d97c57830fd33df4463e631942aae16c610a0b 100644 (file)
@@ -136,7 +136,7 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
                sb->bio = bio;
        }
 
-       new_sb = (void *) __get_free_pages(GFP_KERNEL|__GFP_ZERO, order);
+       new_sb = (void *) __get_free_pages(GFP_NOFS|__GFP_ZERO, order);
        if (!new_sb)
                return -ENOMEM;
 
@@ -923,7 +923,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
        percpu_down_read_preempt_disable(&c->mark_lock);
 
        {
-               u64 nr_inodes = percpu_u64_get(&c->usage[0]->s.nr_inodes);
+               u64 nr_inodes = percpu_u64_get(&c->usage[0]->nr_inodes);
                struct jset_entry_usage *u =
                        container_of(entry, struct jset_entry_usage, entry);
 
@@ -970,7 +970,7 @@ bch2_journal_super_entries_add_common(struct bch_fs *c,
        for (i = 0; i < c->replicas.nr; i++) {
                struct bch_replicas_entry *e =
                        cpu_replicas_entry(&c->replicas, i);
-               u64 sectors = percpu_u64_get(&c->usage[0]->data[i]);
+               u64 sectors = percpu_u64_get(&c->usage[0]->replicas[i]);
                struct jset_entry_data_usage *u =
                        container_of(entry, struct jset_entry_data_usage, entry);
 
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index a8eb161585c1f162cd951f5e9747d769a930310e..1528f77e6d30037bfab4de0cd657c04c7e19065d 100644 (file)
@@ -567,7 +567,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 {
        struct bch_sb_field_members *mi;
        struct bch_fs *c;
-       unsigned i, iter_size, fs_usage_size;
+       unsigned i, iter_size;
        const char *err;
 
        pr_verbose_init(opts, "");
@@ -661,9 +661,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                (btree_blocks(c) + 1) * 2 *
                sizeof(struct btree_node_iter_set);
 
-       fs_usage_size = sizeof(struct bch_fs_usage) +
-               sizeof(u64) * c->replicas.nr;
-
        if (!(c->wq = alloc_workqueue("bcachefs",
                                WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
            !(c->copygc_wq = alloc_workqueue("bcache_copygc",
@@ -680,8 +677,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                        max(offsetof(struct btree_read_bio, bio),
                            offsetof(struct btree_write_bio, wbio.bio)),
                        BIOSET_NEED_BVECS) ||
-           !(c->usage[0] = __alloc_percpu(fs_usage_size, sizeof(u64))) ||
-           !(c->usage_scratch = __alloc_percpu(fs_usage_size, sizeof(u64))) ||
            !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
            mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
                                        btree_bytes(c)) ||
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 7e3aebed2c18533dc54c257767124dc345413a2e..b56db15daf118bfcd7ce8e3d3437051ff09000bb 100644 (file)
@@ -243,17 +243,17 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
        pr_buf(&out, "capacity:\t\t\t%llu\n", c->capacity);
 
        pr_buf(&out, "hidden:\t\t\t\t%llu\n",
-              fs_usage->s.hidden);
+              fs_usage->hidden);
        pr_buf(&out, "data:\t\t\t\t%llu\n",
-              fs_usage->s.data);
+              fs_usage->data);
        pr_buf(&out, "cached:\t\t\t\t%llu\n",
-              fs_usage->s.cached);
+              fs_usage->cached);
        pr_buf(&out, "reserved:\t\t\t%llu\n",
-              fs_usage->s.reserved);
+              fs_usage->reserved);
        pr_buf(&out, "nr_inodes:\t\t\t%llu\n",
-              fs_usage->s.nr_inodes);
+              fs_usage->nr_inodes);
        pr_buf(&out, "online reserved:\t\t%llu\n",
-              fs_usage->s.online_reserved);
+              fs_usage->online_reserved);
 
        for (i = 0;
             i < ARRAY_SIZE(fs_usage->persistent_reserved);
@@ -269,7 +269,7 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
 
                pr_buf(&out, "\t");
                bch2_replicas_entry_to_text(&out, e);
-               pr_buf(&out, ":\t%llu\n", fs_usage->data[i]);
+               pr_buf(&out, ":\t%llu\n", fs_usage->replicas[i]);
        }
 
        percpu_up_read_preempt_enable(&c->mark_lock);
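
The sysfs changes track the bch_fs_usage reshuffle: the embedded summary struct (the old fs_usage->s) was flattened into the parent, and the trailing per-replicas counters were renamed from data[] to replicas[]. A sketch of the resulting layout, field set abridged:

#include <stdio.h>

struct fs_usage_sketch {
	/* formerly nested as fs_usage->s.<field>: */
	unsigned long long hidden, data, cached, reserved,
			   nr_inodes, online_reserved;
	/* formerly fs_usage->data[], now fs_usage->replicas[]: */
	unsigned long long replicas[2];
};

int main(void)
{
	struct fs_usage_sketch u = { .data = 4096, .nr_inodes = 12 };

	printf("data:\t\t%llu\n", u.data);          /* was u.s.data */
	printf("nr_inodes:\t%llu\n", u.nr_inodes);  /* was u.s.nr_inodes */
	printf("replicas[0]:\t%llu\n", u.replicas[0]);
	return 0;
}
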