git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to b84661c042 bcachefs: Fix reflink repair code
author Kent Overstreet <kent.overstreet@gmail.com>
Thu, 10 Feb 2022 08:42:28 +0000 (03:42 -0500)
committer Kent Overstreet <kent.overstreet@gmail.com>
Fri, 11 Feb 2022 23:33:16 +0000 (18:33 -0500)
28 files changed:
.bcachefs_revision
cmd_migrate.c
include/linux/compiler.h
include/trace/events/bcachefs.h
libbcachefs/alloc_background.c
libbcachefs/alloc_background.h
libbcachefs/bcachefs.h
libbcachefs/btree_gc.c
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_key_cache.c
libbcachefs/btree_types.h
libbcachefs/btree_update.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_leaf.c
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/buckets_types.h
libbcachefs/buckets_waiting_for_journal.c
libbcachefs/error.c
libbcachefs/fs.c
libbcachefs/inode.c
libbcachefs/inode.h
libbcachefs/journal_io.c
libbcachefs/movinggc.c
libbcachefs/recovery.c
libbcachefs/replicas.c
libbcachefs/super.c

index 71e83e28f05d0f5e6b1c582cfcaa88165682d826..4bc1040c34c0c35c092b34fa346f2b337019c599 100644 (file)
@@ -1 +1 @@
-bf340e68c74cdb70c692698ef7367b9dc6f6e61f
+b84661c042c7d5caaab3f79661d04789070bea78
index fc863f890368f21ad8fd27da88f28508c31b5684..4772b3bdb624a159befa8ac59e2741b9025b733b 100644 (file)
@@ -328,7 +328,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
                bch2_bkey_append_ptr(&e->k_i, (struct bch_extent_ptr) {
                                        .offset = physical,
                                        .dev = 0,
-                                       .gen = bucket(ca, b)->mark.gen,
+                                       .gen = *bucket_gen(ca, b),
                                  });
 
                ret = bch2_disk_reservation_get(c, &res, sectors, 1,
index 2bfbfadb34b250d9f0840aded40060896bb72b37..6d039ea376df93743bd0e488a66b07dd1bfd0ba5 100644 (file)
@@ -60,6 +60,7 @@
 #define unlikely(x)            __builtin_expect(!!(x), 0)
 #define unreachable()          __builtin_unreachable()
 #define __same_type(a, b)      __builtin_types_compatible_p(typeof(a), typeof(b))
+#define fallthrough            __attribute__((__fallthrough__))
 
 #define ___PASTE(a,b) a##b
 #define __PASTE(a,b) ___PASTE(a,b)
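
The fallthrough macro added above wraps the compiler's statement attribute so deliberate switch fall-through can be annotated. Below is a minimal standalone sketch (not bcachefs code) of how such a macro is typically used to satisfy -Wimplicit-fallthrough on gcc/clang; names are invented for illustration.

#include <stdio.h>

#define fallthrough	__attribute__((__fallthrough__))

/* Classify characters; ' ' counts both as whitespace and as a
 * separator, hence the deliberate, annotated fall-through. */
static void count_char(char c, int counts[3])
{
	switch (c) {
	case ' ':
		counts[0]++;
		fallthrough;
	case ',':
	case ';':
		counts[1]++;
		break;
	default:
		counts[2]++;
	}
}

int main(void)
{
	int counts[3] = { 0, 0, 0 };
	const char *s = "a, b; c d";

	while (*s)
		count_char(*s++, counts);

	printf("spaces=%d separators=%d other=%d\n",
	       counts[0], counts[1], counts[2]);
	return 0;
}

Without the annotation (or a break), newer compilers warn when a case that executes statements runs into the next label.
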
index 8f10d13b27d565dff4209e17b8726fbba82d1d43..36c4c88417416d6ab20ee9b7d6dc6f09db05be48 100644 (file)
@@ -658,6 +658,12 @@ DEFINE_EVENT(transaction_restart,  trans_restart_mark_replicas,
        TP_ARGS(trans_fn, caller_ip)
 );
 
+DEFINE_EVENT(transaction_restart,      trans_restart_key_cache_raced,
+       TP_PROTO(const char *trans_fn,
+                unsigned long caller_ip),
+       TP_ARGS(trans_fn, caller_ip)
+);
+
 DECLARE_EVENT_CLASS(transaction_restart_iter,
        TP_PROTO(const char *trans_fn,
                 unsigned long caller_ip,
index 7ad16c21eb08cf2fd8d85e98b77cc907aecb59fd..0a5ec99e7c3ebff0732a8a80145fddd1b130c751 100644 (file)
@@ -39,15 +39,6 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
 #undef x
 };
 
-struct bkey_alloc_buf {
-       struct bkey_i   k;
-       struct bch_alloc_v3 v;
-
-#define x(_name,  _bits)               + _bits / 8
-       u8              _pad[0 + BCH_ALLOC_FIELDS_V2()];
-#undef  x
-} __attribute__((packed, aligned(8)));
-
 /* Persistent alloc info: */
 
 static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
@@ -254,24 +245,25 @@ struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
        return ret;
 }
 
-static void bch2_alloc_pack(struct bch_fs *c,
-                           struct bkey_alloc_buf *dst,
-                           const struct bkey_alloc_unpacked src)
+struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *trans,
+                                      const struct bkey_alloc_unpacked src)
 {
-       bch2_alloc_pack_v3(dst, src);
+       struct bkey_alloc_buf *dst;
+
+       dst = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
+       if (!IS_ERR(dst))
+               bch2_alloc_pack_v3(dst, src);
+
+       return dst;
 }
 
 int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter,
                     struct bkey_alloc_unpacked *u, unsigned trigger_flags)
 {
-       struct bkey_alloc_buf *a;
-
-       a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
-       if (IS_ERR(a))
-               return PTR_ERR(a);
+       struct bkey_alloc_buf *a = bch2_alloc_pack(trans, *u);
 
-       bch2_alloc_pack(trans->c, a, *u);
-       return bch2_trans_update(trans, iter, &a->k, trigger_flags);
+       return PTR_ERR_OR_ZERO(a) ?:
+               bch2_trans_update(trans, iter, &a->k, trigger_flags);
 }
 
 static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
@@ -341,7 +333,7 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
 #undef  x
 }
 
-int bch2_alloc_read(struct bch_fs *c)
+int bch2_alloc_read(struct bch_fs *c, bool gc, bool metadata_only)
 {
        struct btree_trans trans;
        struct btree_iter iter;
@@ -352,108 +344,43 @@ int bch2_alloc_read(struct bch_fs *c)
        int ret;
 
        bch2_trans_init(&trans, c, 0, 0);
-       down_read(&c->gc_lock);
 
        for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
-               if (!bkey_is_alloc(k.k))
-                       continue;
-
                ca = bch_dev_bkey_exists(c, k.k->p.inode);
-               g = bucket(ca, k.k->p.offset);
+               g = __bucket(ca, k.k->p.offset, gc);
                u = bch2_alloc_unpack(k);
 
-               *bucket_gen(ca, k.k->p.offset) = u.gen;
+               if (!gc)
+                       *bucket_gen(ca, k.k->p.offset) = u.gen;
+
                g->_mark.gen            = u.gen;
-               g->_mark.data_type      = u.data_type;
-               g->_mark.dirty_sectors  = u.dirty_sectors;
-               g->_mark.cached_sectors = u.cached_sectors;
-               g->_mark.stripe         = u.stripe != 0;
-               g->stripe               = u.stripe;
-               g->stripe_redundancy    = u.stripe_redundancy;
                g->io_time[READ]        = u.read_time;
                g->io_time[WRITE]       = u.write_time;
-               g->oldest_gen           = u.oldest_gen;
+               g->oldest_gen           = !gc ? u.oldest_gen : u.gen;
                g->gen_valid            = 1;
-       }
-       bch2_trans_iter_exit(&trans, &iter);
 
-       up_read(&c->gc_lock);
-       bch2_trans_exit(&trans);
+               if (!gc ||
+                   (metadata_only &&
+                    (u.data_type == BCH_DATA_user ||
+                     u.data_type == BCH_DATA_cached ||
+                     u.data_type == BCH_DATA_parity))) {
+                       g->_mark.data_type      = u.data_type;
+                       g->_mark.dirty_sectors  = u.dirty_sectors;
+                       g->_mark.cached_sectors = u.cached_sectors;
+                       g->_mark.stripe         = u.stripe != 0;
+                       g->stripe               = u.stripe;
+                       g->stripe_redundancy    = u.stripe_redundancy;
+               }
 
-       if (ret) {
-               bch_err(c, "error reading alloc info: %i", ret);
-               return ret;
        }
+       bch2_trans_iter_exit(&trans, &iter);
 
-       return 0;
-}
-
-static int bch2_alloc_write_key(struct btree_trans *trans,
-                               struct btree_iter *iter,
-                               unsigned flags)
-{
-       struct bch_fs *c = trans->c;
-       struct bkey_s_c k;
-       struct bkey_alloc_unpacked old_u, new_u;
-       int ret;
-retry:
-       bch2_trans_begin(trans);
-
-       ret = bch2_btree_key_cache_flush(trans,
-                       BTREE_ID_alloc, iter->pos);
-       if (ret)
-               goto err;
+       bch2_trans_exit(&trans);
 
-       k = bch2_btree_iter_peek_slot(iter);
-       ret = bkey_err(k);
        if (ret)
-               goto err;
-
-       old_u   = bch2_alloc_unpack(k);
-       new_u   = alloc_mem_to_key(c, iter);
-
-       if (!bkey_alloc_unpacked_cmp(old_u, new_u))
-               return 0;
-
-       ret   = bch2_alloc_write(trans, iter, &new_u,
-                                 BTREE_TRIGGER_NORUN) ?:
-               bch2_trans_commit(trans, NULL, NULL,
-                               BTREE_INSERT_NOFAIL|flags);
-err:
-       if (ret == -EINTR)
-               goto retry;
-       return ret;
-}
-
-int bch2_alloc_write_all(struct bch_fs *c, unsigned flags)
-{
-       struct btree_trans trans;
-       struct btree_iter iter;
-       struct bch_dev *ca;
-       unsigned i;
-       int ret = 0;
-
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
-       bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN,
-                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
-       for_each_member_device(ca, c, i) {
-               bch2_btree_iter_set_pos(&iter,
-                       POS(ca->dev_idx, ca->mi.first_bucket));
+               bch_err(c, "error reading alloc info: %i", ret);
 
-               while (iter.pos.offset < ca->mi.nbuckets) {
-                       ret = bch2_alloc_write_key(&trans, &iter, flags);
-                       if (ret) {
-                               percpu_ref_put(&ca->ref);
-                               goto err;
-                       }
-                       bch2_btree_iter_advance(&iter);
-               }
-       }
-err:
-       bch2_trans_iter_exit(&trans, &iter);
-       bch2_trans_exit(&trans);
        return ret;
 }
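
bch2_alloc_pack() now returns either a transaction-allocated buffer or an encoded error pointer, and the shortened bch2_alloc_write() chains the two steps with PTR_ERR_OR_ZERO(a) ?: bch2_trans_update(...). The following is a standalone sketch of that idiom with simplified userspace stand-ins for the kernel's linux/err.h helpers; the real definitions differ in detail.

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO	4095

/* Simplified stand-ins for the kernel's ERR_PTR helpers: an error code
 * is smuggled through a pointer by casting it into the top MAX_ERRNO
 * bytes of the address space (a simplification here, a guarantee in
 * the kernel). */
static inline void *ERR_PTR(long err)		{ return (void *) err; }
static inline long PTR_ERR(const void *p)	{ return (long) p; }
static inline int IS_ERR(const void *p)
{
	return (unsigned long) p >= (unsigned long) -MAX_ERRNO;
}
static inline int PTR_ERR_OR_ZERO(const void *p)
{
	return IS_ERR(p) ? (int) PTR_ERR(p) : 0;
}

/* Allocation step: returns a buffer or an encoded error, in the same
 * shape as bch2_alloc_pack() returning a buf or ERR_PTR(-ENOMEM). */
static void *alloc_buf(size_t n)
{
	void *p = malloc(n);

	return p ?: ERR_PTR(-ENOMEM);
}

static int use_buf(void *p)
{
	free(p);
	return 0;
}

int main(void)
{
	void *a = alloc_buf(64);

	/* GNU C's "x ?: y" evaluates x once, returns it if nonzero, else y,
	 * so on an allocation error use_buf() is never reached: */
	int ret = PTR_ERR_OR_ZERO(a) ?: use_buf(a);

	printf("ret = %d\n", ret);
	return ret ? 1 : 0;
}

The ?: chaining keeps the early-return error handling out of the caller, which is what the two-line body of the new bch2_alloc_write() relies on.
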
 
index 86b64177b3d0bc2a378e7961aff478363ce0c210..98c7866e20b57ded9f8d629d8427d5966f97bfb5 100644 (file)
@@ -38,40 +38,23 @@ static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l,
        ;
 }
 
+struct bkey_alloc_buf {
+       struct bkey_i   k;
+       struct bch_alloc_v3 v;
+
+#define x(_name,  _bits)               + _bits / 8
+       u8              _pad[0 + BCH_ALLOC_FIELDS_V2()];
+#undef  x
+} __attribute__((packed, aligned(8)));
+
 struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c);
+struct bkey_alloc_buf *bch2_alloc_pack(struct btree_trans *,
+                                      const struct bkey_alloc_unpacked);
 int bch2_alloc_write(struct btree_trans *, struct btree_iter *,
                     struct bkey_alloc_unpacked *, unsigned);
 
 int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int);
 
-static inline struct bkey_alloc_unpacked
-alloc_mem_to_key(struct bch_fs *c, struct btree_iter *iter)
-{
-       struct bch_dev *ca;
-       struct bucket *g;
-       struct bkey_alloc_unpacked ret;
-
-       percpu_down_read(&c->mark_lock);
-       ca      = bch_dev_bkey_exists(c, iter->pos.inode);
-       g       = bucket(ca, iter->pos.offset);
-       ret     = (struct bkey_alloc_unpacked) {
-               .dev            = iter->pos.inode,
-               .bucket         = iter->pos.offset,
-               .gen            = g->mark.gen,
-               .oldest_gen     = g->oldest_gen,
-               .data_type      = g->mark.data_type,
-               .dirty_sectors  = g->mark.dirty_sectors,
-               .cached_sectors = g->mark.cached_sectors,
-               .read_time      = g->io_time[READ],
-               .write_time     = g->io_time[WRITE],
-               .stripe         = g->stripe,
-               .stripe_redundancy = g->stripe_redundancy,
-       };
-       percpu_up_read(&c->mark_lock);
-
-       return ret;
-}
-
 #define ALLOC_SCAN_BATCH(ca)           max_t(size_t, 1, (ca)->mi.nbuckets >> 9)
 
 const char *bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c);
@@ -101,7 +84,7 @@ static inline bool bkey_is_alloc(const struct bkey *k)
                k->type == KEY_TYPE_alloc_v3;
 }
 
-int bch2_alloc_read(struct bch_fs *);
+int bch2_alloc_read(struct bch_fs *, bool, bool);
 
 static inline void bch2_wake_allocator(struct bch_dev *ca)
 {
@@ -139,7 +122,6 @@ void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *);
 void bch2_dev_allocator_stop(struct bch_dev *);
 int bch2_dev_allocator_start(struct bch_dev *);
 
-int bch2_alloc_write_all(struct bch_fs *, unsigned);
 void bch2_fs_allocator_background_init(struct bch_fs *);
 
 #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
index a28ddcd5d7b727ef9b2d34b1219059ddda5a4639..eec02f8a95b724fd19ea6808f696cdb91c1f20bb 100644 (file)
@@ -451,7 +451,8 @@ struct bch_dev {
         * Or rcu_read_lock(), but only for ptr_stale():
         */
        struct bucket_array __rcu *buckets[2];
-       struct bucket_gens      *bucket_gens;
+       struct bucket_gens __rcu *bucket_gens;
+       u8                      *oldest_gen;
        unsigned long           *buckets_nouse;
        struct rw_semaphore     bucket_lock;
 
@@ -536,7 +537,6 @@ enum {
        /* misc: */
        BCH_FS_NEED_ANOTHER_GC,
        BCH_FS_DELETED_NODES,
-       BCH_FS_NEED_ALLOC_WRITE,
        BCH_FS_REBUILD_REPLICAS,
        BCH_FS_HOLD_BTREE_WRITES,
 };
@@ -716,6 +716,7 @@ struct bch_fs {
        bool                    btree_trans_barrier_initialized;
 
        struct btree_key_cache  btree_key_cache;
+       unsigned                btree_key_cache_btrees;
 
        struct workqueue_struct *btree_update_wq;
        struct workqueue_struct *btree_io_complete_wq;
@@ -952,6 +953,11 @@ static inline size_t btree_sectors(const struct bch_fs *c)
        return c->opts.btree_node_size >> 9;
 }
 
+static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
+{
+       return c->btree_key_cache_btrees & (1U << btree);
+}
+
 static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time)
 {
        struct timespec64 t;
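
The new btree_key_cache_btrees field is a bitmask of btree IDs that should go through the key cache, and btree_id_cached() is a membership test on it. A tiny standalone illustration of the pattern follows; the enum here is an abbreviated stand-in for bcachefs's real btree ID list.

#include <stdbool.h>
#include <stdio.h>

/* Abbreviated stand-in for the real enum btree_id: */
enum btree_id {
	BTREE_ID_extents,
	BTREE_ID_inodes,
	BTREE_ID_alloc,
	BTREE_ID_NR,
};

/* One bit per btree, like the new c->btree_key_cache_btrees field: */
static unsigned key_cache_btrees;

static bool btree_id_cached(enum btree_id id)
{
	return key_cache_btrees & (1U << id);
}

int main(void)
{
	key_cache_btrees |= 1U << BTREE_ID_alloc;	/* opt the alloc btree in */

	printf("alloc cached:   %d\n", btree_id_cached(BTREE_ID_alloc));
	printf("extents cached: %d\n", btree_id_cached(BTREE_ID_extents));
	return 0;
}
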
index 809c9a76230302b046dde303c0eb530dfacd0d46..7cab220c74b71494dca1d13d0c9f97ea978bf223 100644 (file)
@@ -9,6 +9,7 @@
 #include "alloc_foreground.h"
 #include "bkey_methods.h"
 #include "bkey_buf.h"
+#include "btree_key_cache.h"
 #include "btree_locking.h"
 #include "btree_update_interior.h"
 #include "btree_io.h"
@@ -533,7 +534,6 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
        bkey_for_each_ptr_decode(k->k, ptrs, p, entry) {
                struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
                struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
-               struct bucket *g2 = PTR_BUCKET(ca, &p.ptr);
                enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr);
 
                if (fsck_err_on(!g->gen_valid, c,
@@ -544,9 +544,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
                                p.ptr.gen,
                                (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
                        if (!p.ptr.cached) {
-                               g2->_mark.gen   = g->_mark.gen          = p.ptr.gen;
-                               g2->gen_valid   = g->gen_valid          = true;
-                               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+                               g->_mark.gen            = p.ptr.gen;
+                               g->gen_valid            = true;
                        } else {
                                do_update = true;
                        }
@@ -560,13 +559,12 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
                                p.ptr.gen, g->mark.gen,
                                (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
                        if (!p.ptr.cached) {
-                               g2->_mark.gen   = g->_mark.gen  = p.ptr.gen;
-                               g2->gen_valid   = g->gen_valid  = true;
-                               g2->_mark.data_type             = 0;
-                               g2->_mark.dirty_sectors         = 0;
-                               g2->_mark.cached_sectors        = 0;
+                               g->_mark.gen            = p.ptr.gen;
+                               g->gen_valid            = true;
+                               g->_mark.data_type      = 0;
+                               g->_mark.dirty_sectors  = 0;
+                               g->_mark.cached_sectors = 0;
                                set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
-                               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
                        } else {
                                do_update = true;
                        }
@@ -603,8 +601,7 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
                                bch2_data_types[data_type],
                                (bch2_bkey_val_to_text(&PBUF(buf), c, *k), buf))) {
                        if (data_type == BCH_DATA_btree) {
-                               g2->_mark.data_type     = g->_mark.data_type    = data_type;
-                               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+                               g->_mark.data_type      = data_type;
                                set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
                        } else {
                                do_update = true;
@@ -1169,13 +1166,14 @@ static int bch2_gc_done(struct bch_fs *c,
        unsigned i, dev;
        int ret = 0;
 
+       percpu_down_write(&c->mark_lock);
+
 #define copy_field(_f, _msg, ...)                                      \
        if (dst->_f != src->_f) {                                       \
                if (verify)                                             \
                        fsck_err(c, _msg ": got %llu, should be %llu"   \
                                , ##__VA_ARGS__, dst->_f, src->_f);     \
                dst->_f = src->_f;                                      \
-               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);            \
        }
 #define copy_stripe_field(_f, _msg, ...)                               \
        if (dst->_f != src->_f) {                                       \
@@ -1185,18 +1183,6 @@ static int bch2_gc_done(struct bch_fs *c,
                                iter.pos, ##__VA_ARGS__,                \
                                dst->_f, src->_f);                      \
                dst->_f = src->_f;                                      \
-               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);            \
-       }
-#define copy_bucket_field(_f)                                          \
-       if (dst->b[b]._f != src->b[b]._f) {                             \
-               if (verify)                                             \
-                       fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f  \
-                               ": got %u, should be %u", dev, b,       \
-                               dst->b[b].mark.gen,                     \
-                               bch2_data_types[dst->b[b].mark.data_type],\
-                               dst->b[b]._f, src->b[b]._f);            \
-               dst->b[b]._f = src->b[b]._f;                            \
-               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);            \
        }
 #define copy_dev_field(_f, _msg, ...)                                  \
        copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
@@ -1207,36 +1193,18 @@ static int bch2_gc_done(struct bch_fs *c,
                bch2_fs_usage_acc_to_base(c, i);
 
        for_each_member_device(ca, c, dev) {
-               struct bucket_array *dst = __bucket_array(ca, 0);
-               struct bucket_array *src = __bucket_array(ca, 1);
-               size_t b;
-
-               for (b = 0; b < src->nbuckets; b++) {
-                       copy_bucket_field(_mark.gen);
-                       copy_bucket_field(_mark.data_type);
-                       copy_bucket_field(_mark.stripe);
-                       copy_bucket_field(_mark.dirty_sectors);
-                       copy_bucket_field(_mark.cached_sectors);
-                       copy_bucket_field(stripe_redundancy);
-                       copy_bucket_field(stripe);
-
-                       dst->b[b].oldest_gen = src->b[b].oldest_gen;
-               }
-
-               {
-                       struct bch_dev_usage *dst = ca->usage_base;
-                       struct bch_dev_usage *src = (void *)
-                               bch2_acc_percpu_u64s((void *) ca->usage_gc,
-                                                    dev_usage_u64s());
-
-                       copy_dev_field(buckets_ec,              "buckets_ec");
-                       copy_dev_field(buckets_unavailable,     "buckets_unavailable");
-
-                       for (i = 0; i < BCH_DATA_NR; i++) {
-                               copy_dev_field(d[i].buckets,    "%s buckets", bch2_data_types[i]);
-                               copy_dev_field(d[i].sectors,    "%s sectors", bch2_data_types[i]);
-                               copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
-                       }
+               struct bch_dev_usage *dst = ca->usage_base;
+               struct bch_dev_usage *src = (void *)
+                       bch2_acc_percpu_u64s((void *) ca->usage_gc,
+                                            dev_usage_u64s());
+
+               copy_dev_field(buckets_ec,              "buckets_ec");
+               copy_dev_field(buckets_unavailable,     "buckets_unavailable");
+
+               for (i = 0; i < BCH_DATA_NR; i++) {
+                       copy_dev_field(d[i].buckets,    "%s buckets", bch2_data_types[i]);
+                       copy_dev_field(d[i].sectors,    "%s sectors", bch2_data_types[i]);
+                       copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]);
                }
        };
 
@@ -1278,7 +1246,6 @@ static int bch2_gc_done(struct bch_fs *c,
 
 #undef copy_fs_field
 #undef copy_dev_field
-#undef copy_bucket_field
 #undef copy_stripe_field
 #undef copy_field
 fsck_err:
@@ -1286,6 +1253,8 @@ fsck_err:
                percpu_ref_put(&ca->ref);
        if (ret)
                bch_err(c, "%s: ret %i", __func__, ret);
+
+       percpu_up_write(&c->mark_lock);
        return ret;
 }
 
@@ -1308,15 +1277,6 @@ static int bch2_gc_start(struct bch_fs *c,
                BUG_ON(ca->buckets[1]);
                BUG_ON(ca->usage_gc);
 
-               ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) +
-                               ca->mi.nbuckets * sizeof(struct bucket),
-                               GFP_KERNEL|__GFP_ZERO);
-               if (!ca->buckets[1]) {
-                       percpu_ref_put(&ca->ref);
-                       bch_err(c, "error allocating ca->buckets[gc]");
-                       return -ENOMEM;
-               }
-
                ca->usage_gc = alloc_percpu(struct bch_dev_usage);
                if (!ca->usage_gc) {
                        bch_err(c, "error allocating ca->usage_gc");
@@ -1325,33 +1285,151 @@ static int bch2_gc_start(struct bch_fs *c,
                }
        }
 
-       percpu_down_write(&c->mark_lock);
+       return 0;
+}
+
+static int bch2_alloc_write_key(struct btree_trans *trans,
+                               struct btree_iter *iter,
+                               bool initial, bool metadata_only)
+{
+       struct bch_fs *c = trans->c;
+       struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
+       struct bucket *g;
+       struct bkey_s_c k;
+       struct bkey_alloc_unpacked old_u, new_u, gc_u;
+       struct bkey_alloc_buf *a;
+       int ret;
+
+       k = bch2_btree_iter_peek_slot(iter);
+       ret = bkey_err(k);
+       if (ret)
+               return ret;
+
+       old_u = new_u = bch2_alloc_unpack(k);
+
+       percpu_down_read(&c->mark_lock);
+       g       = gc_bucket(ca, iter->pos.offset);
+       gc_u = (struct bkey_alloc_unpacked) {
+               .dev            = iter->pos.inode,
+               .bucket         = iter->pos.offset,
+               .gen            = g->mark.gen,
+               .oldest_gen     = g->oldest_gen,
+               .data_type      = g->mark.data_type,
+               .dirty_sectors  = g->mark.dirty_sectors,
+               .cached_sectors = g->mark.cached_sectors,
+               .read_time      = g->io_time[READ],
+               .write_time     = g->io_time[WRITE],
+               .stripe         = g->stripe,
+               .stripe_redundancy = g->stripe_redundancy,
+       };
+       percpu_up_read(&c->mark_lock);
+
+       if (metadata_only &&
+           gc_u.data_type != BCH_DATA_sb &&
+           gc_u.data_type != BCH_DATA_journal &&
+           gc_u.data_type != BCH_DATA_btree)
+               return 0;
+
+       if (!bkey_alloc_unpacked_cmp(old_u, gc_u) ||
+           gen_after(old_u.gen, gc_u.gen))
+               return 0;
+
+#define copy_bucket_field(_f)                                          \
+       if (fsck_err_on(new_u._f != gc_u._f, c,                         \
+                       "bucket %llu:%llu gen %u data type %s has wrong " #_f   \
+                       ": got %u, should be %u",                       \
+                       iter->pos.inode, iter->pos.offset,              \
+                       new_u.gen,                                      \
+                       bch2_data_types[new_u.data_type],               \
+                       new_u._f, gc_u._f))                             \
+               new_u._f = gc_u._f;                                     \
+
+       copy_bucket_field(gen);
+       copy_bucket_field(data_type);
+       copy_bucket_field(stripe);
+       copy_bucket_field(dirty_sectors);
+       copy_bucket_field(cached_sectors);
+       copy_bucket_field(stripe_redundancy);
+       copy_bucket_field(stripe);
+#undef copy_bucket_field
+
+       new_u.oldest_gen = gc_u.oldest_gen;
+
+       if (!bkey_alloc_unpacked_cmp(old_u, new_u))
+               return 0;
+
+       a = bch2_alloc_pack(trans, new_u);
+       if (IS_ERR(a))
+               return PTR_ERR(a);
+
+       ret = initial
+               ? bch2_journal_key_insert(c, BTREE_ID_alloc, 0, &a->k)
+               : bch2_trans_update(trans, iter, &a->k, BTREE_TRIGGER_NORUN);
+fsck_err:
+       return ret;
+}
+
+static int bch2_gc_alloc_done(struct bch_fs *c, bool initial, bool metadata_only)
+{
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bch_dev *ca;
+       unsigned i;
+       int ret = 0;
+
+       bch2_trans_init(&trans, c, 0, 0);
 
        for_each_member_device(ca, c, i) {
-               struct bucket_array *dst = __bucket_array(ca, 1);
-               struct bucket_array *src = __bucket_array(ca, 0);
-               size_t b;
+               for_each_btree_key(&trans, iter, BTREE_ID_alloc,
+                                  POS(ca->dev_idx, ca->mi.first_bucket),
+                                  BTREE_ITER_SLOTS|
+                                  BTREE_ITER_PREFETCH, k, ret) {
+                       if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0)
+                               break;
 
-               dst->first_bucket       = src->first_bucket;
-               dst->nbuckets           = src->nbuckets;
+                       ret = __bch2_trans_do(&trans, NULL, NULL,
+                                             BTREE_INSERT_LAZY_RW,
+                                       bch2_alloc_write_key(&trans, &iter,
+                                                            initial, metadata_only));
+                       if (ret)
+                               break;
+               }
+               bch2_trans_iter_exit(&trans, &iter);
 
-               for (b = 0; b < src->nbuckets; b++) {
-                       struct bucket *d = &dst->b[b];
-                       struct bucket *s = &src->b[b];
+               if (ret) {
+                       bch_err(c, "error writing alloc info: %i", ret);
+                       percpu_ref_put(&ca->ref);
+                       break;
+               }
+       }
 
-                       d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen;
-                       d->gen_valid = s->gen_valid;
+       bch2_trans_exit(&trans);
+       return ret;
+}
 
-                       if (metadata_only &&
-                           (s->mark.data_type == BCH_DATA_user ||
-                            s->mark.data_type == BCH_DATA_cached))
-                               d->_mark = s->mark;
+static int bch2_gc_alloc_start(struct bch_fs *c, bool initial, bool metadata_only)
+{
+       struct bch_dev *ca;
+       unsigned i;
+
+       for_each_member_device(ca, c, i) {
+               struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
+                               ca->mi.nbuckets * sizeof(struct bucket),
+                               GFP_KERNEL|__GFP_ZERO);
+               if (!buckets) {
+                       percpu_ref_put(&ca->ref);
+                       percpu_up_write(&c->mark_lock);
+                       bch_err(c, "error allocating ca->buckets[gc]");
+                       return -ENOMEM;
                }
-       };
 
-       percpu_up_write(&c->mark_lock);
+               buckets->first_bucket   = ca->mi.first_bucket;
+               buckets->nbuckets       = ca->mi.nbuckets;
+               rcu_assign_pointer(ca->buckets[1], buckets);
+       };
 
-       return 0;
+       return bch2_alloc_read(c, true, metadata_only);
 }
 
 static void bch2_gc_alloc_reset(struct bch_fs *c, bool initial, bool metadata_only)
@@ -1423,10 +1501,18 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
 
                        bkey_reassemble(new, k);
 
-                       if (!r->refcount)
+                       if (!r->refcount) {
                                new->k.type = KEY_TYPE_deleted;
-                       else
+                               /*
+                                * XXX ugly: bch2_journal_key_insert() queues up
+                                * the key for the journal replay code, which
+                                * doesn't run the extent overwrite pass
+                                */
+                               if (initial)
+                                       new->k.size = 0;
+                       } else {
                                *bkey_refcount(new) = cpu_to_le64(r->refcount);
+                       }
 
                        ret = initial
                               ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, new)
@@ -1598,6 +1684,7 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
                           !bch2_btree_interior_updates_nr_pending(c));
 
        ret   = bch2_gc_start(c, metadata_only) ?:
+               bch2_gc_alloc_start(c, initial, metadata_only) ?:
                bch2_gc_reflink_start(c, initial, metadata_only);
        if (ret)
                goto out;
@@ -1665,16 +1752,15 @@ out:
        if (!ret) {
                bch2_journal_block(&c->journal);
 
-               percpu_down_write(&c->mark_lock);
-               ret   = bch2_gc_reflink_done(c, initial, metadata_only) ?:
-                       bch2_gc_stripes_done(c, initial, metadata_only) ?:
+               ret   = bch2_gc_stripes_done(c, initial, metadata_only) ?:
+                       bch2_gc_reflink_done(c, initial, metadata_only) ?:
+                       bch2_gc_alloc_done(c, initial, metadata_only) ?:
                        bch2_gc_done(c, initial, metadata_only);
 
                bch2_journal_unblock(&c->journal);
-       } else {
-               percpu_down_write(&c->mark_lock);
        }
 
+       percpu_down_write(&c->mark_lock);
        /* Indicates that gc is no longer in progress: */
        __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
 
@@ -1709,9 +1795,8 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
        percpu_down_read(&c->mark_lock);
        bkey_for_each_ptr(ptrs, ptr) {
                struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-               struct bucket *g = PTR_BUCKET(ca, ptr);
 
-               if (gen_after(g->mark.gen, ptr->gen) > 16) {
+               if (ptr_stale(ca, ptr) > 16) {
                        percpu_up_read(&c->mark_lock);
                        return true;
                }
@@ -1719,10 +1804,10 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
 
        bkey_for_each_ptr(ptrs, ptr) {
                struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-               struct bucket *g = PTR_BUCKET(ca, ptr);
+               u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)];
 
-               if (gen_after(g->gc_gen, ptr->gen))
-                       g->gc_gen = ptr->gen;
+               if (gen_after(*gen, ptr->gen))
+                       *gen = ptr->gen;
        }
        percpu_up_read(&c->mark_lock);
 
@@ -1733,23 +1818,22 @@ static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
  * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree
  * node pointers currently never have cached pointers that can become stale:
  */
-static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
+static int bch2_gc_btree_gens(struct btree_trans *trans, enum btree_id btree_id)
 {
-       struct btree_trans trans;
+       struct bch_fs *c = trans->c;
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_buf sk;
        int ret = 0, commit_err = 0;
 
        bch2_bkey_buf_init(&sk);
-       bch2_trans_init(&trans, c, 0, 0);
 
-       bch2_trans_iter_init(&trans, &iter, btree_id, POS_MIN,
+       bch2_trans_iter_init(trans, &iter, btree_id, POS_MIN,
                             BTREE_ITER_PREFETCH|
                             BTREE_ITER_NOT_EXTENTS|
                             BTREE_ITER_ALL_SNAPSHOTS);
 
-       while ((bch2_trans_begin(&trans),
+       while ((bch2_trans_begin(trans),
                k = bch2_btree_iter_peek(&iter)).k) {
                ret = bkey_err(k);
 
@@ -1765,10 +1849,10 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
                        bch2_extent_normalize(c, bkey_i_to_s(sk.k));
 
                        commit_err =
-                               bch2_trans_update(&trans, &iter, sk.k, 0) ?:
-                               bch2_trans_commit(&trans, NULL, NULL,
-                                                      BTREE_INSERT_NOWAIT|
-                                                      BTREE_INSERT_NOFAIL);
+                               bch2_trans_update(trans, &iter, sk.k, 0) ?:
+                               bch2_trans_commit(trans, NULL, NULL,
+                                                 BTREE_INSERT_NOWAIT|
+                                                 BTREE_INSERT_NOFAIL);
                        if (commit_err == -EINTR) {
                                commit_err = 0;
                                continue;
@@ -1777,20 +1861,42 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
 
                bch2_btree_iter_advance(&iter);
        }
-       bch2_trans_iter_exit(&trans, &iter);
+       bch2_trans_iter_exit(trans, &iter);
 
-       bch2_trans_exit(&trans);
        bch2_bkey_buf_exit(&sk, c);
 
        return ret;
 }
 
+static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter)
+{
+       struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode);
+       struct bkey_s_c k;
+       struct bkey_alloc_unpacked u;
+       int ret;
+
+       k = bch2_btree_iter_peek_slot(iter);
+       ret = bkey_err(k);
+       if (ret)
+               return ret;
+
+       u = bch2_alloc_unpack(k);
+
+       if (u.oldest_gen == ca->oldest_gen[iter->pos.offset])
+               return 0;
+
+       u.oldest_gen = ca->oldest_gen[iter->pos.offset];
+
+       return bch2_alloc_write(trans, iter, &u, BTREE_TRIGGER_NORUN);
+}
+
 int bch2_gc_gens(struct bch_fs *c)
 {
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
        struct bch_dev *ca;
-       struct bucket_array *buckets;
-       struct bucket *g;
-       u64 start_time = local_clock();
+       u64 b, start_time = local_clock();
        unsigned i;
        int ret;
 
@@ -1800,21 +1906,32 @@ int bch2_gc_gens(struct bch_fs *c)
         * lock at the start of going RO, thus the gc thread may get stuck:
         */
        down_read(&c->gc_lock);
+       bch2_trans_init(&trans, c, 0, 0);
 
        for_each_member_device(ca, c, i) {
-               down_read(&ca->bucket_lock);
-               buckets = bucket_array(ca);
+               struct bucket_gens *gens;
+
+               BUG_ON(ca->oldest_gen);
+
+               ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL);
+               if (!ca->oldest_gen) {
+                       percpu_ref_put(&ca->ref);
+                       ret = -ENOMEM;
+                       goto err;
+               }
+
+               gens = bucket_gens(ca);
 
-               for_each_bucket(g, buckets)
-                       g->gc_gen = g->mark.gen;
-               up_read(&ca->bucket_lock);
+               for (b = gens->first_bucket;
+                    b < gens->nbuckets; b++)
+                       ca->oldest_gen[b] = gens->b[b];
        }
 
        for (i = 0; i < BTREE_ID_NR; i++)
                if ((1 << i) & BTREE_ID_HAS_PTRS) {
                        c->gc_gens_btree = i;
                        c->gc_gens_pos = POS_MIN;
-                       ret = bch2_gc_btree_gens(c, i);
+                       ret = bch2_gc_btree_gens(&trans, i);
                        if (ret) {
                                bch_err(c, "error recalculating oldest_gen: %i", ret);
                                goto err;
@@ -1822,12 +1939,28 @@ int bch2_gc_gens(struct bch_fs *c)
                }
 
        for_each_member_device(ca, c, i) {
-               down_read(&ca->bucket_lock);
-               buckets = bucket_array(ca);
+               for_each_btree_key(&trans, iter, BTREE_ID_alloc,
+                                  POS(ca->dev_idx, ca->mi.first_bucket),
+                                  BTREE_ITER_SLOTS|
+                                  BTREE_ITER_PREFETCH, k, ret) {
+                       if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0)
+                               break;
+
+                       ret = __bch2_trans_do(&trans, NULL, NULL,
+                                             BTREE_INSERT_LAZY_RW|
+                                             BTREE_INSERT_NOFAIL,
+                                       bch2_alloc_write_oldest_gen(&trans, &iter));
+                       if (ret) {
+                               bch_err(c, "error writing oldest_gen: %i", ret);
+                               break;
+                       }
+               }
+               bch2_trans_iter_exit(&trans, &iter);
 
-               for_each_bucket(g, buckets)
-                       g->oldest_gen = g->gc_gen;
-               up_read(&ca->bucket_lock);
+               if (ret) {
+                       percpu_ref_put(&ca->ref);
+                       break;
+               }
        }
 
        c->gc_gens_btree        = 0;
@@ -1837,6 +1970,12 @@ int bch2_gc_gens(struct bch_fs *c)
 
        bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
 err:
+       for_each_member_device(ca, c, i) {
+               kvfree(ca->oldest_gen);
+               ca->oldest_gen = NULL;
+       }
+
+       bch2_trans_exit(&trans);
        up_read(&c->gc_lock);
        return ret;
 }
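
The gens walk above compares 8-bit bucket generation numbers that are allowed to wrap, through helpers like gen_after() and ptr_stale(). Below is a standalone sketch of wraparound-safe generation comparison in that spirit; the real helpers live in the buckets headers and may differ in detail.

#include <assert.h>
#include <stdint.h>

typedef uint8_t u8;
typedef int8_t  s8;

/* Signed 8-bit difference: positive when @a is ahead of @b, even when
 * the counter has wrapped past 255. */
static inline int gen_cmp(u8 a, u8 b)
{
	return (s8) (a - b);
}

/* How many generations @a is ahead of @b, or 0 if it isn't ahead: */
static inline int gen_after(u8 a, u8 b)
{
	int r = gen_cmp(a, b);

	return r > 0 ? r : 0;
}

int main(void)
{
	assert(gen_after(2, 255) == 3);		/* wrapped, still "after" */
	assert(gen_after(20, 4) == 16);		/* e.g. a pointer 16 gens stale */
	assert(gen_after(4, 20) == 0);		/* not after at all */
	return 0;
}

This is why the check in gc_btree_gens_key() can use "ptr_stale(ca, ptr) > 16" without worrying about the u8 generation counter wrapping.
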
index efe9b8cb9f1cde340ab7f481cfc9497b86f2c567..8505ad5c05177827f8b14bc02132005877d4185d 100644 (file)
@@ -1717,8 +1717,8 @@ bch2_btree_path_make_mut(struct btree_trans *trans,
        return path;
 }
 
-static struct btree_path * __must_check
-btree_path_set_pos(struct btree_trans *trans,
+struct btree_path * __must_check
+bch2_btree_path_set_pos(struct btree_trans *trans,
                   struct btree_path *path, struct bpos new_pos,
                   bool intent, unsigned long ip)
 {
@@ -1932,7 +1932,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
            path_pos->btree_id  == btree_id &&
            path_pos->level     == level) {
                __btree_path_get(path_pos, intent);
-               path = btree_path_set_pos(trans, path_pos, pos, intent, ip);
+               path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
        } else {
                path = btree_path_alloc(trans, path_pos);
                path_pos = NULL;
@@ -1983,13 +1983,13 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct
 
        struct bkey_s_c k;
 
-       BUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
-
        if (!path->cached) {
                struct btree_path_level *l = path_l(path);
-               struct bkey_packed *_k =
-                       bch2_btree_node_iter_peek_all(&l->iter, l->b);
+               struct bkey_packed *_k;
+
+               EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
 
+               _k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
                k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null;
 
                EBUG_ON(k.k && bkey_deleted(k.k) && bpos_cmp(k.k->p, path->pos) == 0);
@@ -1999,12 +1999,15 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct
        } else {
                struct bkey_cached *ck = (void *) path->l[0].b;
 
-               EBUG_ON(path->btree_id != ck->key.btree_id ||
-                       bkey_cmp(path->pos, ck->key.pos));
+               EBUG_ON(ck &&
+                       (path->btree_id != ck->key.btree_id ||
+                        bkey_cmp(path->pos, ck->key.pos)));
 
-               /* BTREE_ITER_CACHED_NOFILL? */
-               if (unlikely(!ck->valid))
-                       goto hole;
+               /* BTREE_ITER_CACHED_NOFILL|BTREE_ITER_CACHED_NOCREATE? */
+               if (unlikely(!ck || !ck->valid))
+                       return bkey_s_c_null;
+
+               EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE);
 
                k = bkey_i_to_s_c(ck->k);
        }
@@ -2029,7 +2032,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
 {
        int ret;
 
-       iter->path = btree_path_set_pos(iter->trans, iter->path,
+       iter->path = bch2_btree_path_set_pos(iter->trans, iter->path,
                                        btree_iter_search_key(iter),
                                        iter->flags & BTREE_ITER_INTENT,
                                        btree_iter_ip_allocated(iter));
@@ -2066,7 +2069,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
        bkey_init(&iter->k);
        iter->k.p = iter->pos = b->key.k.p;
 
-       iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p,
+       iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
                                        iter->flags & BTREE_ITER_INTENT,
                                        btree_iter_ip_allocated(iter));
        iter->path->should_be_locked = true;
@@ -2128,7 +2131,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
                 * the next child node
                 */
                path = iter->path =
-                       btree_path_set_pos(trans, path, bpos_successor(iter->pos),
+                       bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos),
                                           iter->flags & BTREE_ITER_INTENT,
                                           btree_iter_ip_allocated(iter));
 
@@ -2151,7 +2154,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
        bkey_init(&iter->k);
        iter->k.p = iter->pos = b->key.k.p;
 
-       iter->path = btree_path_set_pos(trans, iter->path, b->key.k.p,
+       iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
                                        iter->flags & BTREE_ITER_INTENT,
                                        btree_iter_ip_allocated(iter));
        iter->path->should_be_locked = true;
@@ -2247,18 +2250,52 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
        return k;
 }
 
+/*
+ * Checks btree key cache for key at iter->pos and returns it if present, or
+ * bkey_s_c_null:
+ */
+static noinline
+struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos)
+{
+       struct btree_trans *trans = iter->trans;
+       struct bch_fs *c = trans->c;
+       struct bkey u;
+       int ret;
+
+       if (!bch2_btree_key_cache_find(c, iter->btree_id, pos))
+               return bkey_s_c_null;
+
+       if (!iter->key_cache_path)
+               iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos,
+                                                    iter->flags & BTREE_ITER_INTENT, 0,
+                                                    iter->flags|BTREE_ITER_CACHED,
+                                                    _THIS_IP_);
+
+       iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos,
+                                       iter->flags & BTREE_ITER_INTENT,
+                                       btree_iter_ip_allocated(iter));
+
+       ret = bch2_btree_path_traverse(trans, iter->key_cache_path, iter->flags|BTREE_ITER_CACHED);
+       if (unlikely(ret))
+               return bkey_s_c_err(ret);
+
+       iter->key_cache_path->should_be_locked = true;
+
+       return bch2_btree_path_peek_slot(iter->key_cache_path, &u);
+}
+
 static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
 {
        struct btree_trans *trans = iter->trans;
        struct bkey_i *next_update;
-       struct bkey_s_c k;
+       struct bkey_s_c k, k2;
        int ret;
 
        EBUG_ON(iter->path->cached || iter->path->level);
        bch2_btree_iter_verify(iter);
 
        while (1) {
-               iter->path = btree_path_set_pos(trans, iter->path, search_key,
+               iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
                                        iter->flags & BTREE_ITER_INTENT,
                                        btree_iter_ip_allocated(iter));
 
@@ -2270,8 +2307,23 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
                        goto out;
                }
 
+               iter->path->should_be_locked = true;
+
                k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k);
 
+               if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
+                   k.k &&
+                   (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) {
+                       ret = bkey_err(k2);
+                       if (ret) {
+                               k = k2;
+                               goto out;
+                       }
+
+                       k = k2;
+                       iter->k = *k.k;
+               }
+
                if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
                        k = btree_trans_peek_journal(trans, iter, k);
 
@@ -2368,7 +2420,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
                        __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT);
                        iter->update_path = iter->path;
 
-                       iter->update_path = btree_path_set_pos(trans,
+                       iter->update_path = bch2_btree_path_set_pos(trans,
                                                iter->update_path, pos,
                                                iter->flags & BTREE_ITER_INTENT,
                                                btree_iter_ip_allocated(iter));
@@ -2407,7 +2459,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
        else if (bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
                iter->pos = bkey_start_pos(k.k);
 
-       iter->path = btree_path_set_pos(trans, iter->path, k.k->p,
+       iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p,
                                iter->flags & BTREE_ITER_INTENT,
                                btree_iter_ip_allocated(iter));
        BUG_ON(!iter->path->nodes_locked);
@@ -2471,7 +2523,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
                search_key.snapshot = U32_MAX;
 
        while (1) {
-               iter->path = btree_path_set_pos(trans, iter->path, search_key,
+               iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
                                                iter->flags & BTREE_ITER_INTENT,
                                                btree_iter_ip_allocated(iter));
 
@@ -2602,7 +2654,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
        }
 
        search_key = btree_iter_search_key(iter);
-       iter->path = btree_path_set_pos(trans, iter->path, search_key,
+       iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key,
                                        iter->flags & BTREE_ITER_INTENT,
                                        btree_iter_ip_allocated(iter));
 
@@ -2631,6 +2683,13 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
                        goto out;
                }
 
+               if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) &&
+                   (k = btree_trans_peek_key_cache(iter, iter->pos)).k) {
+                       if (!bkey_err(k))
+                               iter->k = *k.k;
+                       goto out;
+               }
+
                k = bch2_btree_path_peek_slot(iter->path, &iter->k);
        } else {
                struct bpos next;
@@ -2820,8 +2879,12 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
        if (iter->update_path)
                bch2_path_put(trans, iter->update_path,
                              iter->flags & BTREE_ITER_INTENT);
+       if (iter->key_cache_path)
+               bch2_path_put(trans, iter->key_cache_path,
+                             iter->flags & BTREE_ITER_INTENT);
        iter->path = NULL;
        iter->update_path = NULL;
+       iter->key_cache_path = NULL;
 }
 
 static void __bch2_trans_iter_init(struct btree_trans *trans,
@@ -2849,9 +2912,16 @@ static void __bch2_trans_iter_init(struct btree_trans *trans,
        if (!test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags))
                flags |= BTREE_ITER_WITH_JOURNAL;
 
+       if (!btree_id_cached(trans->c, btree_id)) {
+               flags &= ~BTREE_ITER_CACHED;
+               flags &= ~BTREE_ITER_WITH_KEY_CACHE;
+       } else if (!(flags & BTREE_ITER_CACHED))
+               flags |= BTREE_ITER_WITH_KEY_CACHE;
+
        iter->trans     = trans;
        iter->path      = NULL;
        iter->update_path = NULL;
+       iter->key_cache_path = NULL;
        iter->btree_id  = btree_id;
        iter->min_depth = depth;
        iter->flags     = flags;
@@ -2902,6 +2972,7 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
                __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
        if (src->update_path)
                __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
+       dst->key_cache_path = NULL;
 }
 
 void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
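
With BTREE_ITER_WITH_KEY_CACHE set, the peek paths above consult btree_trans_peek_key_cache() for the key's position and, on a hit, the cached value takes precedence over what the btree holds. A much-simplified standalone sketch of that cache-wins lookup order follows; the types and names are invented for illustration, whereas the real code deals in btree_path and bkey_s_c.

#include <stdio.h>

struct val {
	int	present;
	int	data;
};

#define CACHE_SLOTS 8

static struct {
	int		pos;
	struct val	v;
	int		used;
} cache[CACHE_SLOTS];

/* Stand-in for the slower walk of the primary structure (the btree): */
static struct val primary_lookup(int pos)
{
	return (struct val) { .present = 1, .data = pos * 10 };
}

/* Cache-wins lookup: a cached entry at @pos takes precedence, and the
 * primary structure is only consulted on a miss. */
static struct val lookup(int pos)
{
	int i;

	for (i = 0; i < CACHE_SLOTS; i++)
		if (cache[i].used && cache[i].pos == pos)
			return cache[i].v;

	return primary_lookup(pos);
}

int main(void)
{
	cache[0].pos  = 3;
	cache[0].v    = (struct val) { .present = 1, .data = 999 };
	cache[0].used = 1;

	printf("%d\n", lookup(3).data);	/* 999: served from the cache */
	printf("%d\n", lookup(5).data);	/* 50: fell through to the primary lookup */
	return 0;
}
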
index 5205d53ce8dc15bff99af001a8e716ad3bd623ca..759c7b52f4a24f34ddf061735341c27aee49bee1 100644 (file)
@@ -50,11 +50,6 @@ static inline struct btree *btree_node_parent(struct btree_path *path,
        return btree_path_node(path, b->c.level + 1);
 }
 
-static inline int btree_iter_err(const struct btree_iter *iter)
-{
-       return iter->flags & BTREE_ITER_ERROR ? -EIO : 0;
-}
-
 /* Iterate over paths within a transaction: */
 
 static inline struct btree_path *
@@ -132,6 +127,9 @@ __trans_next_path_with_node(struct btree_trans *trans, struct btree *b,
 struct btree_path * __must_check
 bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *,
                         bool, unsigned long);
+struct btree_path * __must_check
+bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *,
+                       struct bpos, bool, unsigned long);
 int __must_check bch2_btree_path_traverse(struct btree_trans *,
                                          struct btree_path *, unsigned);
 struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
index faed51e7f4b86227f4f2ab8c70bfa12155fd047d..df016c98e8dcae2f57d6fb5e0773783a3a7dabcc 100644 (file)
@@ -208,19 +208,21 @@ static int btree_key_cache_fill(struct btree_trans *trans,
                                struct btree_path *ck_path,
                                struct bkey_cached *ck)
 {
-       struct btree_iter iter;
+       struct btree_path *path;
        struct bkey_s_c k;
        unsigned new_u64s = 0;
        struct bkey_i *new_k = NULL;
+       struct bkey u;
        int ret;
 
-       bch2_trans_iter_init(trans, &iter, ck->key.btree_id,
-                            ck->key.pos, BTREE_ITER_SLOTS);
-       k = bch2_btree_iter_peek_slot(&iter);
-       ret = bkey_err(k);
+       path = bch2_path_get(trans, ck->key.btree_id,
+                            ck->key.pos, 0, 0, 0, _THIS_IP_);
+       ret = bch2_btree_path_traverse(trans, path, 0);
        if (ret)
                goto err;
 
+       k = bch2_btree_path_peek_slot(path, &u);
+
        if (!bch2_btree_node_relock(trans, ck_path, 0)) {
                trace_trans_restart_relock_key_cache_fill(trans->fn,
                                _THIS_IP_, ck_path->btree_id, &ck_path->pos);
@@ -261,9 +263,9 @@ static int btree_key_cache_fill(struct btree_trans *trans,
        bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b);
 
        /* We're not likely to need this iterator again: */
-       set_btree_iter_dontneed(&iter);
+       path->preserve = false;
 err:
-       bch2_trans_iter_exit(trans, &iter);
+       bch2_path_put(trans, path, 0);
        return ret;
 }
 
@@ -384,21 +386,27 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
                             BTREE_ITER_CACHED_NOFILL|
                             BTREE_ITER_CACHED_NOCREATE|
                             BTREE_ITER_INTENT);
+       b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE;
+
        ret = bch2_btree_iter_traverse(&c_iter);
        if (ret)
                goto out;
 
        ck = (void *) c_iter.path->l[0].b;
-       if (!ck ||
-           (journal_seq && ck->journal.seq != journal_seq))
+       if (!ck)
                goto out;
 
        if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
-               if (!evict)
-                       goto out;
-               goto evict;
+               if (evict)
+                       goto evict;
+               goto out;
        }
 
+       BUG_ON(!ck->valid);
+
+       if (journal_seq && ck->journal.seq != journal_seq)
+               goto out;
+
        /*
         * Since journal reclaim depends on us making progress here, and the
         * allocator/copygc depend on journal reclaim making progress, we need
@@ -406,6 +414,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
         * */
        ret   = bch2_btree_iter_traverse(&b_iter) ?:
                bch2_trans_update(trans, &b_iter, ck->k,
+                                 BTREE_UPDATE_KEY_CACHE_RECLAIM|
                                  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
                                  BTREE_TRIGGER_NORUN) ?:
                bch2_trans_commit(trans, NULL, NULL,
index 65f460e3c567241cc68148d9820c3a5ba187a461..989129f9f76ce2f7421a10b07f68a97ae75728d2 100644 (file)
@@ -202,10 +202,10 @@ struct btree_node_iter {
  */
 #define BTREE_ITER_IS_EXTENTS          (1 << 4)
 #define BTREE_ITER_NOT_EXTENTS         (1 << 5)
-#define BTREE_ITER_ERROR               (1 << 6)
-#define BTREE_ITER_CACHED              (1 << 7)
-#define BTREE_ITER_CACHED_NOFILL       (1 << 8)
-#define BTREE_ITER_CACHED_NOCREATE     (1 << 9)
+#define BTREE_ITER_CACHED              (1 << 6)
+#define BTREE_ITER_CACHED_NOFILL       (1 << 7)
+#define BTREE_ITER_CACHED_NOCREATE     (1 << 8)
+#define BTREE_ITER_WITH_KEY_CACHE      (1 << 9)
 #define BTREE_ITER_WITH_UPDATES                (1 << 10)
 #define BTREE_ITER_WITH_JOURNAL                (1 << 11)
 #define __BTREE_ITER_ALL_SNAPSHOTS     (1 << 12)
@@ -277,6 +277,7 @@ struct btree_iter {
        struct btree_trans      *trans;
        struct btree_path       *path;
        struct btree_path       *update_path;
+       struct btree_path       *key_cache_path;
 
        enum btree_id           btree_id:4;
        unsigned                min_depth:4;
@@ -636,6 +637,7 @@ static inline bool btree_type_has_snapshots(enum btree_id id)
 
 enum btree_update_flags {
        __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
+       __BTREE_UPDATE_KEY_CACHE_RECLAIM,
 
        __BTREE_TRIGGER_NORUN,          /* Don't run triggers at all */
 
@@ -648,6 +650,7 @@ enum btree_update_flags {
 };
 
 #define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
+#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
 
 #define BTREE_TRIGGER_NORUN            (1U << __BTREE_TRIGGER_NORUN)
 
index 5e5a1b5e750eb1d75552c27c31743a5326072896..d9a406a28f4728b920b74a353f606f56c57e0dc7 100644 (file)
@@ -76,8 +76,6 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *,
 int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *,
                             struct bkey_i *, enum btree_update_flags);
 
-int __must_check bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
-                                  struct bkey_i *, enum btree_update_flags);
 int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
                                   struct bkey_i *, enum btree_update_flags);
 
index 7b8ca1153efebd25f6d55c9c5df6f82802f871b9..a0f7a9f06b98e18176ebdbd28468fb18c3683bed 100644 (file)
@@ -243,6 +243,8 @@ retry:
        bch2_alloc_sectors_done(c, wp);
 mem_alloc:
        b = bch2_btree_node_mem_alloc(c);
+       six_unlock_write(&b->c.lock);
+       six_unlock_intent(&b->c.lock);
 
        /* we hold cannibalize_lock: */
        BUG_ON(IS_ERR(b));
@@ -265,6 +267,9 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
 
        b = as->prealloc_nodes[--as->nr_prealloc_nodes];
 
+       six_lock_intent(&b->c.lock, NULL, NULL);
+       six_lock_write(&b->c.lock, NULL, NULL);
+
        set_btree_node_accessed(b);
        set_btree_node_dirty(c, b);
        set_btree_node_need_write(b);
@@ -378,7 +383,8 @@ static void bch2_btree_reserve_put(struct btree_update *as)
        while (as->nr_prealloc_nodes) {
                struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes];
 
-               six_unlock_write(&b->c.lock);
+               six_lock_intent(&b->c.lock, NULL, NULL);
+               six_lock_write(&b->c.lock, NULL, NULL);
 
                if (c->btree_reserve_cache_nr <
                    ARRAY_SIZE(c->btree_reserve_cache)) {
@@ -392,10 +398,8 @@ static void bch2_btree_reserve_put(struct btree_update *as)
                        bch2_open_buckets_put(c, &b->ob);
                }
 
-               btree_node_lock_type(c, b, SIX_LOCK_write);
                __btree_node_free(c, b);
                six_unlock_write(&b->c.lock);
-
                six_unlock_intent(&b->c.lock);
        }
 
@@ -403,39 +407,52 @@ static void bch2_btree_reserve_put(struct btree_update *as)
 }
 
 static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
-                                 unsigned flags, struct closure *cl)
+                                 unsigned flags)
 {
        struct bch_fs *c = as->c;
+       struct closure cl;
        struct btree *b;
        int ret;
 
+       closure_init_stack(&cl);
+retry:
+
        BUG_ON(nr_nodes > BTREE_RESERVE_MAX);
 
        /*
         * Protects reaping from the btree node cache and using the btree node
         * open bucket reserve:
+        *
+        * BTREE_INSERT_NOWAIT only applies to btree node allocation, not
+        * blocking on this lock:
         */
-       ret = bch2_btree_cache_cannibalize_lock(c, cl);
+       ret = bch2_btree_cache_cannibalize_lock(c, &cl);
        if (ret)
-               return ret;
+               goto err;
 
        while (as->nr_prealloc_nodes < nr_nodes) {
                b = __bch2_btree_node_alloc(c, &as->disk_res,
                                            flags & BTREE_INSERT_NOWAIT
-                                           ? NULL : cl, flags);
+                                           ? NULL : &cl, flags);
                if (IS_ERR(b)) {
                        ret = PTR_ERR(b);
-                       goto err_free;
+                       goto err;
                }
 
                as->prealloc_nodes[as->nr_prealloc_nodes++] = b;
        }
 
        bch2_btree_cache_cannibalize_unlock(c);
+       closure_sync(&cl);
        return 0;
-err_free:
+err:
        bch2_btree_cache_cannibalize_unlock(c);
-       trace_btree_reserve_get_fail(c, nr_nodes, cl);
+       closure_sync(&cl);
+
+       if (ret == -EAGAIN)
+               goto retry;
+
+       trace_btree_reserve_get_fail(c, nr_nodes, &cl);
        return ret;
 }
 
@@ -935,7 +952,6 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
 {
        struct bch_fs *c = trans->c;
        struct btree_update *as;
-       struct closure cl;
        u64 start_time = local_clock();
        int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
                ? BCH_DISK_RESERVATION_NOFAIL : 0;
@@ -946,9 +962,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
 
        if (flags & BTREE_INSERT_JOURNAL_RESERVED)
                journal_flags |= JOURNAL_RES_GET_RESERVED;
-
-       closure_init_stack(&cl);
-retry:
+       if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
+               journal_flags |= JOURNAL_RES_GET_NONBLOCK;
 
        /*
         * XXX: figure out how far we might need to split,
@@ -1003,30 +1018,16 @@ retry:
        if (ret)
                goto err;
 
+       bch2_trans_unlock(trans);
+
        ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
                                      BTREE_UPDATE_JOURNAL_RES,
-                                     journal_flags|JOURNAL_RES_GET_NONBLOCK);
-       if (ret == -EAGAIN) {
-               bch2_trans_unlock(trans);
-
-               if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
-                       bch2_btree_update_free(as);
-                       btree_trans_restart(trans);
-                       return ERR_PTR(ret);
-               }
-
-               ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
-                               BTREE_UPDATE_JOURNAL_RES,
-                               journal_flags);
-               if (ret) {
-                       trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_);
-                       goto err;
-               }
-
-               if (!bch2_trans_relock(trans)) {
-                       ret = -EINTR;
-                       goto err;
-               }
+                                     journal_flags);
+       if (ret) {
+               bch2_btree_update_free(as);
+               trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_);
+               btree_trans_restart(trans);
+               return ERR_PTR(ret);
        }
 
        ret = bch2_disk_reservation_get(c, &as->disk_res,
@@ -1036,10 +1037,15 @@ retry:
        if (ret)
                goto err;
 
-       ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl);
+       ret = bch2_btree_reserve_get(as, nr_nodes, flags);
        if (ret)
                goto err;
 
+       if (!bch2_trans_relock(trans)) {
+               ret = -EINTR;
+               goto err;
+       }
+
        bch2_journal_pin_add(&c->journal,
                             atomic64_read(&c->journal.seq),
                             &as->journal, NULL);
@@ -1047,16 +1053,6 @@ retry:
        return as;
 err:
        bch2_btree_update_free(as);
-
-       if (ret == -EAGAIN) {
-               bch2_trans_unlock(trans);
-               closure_sync(&cl);
-               ret = -EINTR;
-       }
-
-       if (ret == -EINTR && bch2_trans_relock(trans))
-               goto retry;
-
        return ERR_PTR(ret);
 }
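
With the change above, the on-stack closure and the -EAGAIN retry move out of bch2_btree_update_start() and into bch2_btree_reserve_get(), which now blocks and retries internally (unless BTREE_INSERT_NOWAIT). A rough standalone sketch of that shape, assuming invented try_alloc_node()/wait_for_allocator() stand-ins for the real node allocator and for closure_sync():

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct reserve { int nr; };

static int budget = 2;			/* pretend only 2 nodes are immediately available */

static int try_alloc_node(struct reserve *r, bool nonblocking)
{
	if (budget > 0) {
		budget--;
		r->nr++;
		return 0;
	}
	return nonblocking ? -ENOSPC : -EAGAIN;
}

static void wait_for_allocator(void)
{
	budget++;			/* the real code blocks here until woken */
}

static int reserve_get(struct reserve *r, int nr_nodes, bool nowait)
{
retry:
	while (r->nr < nr_nodes) {
		int ret = try_alloc_node(r, nowait);

		if (ret == -EAGAIN && !nowait) {
			wait_for_allocator();	/* closure_sync() analogue */
			goto retry;
		}
		if (ret)
			return ret;
	}
	return 0;
}

int main(void)
{
	struct reserve r = { 0 };

	printf("ret %d, got %d nodes\n", reserve_get(&r, 4, false), r.nr);
	return 0;
}

The point of the refactor is visible in the sketch: the caller no longer has to own a closure or re-drive the retry, it just sees success or a hard error.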
 
index 7186457d198b0f005668e365c4eec0edfb0209b2..9d954537f449ea107c5e6069b8392fdbd125bb84 100644 (file)
 #include <linux/sort.h>
 #include <trace/events/bcachefs.h>
 
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
+                         struct bkey_i *, enum btree_update_flags);
+
 static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
                                         const struct btree_insert_entry *r)
 {
@@ -650,9 +654,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
                        char buf[200];
 
                        bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
-                       bch_err(c, "invalid bkey %s on insert from %s -> %ps: %s\n",
-                               buf, trans->fn, (void *) i->ip_allocated, invalid);
-                       bch2_fatal_error(c);
+                       bch2_fs_fatal_error(c, "invalid bkey %s on insert from %s -> %ps: %s\n",
+                                           buf, trans->fn, (void *) i->ip_allocated, invalid);
                        return -EINVAL;
                }
                btree_insert_entry_checks(trans, i);
@@ -1358,8 +1361,9 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans,
        return ret;
 }
 
-int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
-                                  struct bkey_i *k, enum btree_update_flags flags)
+static int __must_check
+bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
+                         struct bkey_i *k, enum btree_update_flags flags)
 {
        struct btree_insert_entry *i, n;
 
@@ -1397,17 +1401,6 @@ int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btr
            !btree_insert_entry_cmp(&n, i)) {
                BUG_ON(i->insert_trigger_run || i->overwrite_trigger_run);
 
-               /*
-                * This is a hack to ensure that inode creates update the btree,
-                * not the key cache, which helps with cache coherency issues in
-                * other areas:
-                */
-               if (n.cached && !i->cached) {
-                       i->k = n.k;
-                       i->flags = n.flags;
-                       return 0;
-               }
-
                bch2_path_put(trans, i->path, true);
                *i = n;
        } else
@@ -1421,12 +1414,17 @@ int __must_check bch2_trans_update_by_path(struct btree_trans *trans, struct btr
 int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
                                   struct bkey_i *k, enum btree_update_flags flags)
 {
+       struct btree_path *path = iter->update_path ?: iter->path;
+       struct bkey_cached *ck;
+       int ret;
+
        if (iter->flags & BTREE_ITER_IS_EXTENTS)
                return bch2_trans_update_extent(trans, iter, k, flags);
 
        if (bkey_deleted(&k->k) &&
+           !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
            (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) {
-               int ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
+               ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p);
                if (unlikely(ret < 0))
                        return ret;
 
@@ -1434,8 +1432,45 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
                        k->k.type = KEY_TYPE_whiteout;
        }
 
-       return bch2_trans_update_by_path(trans, iter->update_path ?: iter->path,
-                                        k, flags);
+       if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
+           !path->cached &&
+           !path->level &&
+           btree_id_cached(trans->c, path->btree_id)) {
+               if (!iter->key_cache_path ||
+                   !iter->key_cache_path->should_be_locked ||
+                   bpos_cmp(iter->key_cache_path->pos, k->k.p)) {
+                       if (!iter->key_cache_path)
+                               iter->key_cache_path =
+                                       bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
+                                                     BTREE_ITER_INTENT|
+                                                     BTREE_ITER_CACHED, _THIS_IP_);
+
+                       iter->key_cache_path =
+                               bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos,
+                                                       iter->flags & BTREE_ITER_INTENT,
+                                                       _THIS_IP_);
+
+                       ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
+                                                      BTREE_ITER_CACHED|
+                                                      BTREE_ITER_CACHED_NOFILL);
+                       if (unlikely(ret))
+                               return ret;
+
+                       ck = (void *) iter->key_cache_path->l[0].b;
+
+                       if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
+                               trace_trans_restart_key_cache_raced(trans->fn, _RET_IP_);
+                               btree_trans_restart(trans);
+                               return -EINTR;
+                       }
+
+                       iter->key_cache_path->should_be_locked = true;
+               }
+
+               path = iter->key_cache_path;
+       }
+
+       return bch2_trans_update_by_path(trans, path, k, flags);
 }
 
 void bch2_trans_commit_hook(struct btree_trans *trans,
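
The new code in bch2_trans_update() above routes ordinary leaf updates on key-cached btrees through iter->key_cache_path, while BTREE_UPDATE_KEY_CACHE_RECLAIM updates (flushes of a cached key back to the btree) must keep going to the btree directly. A simplified sketch of just that routing decision, with made-up struct and field names:

#include <stdbool.h>
#include <stdio.h>

struct path_info {
	bool		cached;			/* path already points into the key cache */
	unsigned	level;			/* 0 == leaf */
	bool		btree_uses_cache;	/* this btree id has a key cache */
};

static bool use_key_cache(const struct path_info *p, bool reclaim_flush)
{
	/* Reclaim flushes write the cached key back to the btree itself,
	 * so they must not be redirected into the cache again: */
	return !reclaim_flush &&
	       !p->cached &&
	       p->level == 0 &&
	       p->btree_uses_cache;
}

int main(void)
{
	struct path_info leaf = { .cached = false, .level = 0, .btree_uses_cache = true };

	printf("normal update -> cache? %d\n", use_key_cache(&leaf, false));	/* 1 */
	printf("reclaim flush -> cache? %d\n", use_key_cache(&leaf, true));	/* 0 */
	return 0;
}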
index bf5ad436057afae2f747ffa19e9471d7a50e0925..b9f09b8250ca9a3a62d692247ba7aeb4b457f41b 100644 (file)
@@ -520,6 +520,7 @@ static int bch2_mark_alloc(struct btree_trans *trans,
            !old_u.data_type != !new_u.data_type &&
            new.k->type == KEY_TYPE_alloc_v3) {
                struct bch_alloc_v3 *v = (struct bch_alloc_v3 *) new.v;
+               u64 old_journal_seq = le64_to_cpu(v->journal_seq);
 
                BUG_ON(!journal_seq);
 
@@ -529,7 +530,8 @@ static int bch2_mark_alloc(struct btree_trans *trans,
                 * to wait on a journal flush before we can reuse the bucket:
                 */
                new_u.journal_seq = !new_u.data_type &&
-                       bch2_journal_noflush_seq(&c->journal, journal_seq)
+                       (journal_seq == old_journal_seq ||
+                        bch2_journal_noflush_seq(&c->journal, old_journal_seq))
                        ? 0 : journal_seq;
                v->journal_seq = cpu_to_le64(new_u.journal_seq);
        }
@@ -2094,7 +2096,7 @@ static void buckets_free_rcu(struct rcu_head *rcu)
                container_of(rcu, struct bucket_array, rcu);
 
        kvpfree(buckets,
-               sizeof(struct bucket_array) +
+               sizeof(*buckets) +
                buckets->nbuckets * sizeof(struct bucket));
 }
 
@@ -2103,7 +2105,7 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu)
        struct bucket_gens *buckets =
                container_of(rcu, struct bucket_gens, rcu);
 
-       kvpfree(buckets, sizeof(struct bucket_array) + buckets->nbuckets);
+       kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets);
 }
 
 int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
@@ -2213,9 +2215,9 @@ err:
        kvpfree(buckets_nouse,
                BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
        if (bucket_gens)
-               call_rcu(&old_buckets->rcu, bucket_gens_free_rcu);
+               call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
        if (buckets)
-               call_rcu(&old_buckets->rcu, buckets_free_rcu);
+               call_rcu(&buckets->rcu, buckets_free_rcu);
 
        return ret;
 }
@@ -2230,6 +2232,8 @@ void bch2_dev_buckets_free(struct bch_dev *ca)
                free_fifo(&ca->free[i]);
        kvpfree(ca->buckets_nouse,
                BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
+       kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
+               sizeof(struct bucket_gens) + ca->mi.nbuckets);
        kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
                sizeof(struct bucket_array) +
                ca->mi.nbuckets * sizeof(struct bucket));
index d35c96bcf3a167c6e47131b9cef72fb7241ae813..7c6c59c7762c55cb4626af78aff84d55a9c5ded1 100644 (file)
@@ -97,12 +97,6 @@ static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
        return sector_to_bucket(ca, ptr->offset);
 }
 
-static inline struct bucket *PTR_BUCKET(struct bch_dev *ca,
-                                       const struct bch_extent_ptr *ptr)
-{
-       return bucket(ca, PTR_BUCKET_NR(ca, ptr));
-}
-
 static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca,
                                           const struct bch_extent_ptr *ptr)
 {
index 24139831226d49f7912854a5fcf90f1550423231..2c73dc60b838f08f42da26ded8c6b8a0358a4b20 100644 (file)
@@ -30,7 +30,6 @@ struct bucket {
 
        u64                             io_time[2];
        u8                              oldest_gen;
-       u8                              gc_gen;
        unsigned                        gen_valid:1;
        u8                              stripe_redundancy;
        u32                             stripe;
index 33ae63703230b9e0706e66962876701dba3e5995..56b37b24fcca35718e0485601c0641976aa23603 100644 (file)
@@ -107,6 +107,10 @@ retry:
                                victim = old;
                }
 
+               /* hashed to same slot 3 times: */
+               if (!victim)
+                       break;
+
                /* Failed to find an empty slot: */
                swap(new, *victim);
                last_evicted = victim;
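
The buckets_waiting_for_journal.c hunk above makes the eviction loop bail out when a key hashes to the same just-evicted slot every time, instead of evicting forever. A toy, self-contained version of that bounded-eviction insert (not the real data structure; table size, hash mixes, and the attempt limit are all invented):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define TABLE_SIZE	8
#define NR_HASHES	3

struct entry { uint64_t key, seq; };

static struct entry table[TABLE_SIZE];

static size_t slot(uint64_t key, unsigned which)
{
	/* Three cheap, different mixes of the same key: */
	return ((key * 0x9e3779b97f4a7c15ULL) >> (7 + 11 * which)) % TABLE_SIZE;
}

static int insert(struct entry new)
{
	struct entry *last_evicted = NULL;
	unsigned attempts;

	for (attempts = 0; attempts < 16; attempts++) {
		struct entry *victim = NULL;
		unsigned i;

		for (i = 0; i < NR_HASHES; i++) {
			struct entry *e = &table[slot(new.key, i)];

			if (!e->key) {		/* empty slot: done */
				*e = new;
				return 0;
			}
			if (e != last_evicted &&
			    (!victim || e->seq < victim->seq))
				victim = e;
		}

		/* every hash landed on the slot we just evicted from: bail */
		if (!victim)
			return -1;

		/* evict the entry with the oldest seq, re-insert it next pass */
		struct entry tmp = *victim;
		*victim = new;
		new = tmp;
		last_evicted = victim;
	}
	return -1;
}

int main(void)
{
	uint64_t k;

	for (k = 1; k <= 6; k++)
		if (insert((struct entry) { .key = k, .seq = k }))
			printf("insert %llu failed\n", (unsigned long long) k);
	return 0;
}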
index 2cea694575e99a4df6c4af282c76cbc77230f06e..8279a9ba76a5c5e91524512d31e966e566cb240e 100644 (file)
@@ -15,7 +15,7 @@ bool bch2_inconsistent_error(struct bch_fs *c)
                return false;
        case BCH_ON_ERROR_ro:
                if (bch2_fs_emergency_read_only(c))
-                       bch_err(c, "emergency read only");
+                       bch_err(c, "inconsistency detected - emergency read only");
                return true;
        case BCH_ON_ERROR_panic:
                panic(bch2_fmt(c, "panic after error"));
@@ -35,7 +35,7 @@ void bch2_topology_error(struct bch_fs *c)
 void bch2_fatal_error(struct bch_fs *c)
 {
        if (bch2_fs_emergency_read_only(c))
-               bch_err(c, "emergency read only");
+               bch_err(c, "fatal error - emergency read only");
 }
 
 void bch2_io_error_work(struct work_struct *work)
index 472c03d2adb578b1e87134e69ff11cf1488996b4..91fa1897db98358c8d2f9f246002463d13df0822 100644 (file)
@@ -104,7 +104,7 @@ void bch2_inode_update_after_write(struct btree_trans *trans,
 
        bch2_assert_pos_locked(trans, BTREE_ID_inodes,
                               POS(0, bi->bi_inum),
-                              0 && c->opts.inodes_use_key_cache);
+                              c->opts.inodes_use_key_cache);
 
        set_nlink(&inode->v, bch2_inode_nlink_get(bi));
        i_uid_write(&inode->v, bi->bi_uid);
@@ -1471,7 +1471,7 @@ static void bch2_evict_inode(struct inode *vinode)
                                KEY_TYPE_QUOTA_WARN);
                bch2_quota_acct(c, inode->ei_qid, Q_INO, -1,
                                KEY_TYPE_QUOTA_WARN);
-               bch2_inode_rm(c, inode_inum(inode), true);
+               bch2_inode_rm(c, inode_inum(inode));
        }
 }
 
index 3a7c1468410210b72395263527d9b327732ddfa7..78e2db6c938b8791aa1c3b52144a156c8973f616 100644 (file)
@@ -252,15 +252,13 @@ int bch2_inode_peek(struct btree_trans *trans,
        u32 snapshot;
        int ret;
 
-       if (0 && trans->c->opts.inodes_use_key_cache)
-               flags |= BTREE_ITER_CACHED;
-
        ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
        if (ret)
                return ret;
 
        bch2_trans_iter_init(trans, iter, BTREE_ID_inodes,
-                            SPOS(0, inum.inum, snapshot), flags);
+                            SPOS(0, inum.inum, snapshot),
+                            flags|BTREE_ITER_CACHED);
        k = bch2_btree_iter_peek_slot(iter);
        ret = bkey_err(k);
        if (ret)
@@ -631,20 +629,16 @@ err:
        return ret;
 }
 
-int bch2_inode_rm(struct bch_fs *c, subvol_inum inum, bool cached)
+int bch2_inode_rm(struct bch_fs *c, subvol_inum inum)
 {
        struct btree_trans trans;
        struct btree_iter iter = { NULL };
        struct bkey_i_inode_generation delete;
        struct bch_inode_unpacked inode_u;
        struct bkey_s_c k;
-       unsigned iter_flags = BTREE_ITER_INTENT;
        u32 snapshot;
        int ret;
 
-       if (0 && cached && c->opts.inodes_use_key_cache)
-               iter_flags |= BTREE_ITER_CACHED;
-
        bch2_trans_init(&trans, c, 0, 1024);
 
        /*
@@ -668,7 +662,8 @@ retry:
                goto err;
 
        bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes,
-                            SPOS(0, inum.inum, snapshot), iter_flags);
+                            SPOS(0, inum.inum, snapshot),
+                            BTREE_ITER_INTENT|BTREE_ITER_CACHED);
        k = bch2_btree_iter_peek_slot(&iter);
 
        ret = bkey_err(k);
index 723186d8afb6d661535e7e59dfb80170e9eb1105..77957cc7f9dda3eac49a9bd435969c72184c6545 100644 (file)
@@ -87,7 +87,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
 int bch2_inode_create(struct btree_trans *, struct btree_iter *,
                      struct bch_inode_unpacked *, u32, u64);
 
-int bch2_inode_rm(struct bch_fs *, subvol_inum, bool);
+int bch2_inode_rm(struct bch_fs *, subvol_inum);
 
 int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum,
                                  struct bch_inode_unpacked *);
index e566f8516052ee43d50c041b4615020690d12bbf..651828b8bc97e7ce17d0ab3557299e1a549ff297 100644 (file)
@@ -1677,6 +1677,6 @@ no_io:
        continue_at(cl, journal_write_done, c->io_complete_wq);
        return;
 err:
-       bch2_inconsistent_error(c);
+       bch2_fatal_error(c);
        continue_at(cl, journal_write_done, c->io_complete_wq);
 }
index 92f78907bcb6fa6e412ba5d3d88a3dff7ba1b855..c82ecff3efe2b198eb541616e2f13fd4e4f4564e 100644 (file)
@@ -6,6 +6,7 @@
  */
 
 #include "bcachefs.h"
+#include "alloc_background.h"
 #include "alloc_foreground.h"
 #include "btree_iter.h"
 #include "btree_update.h"
@@ -137,18 +138,106 @@ static inline int fragmentation_cmp(copygc_heap *heap,
        return cmp_int(l.fragmentation, r.fragmentation);
 }
 
+static int walk_buckets_to_copygc(struct bch_fs *c)
+{
+       copygc_heap *h = &c->copygc_heap;
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_alloc_unpacked u;
+       int ret;
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+                          BTREE_ITER_PREFETCH, k, ret) {
+               struct bch_dev *ca = bch_dev_bkey_exists(c, iter.pos.inode);
+               struct copygc_heap_entry e;
+
+               u = bch2_alloc_unpack(k);
+
+               if (u.data_type != BCH_DATA_user ||
+                   u.dirty_sectors >= ca->mi.bucket_size ||
+                   bch2_bucket_is_open(c, iter.pos.inode, iter.pos.offset))
+                       continue;
+
+               e = (struct copygc_heap_entry) {
+                       .dev            = iter.pos.inode,
+                       .gen            = u.gen,
+                       .replicas       = 1 + u.stripe_redundancy,
+                       .fragmentation  = u.dirty_sectors * (1U << 15)
+                               / ca->mi.bucket_size,
+                       .sectors        = u.dirty_sectors,
+                       .offset         = bucket_to_sector(ca, iter.pos.offset),
+               };
+               heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
+
+       }
+       bch2_trans_iter_exit(&trans, &iter);
+
+       bch2_trans_exit(&trans);
+       return ret;
+}
+
+static int bucket_inorder_cmp(const void *_l, const void *_r)
+{
+       const struct copygc_heap_entry *l = _l;
+       const struct copygc_heap_entry *r = _r;
+
+       return cmp_int(l->dev, r->dev) ?: cmp_int(l->offset, r->offset);
+}
+
+static int check_copygc_was_done(struct bch_fs *c,
+                                u64 *sectors_not_moved,
+                                u64 *buckets_not_moved)
+{
+       copygc_heap *h = &c->copygc_heap;
+       struct btree_trans trans;
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_alloc_unpacked u;
+       struct copygc_heap_entry *i;
+       int ret = 0;
+
+       sort(h->data, h->used, sizeof(h->data[0]), bucket_inorder_cmp, NULL);
+
+       bch2_trans_init(&trans, c, 0, 0);
+       bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, 0);
+
+       for (i = h->data; i < h->data + h->used; i++) {
+               struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev);
+
+               bch2_btree_iter_set_pos(&iter, POS(i->dev, sector_to_bucket(ca, i->offset)));
+
+               ret = lockrestart_do(&trans,
+                               bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+               if (ret)
+                       break;
+
+               u = bch2_alloc_unpack(k);
+
+               if (u.gen == i->gen && u.dirty_sectors) {
+                       *sectors_not_moved += u.dirty_sectors;
+                       *buckets_not_moved += 1;
+               }
+       }
+       bch2_trans_iter_exit(&trans, &iter);
+
+       bch2_trans_exit(&trans);
+       return ret;
+}
+
 static int bch2_copygc(struct bch_fs *c)
 {
        copygc_heap *h = &c->copygc_heap;
        struct copygc_heap_entry e, *i;
-       struct bucket_array *buckets;
        struct bch_move_stats move_stats;
        u64 sectors_to_move = 0, sectors_to_write = 0, sectors_not_moved = 0;
        u64 sectors_reserved = 0;
        u64 buckets_to_move, buckets_not_moved = 0;
        struct bch_dev *ca;
        unsigned dev_idx;
-       size_t b, heap_size = 0;
+       size_t heap_size = 0;
        int ret;
 
        bch_move_stats_init(&move_stats, "copygc");
@@ -178,34 +267,12 @@ static int bch2_copygc(struct bch_fs *c)
                spin_lock(&ca->fs->freelist_lock);
                sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size;
                spin_unlock(&ca->fs->freelist_lock);
+       }
 
-               down_read(&ca->bucket_lock);
-               buckets = bucket_array(ca);
-
-               for (b = buckets->first_bucket; b < buckets->nbuckets; b++) {
-                       struct bucket *g = buckets->b + b;
-                       struct bucket_mark m = READ_ONCE(g->mark);
-                       struct copygc_heap_entry e;
-
-                       if (m.owned_by_allocator ||
-                           m.data_type != BCH_DATA_user ||
-                           m.dirty_sectors >= ca->mi.bucket_size)
-                               continue;
-
-                       WARN_ON(m.stripe && !g->stripe_redundancy);
-
-                       e = (struct copygc_heap_entry) {
-                               .dev            = dev_idx,
-                               .gen            = m.gen,
-                               .replicas       = 1 + g->stripe_redundancy,
-                               .fragmentation  = m.dirty_sectors * (1U << 15)
-                                       / ca->mi.bucket_size,
-                               .sectors        = m.dirty_sectors,
-                               .offset         = bucket_to_sector(ca, b),
-                       };
-                       heap_add_or_replace(h, e, -fragmentation_cmp, NULL);
-               }
-               up_read(&ca->bucket_lock);
+       ret = walk_buckets_to_copygc(c);
+       if (ret) {
+               bch2_fs_fatal_error(c, "error walking buckets to copygc!");
+               return ret;
        }
 
        if (!h->used) {
@@ -251,30 +318,18 @@ static int bch2_copygc(struct bch_fs *c)
                             writepoint_ptr(&c->copygc_write_point),
                             copygc_pred, NULL,
                             &move_stats);
+       if (ret) {
+               bch_err(c, "error %i from bch2_move_data() in copygc", ret);
+               return ret;
+       }
 
-       for_each_rw_member(ca, c, dev_idx) {
-               down_read(&ca->bucket_lock);
-               buckets = bucket_array(ca);
-               for (i = h->data; i < h->data + h->used; i++) {
-                       struct bucket_mark m;
-                       size_t b;
-
-                       if (i->dev != dev_idx)
-                               continue;
-
-                       b = sector_to_bucket(ca, i->offset);
-                       m = READ_ONCE(buckets->b[b].mark);
-
-                       if (i->gen == m.gen &&
-                           m.dirty_sectors) {
-                               sectors_not_moved += m.dirty_sectors;
-                               buckets_not_moved++;
-                       }
-               }
-               up_read(&ca->bucket_lock);
+       ret = check_copygc_was_done(c, &sectors_not_moved, &buckets_not_moved);
+       if (ret) {
+               bch_err(c, "error %i from check_copygc_was_done()", ret);
+               return ret;
        }
 
-       if (sectors_not_moved && !ret)
+       if (sectors_not_moved)
                bch_warn_ratelimited(c,
                        "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)",
                         sectors_not_moved, sectors_to_move,
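
The new walk_buckets_to_copygc() above builds the copygc heap from alloc btree keys rather than the in-memory bucket array, scoring each bucket by dirty_sectors * 2^15 / bucket_size. A tiny illustrative sketch of that fixed-point fragmentation score (the function and field names here are assumptions, only the formula comes from the diff):

#include <stdint.h>
#include <stdio.h>

static uint32_t fragmentation_score(uint64_t dirty_sectors, uint64_t bucket_size)
{
	/* Fraction of the bucket still live, in 1/2^15 units: */
	return dirty_sectors * (1U << 15) / bucket_size;
}

int main(void)
{
	/* A 512-sector bucket with 128 live sectors is 25% full: */
	printf("%u / 32768\n", fragmentation_score(128, 512));	/* prints 8192 */
	return 0;
}

Scaling by 2^15 keeps the comparison integer-only while still ordering buckets by how empty (and therefore how cheap to evacuate) they are.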
index b818093eab39333265a9a4004b49bc6da0bee198..7e4400cc02a9249381e5c03b6b7ff7776c4ee910 100644 (file)
@@ -1095,7 +1095,11 @@ use_clean:
 
        bch_verbose(c, "starting alloc read");
        err = "error reading allocation information";
-       ret = bch2_alloc_read(c);
+
+       down_read(&c->gc_lock);
+       ret = bch2_alloc_read(c, false, false);
+       up_read(&c->gc_lock);
+
        if (ret)
                goto err;
        bch_verbose(c, "alloc read done");
@@ -1153,23 +1157,6 @@ use_clean:
        if (c->opts.verbose || !c->sb.clean)
                bch_info(c, "journal replay done");
 
-       if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) &&
-           !c->opts.nochanges) {
-               /*
-                * note that even when filesystem was clean there might be work
-                * to do here, if we ran gc (because of fsck) which recalculated
-                * oldest_gen:
-                */
-               bch_verbose(c, "writing allocation info");
-               err = "error writing out alloc info";
-               ret = bch2_alloc_write_all(c, BTREE_INSERT_LAZY_RW);
-               if (ret) {
-                       bch_err(c, "error writing alloc info");
-                       goto err;
-               }
-               bch_verbose(c, "alloc write done");
-       }
-
        if (c->sb.version < bcachefs_metadata_version_snapshot_2) {
                bch2_fs_lazy_rw(c);
 
index a08f1e084a9d9d5b24a29d752cb1abba5b57b226..96994b7a75a555699fc0aa2c2745f8353cdeeaba 100644 (file)
@@ -414,18 +414,10 @@ err:
        goto out;
 }
 
-static int __bch2_mark_replicas(struct bch_fs *c,
-                               struct bch_replicas_entry *r,
-                               bool check)
-{
-       return likely(bch2_replicas_marked(c, r))       ? 0
-               : check                                 ? -1
-               : bch2_mark_replicas_slowpath(c, r);
-}
-
 int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
 {
-       return __bch2_mark_replicas(c, r, false);
+       return likely(bch2_replicas_marked(c, r))
+               ? 0 : bch2_mark_replicas_slowpath(c, r);
 }
 
 /* replicas delta list: */
index 586ba60d03ea7a2e0cb5b4ae9abeeb0c4bff0638..d8b72d8dd7a860a1f49383646a2e33f4739052a4 100644 (file)
@@ -762,6 +762,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 
        bch2_opts_apply(&c->opts, opts);
 
+       /* key cache currently disabled for inodes, because of snapshots: */
+       c->opts.inodes_use_key_cache = 0;
+
+       c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc;
+       if (c->opts.inodes_use_key_cache)
+               c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes;
+
        c->block_bits           = ilog2(block_sectors(c));
        c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c);