Update bcachefs sources to 783085c3cc44 kbuild: Allow gcov to be enabled on the command line
author Kent Overstreet <kent.overstreet@linux.dev>
Tue, 21 Nov 2023 00:33:52 +0000 (19:33 -0500)
committer Kent Overstreet <kent.overstreet@linux.dev>
Wed, 22 Nov 2023 21:55:06 +0000 (16:55 -0500)
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
71 files changed:
.bcachefs_revision
Makefile
cmd_data.c
cmd_device.c
include/linux/atomic.h
include/linux/closure.h
include/linux/shrinker.h
libbcachefs/alloc_background.c
libbcachefs/alloc_foreground.c
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bcachefs_ioctl.h
libbcachefs/btree_cache.c
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_journal_iter.c
libbcachefs/btree_journal_iter.h
libbcachefs/btree_key_cache.c
libbcachefs/btree_key_cache_types.h
libbcachefs/btree_trans_commit.c
libbcachefs/btree_types.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_write_buffer.c
libbcachefs/btree_write_buffer.h
libbcachefs/btree_write_buffer_types.h
libbcachefs/buckets.c
libbcachefs/buckets.h
libbcachefs/chardev.c
libbcachefs/clock.c
libbcachefs/darray.c
libbcachefs/darray.h
libbcachefs/ec.c
libbcachefs/ec_types.h
libbcachefs/errcode.h
libbcachefs/fs-io-direct.c
libbcachefs/fs-io.c
libbcachefs/fs-ioctl.c
libbcachefs/fs.c
libbcachefs/fs.h
libbcachefs/fsck.c
libbcachefs/inode.c
libbcachefs/io_write.c
libbcachefs/io_write.h
libbcachefs/journal.c
libbcachefs/journal.h
libbcachefs/journal_io.c
libbcachefs/journal_io.h
libbcachefs/journal_reclaim.c
libbcachefs/journal_reclaim.h
libbcachefs/journal_types.h
libbcachefs/move.c
libbcachefs/move.h
libbcachefs/movinggc.c
libbcachefs/opts.h
libbcachefs/recovery.c
libbcachefs/replicas.c
libbcachefs/replicas.h
libbcachefs/replicas_types.h
libbcachefs/sb-clean.c
libbcachefs/sb-errors.h
libbcachefs/six.c
libbcachefs/super-io.c
libbcachefs/super-io.h
libbcachefs/super.c
libbcachefs/super_types.h
libbcachefs/sysfs.c
libbcachefs/trace.h
linux/closure.c
linux/shrinker.c

index baa146ea85a20e762eb41f0b0f999ef55e75aac3..4649f2ba299a4818220296f95d8e446a0775ded3 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-938f680845d1be28979e23aee972dba010c464ba
+783085c3cc440183ba5e987b1aa7791cc1ca42ba
index 61a624558e5876c9ab624699403d67474600037a..3bd84874e30067abead73b6e31251e65ea416e3b 100644
--- a/Makefile
+++ b/Makefile
@@ -18,7 +18,7 @@ else
   Q = @
 endif
 
-CFLAGS+=-std=gnu11 -O2 -g -MMD -Wall -fPIC                     \
+CFLAGS+=-std=gnu11 -O2 -g -MMD -Wall -fPIC                     \
        -Wno-pointer-sign                                       \
        -Wno-deprecated-declarations                            \
        -fno-strict-aliasing                                    \
index 6d709883720d7a5dc137105d99cc234056e860a0..1ef689bc6f67b270ca2906ad0938a80d6c60b284 100644
--- a/cmd_data.c
+++ b/cmd_data.c
@@ -5,6 +5,7 @@
 
 #include "libbcachefs/bcachefs_ioctl.h"
 #include "libbcachefs/btree_cache.h"
+#include "libbcachefs/move.h"
 
 #include "cmds.h"
 #include "libbcachefs.h"
@@ -55,7 +56,7 @@ int cmd_data_rereplicate(int argc, char *argv[])
                die("too many arguments");
 
        return bchu_data(bcache_fs_open(fs_path), (struct bch_ioctl_data) {
-               .op             = BCH_DATA_OP_REREPLICATE,
+               .op             = BCH_DATA_OP_rereplicate,
                .start_btree    = 0,
                .start_pos      = POS_MIN,
                .end_btree      = BTREE_ID_NR,
@@ -70,7 +71,7 @@ static void data_job_usage(void)
             "\n"
             "Kick off a data job and report progress\n"
             "\n"
-            "job: one of scrub, rereplicate, migrate, or rewrite_old_nodes\n"
+            "job: one of scrub, rereplicate, migrate, rewrite_old_nodes, or drop_extra_replicas\n"
             "\n"
             "Options:\n"
             "  -b btree                    btree to operate on\n"
@@ -81,14 +82,6 @@ static void data_job_usage(void)
        exit(EXIT_SUCCESS);
 }
 
-const char * const data_jobs[] = {
-       "scrub",
-       "rereplicate",
-       "migrate",
-       "rewrite_old_nodes",
-       NULL
-};
-
 int cmd_data_job(int argc, char *argv[])
 {
        struct bch_ioctl_data op = {
@@ -121,10 +114,7 @@ int cmd_data_job(int argc, char *argv[])
        if (!job)
                die("please specify which type of job");
 
-       op.op = read_string_list_or_die(job, data_jobs, "bad job type");
-
-       if (op.op == BCH_DATA_OP_SCRUB)
-               die("scrub not implemented yet");
+       op.op = read_string_list_or_die(job, bch2_data_ops_strs, "bad job type");
 
        char *fs_path = arg_pop();
        if (!fs_path)
index 1cb31ab858422f7f5646aeb2ee7d3d7dd38f6c35..bd4968353fa8a44d52f74f362c904877639384cc 100644
--- a/cmd_device.c
+++ b/cmd_device.c
@@ -332,7 +332,7 @@ int cmd_device_evacuate(int argc, char *argv[])
        }
 
        return bchu_data(fs, (struct bch_ioctl_data) {
-               .op             = BCH_DATA_OP_MIGRATE,
+               .op             = BCH_DATA_OP_migrate,
                .start_btree    = 0,
                .start_pos      = POS_MIN,
                .end_btree      = BTREE_ID_NR,
index 2c983cd4efb871822a0ef5652235759d3bcb4b86..7effc1612683eff6cd2b4c70c6ec4d7c67a3c675 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -161,6 +161,13 @@ static inline i_type a_type##_read(const a_type##_t *v)                    \
        return __ATOMIC_READ(&v->counter);                              \
 }                                                                      \
                                                                        \
+static inline i_type a_type##_read_acquire(const a_type##_t *v)                \
+{                                                                      \
+       i_type ret = __ATOMIC_READ(&v->counter);                        \
+       smp_mb__after_atomic();                                         \
+       return ret;                                                     \
+}                                                                      \
+                                                                       \
 static inline void a_type##_set(a_type##_t *v, i_type i)               \
 {                                                                      \
        return __ATOMIC_SET(&v->counter, i);                            \
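The new a_type##_read_acquire variant pairs a plain read with smp_mb__after_atomic() to approximate acquire ordering in this userspace shim. A minimal usage sketch, assuming the header instantiates the template for atomic_t (hypothetical caller, not part of this commit):

	/* Drain refs; acquire ordering makes the last holder's prior writes visible: */
	static void wait_for_refs_to_drain(atomic_t *refs)
	{
		while (atomic_read_acquire(refs))
			cpu_relax();
	}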
index de7bb47d8a46ace38d95a81ed6df231d91ac725b..c554c6a08768ad60cdf529a65cf962095363a4a9 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
 
 struct closure;
 struct closure_syncer;
-typedef void (closure_fn) (struct closure *);
+typedef void (closure_fn) (struct work_struct *);
 extern struct dentry *bcache_debug;
 
 struct closure_waitlist {
@@ -254,7 +254,7 @@ static inline void closure_queue(struct closure *cl)
                INIT_WORK(&cl->work, cl->work.func);
                BUG_ON(!queue_work(wq, &cl->work));
        } else
-               cl->fn(cl);
+               cl->fn(&cl->work);
 }
 
 /**
@@ -309,6 +309,11 @@ static inline void closure_wake_up(struct closure_waitlist *list)
        __closure_wake_up(list);
 }
 
+#define CLOSURE_CALLBACK(name) void name(struct work_struct *ws)
+#define closure_type(name, type, member)                               \
+       struct closure *cl = container_of(ws, struct closure, work);    \
+       type *name = container_of(cl, type, member)
+
 /**
  * continue_at - jump to another function with barrier
  *
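Since closure_fn now takes a work_struct (so closures can be queued as ordinary work items), callbacks are declared with CLOSURE_CALLBACK() and recover their container with closure_type(), as the btree_io.c and btree_update_interior.c hunks below show. A minimal sketch, assuming a hypothetical struct my_op:

	struct my_op {
		struct bch_fs	*c;
		struct closure	cl;
	};

	static CLOSURE_CALLBACK(my_op_done)
	{
		closure_type(op, struct my_op, cl);

		/* two container_of()s later, op is the enclosing struct my_op */
		kfree(op);
	}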
index bca00d61c27f7c00c30eb19f2dd59fcc8548b662..2d1adabf7f2b4e04271c5fb74e3ed4dda90a1dcd 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -22,10 +22,18 @@ struct shrinker {
        int seeks;      /* seeks to recreate an obj */
        long batch;     /* reclaim batch size, 0 = default */
        struct list_head list;
+       void    *private_data;
 };
 
-int register_shrinker(struct shrinker *, const char *, ...);
-void unregister_shrinker(struct shrinker *);
+static inline void shrinker_free(struct shrinker *s)
+{
+       free(s);
+}
+
+struct shrinker *shrinker_alloc(unsigned int, const char *, ...);
+
+int shrinker_register(struct shrinker *);
+void shrinker_unregister(struct shrinker *);
 
 void run_shrinkers(gfp_t gfp_mask, bool);
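Shrinkers in the shim are now heap-allocated, with private_data pointing back at the owner instead of the old container_of() pattern. A minimal registration sketch under this API, using a hypothetical struct my_cache (the btree cache and key cache hunks below follow the same shape):

	struct my_cache {
		struct shrinker	*shrink;
		unsigned long	nr_objects;
	};

	static unsigned long my_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
	{
		struct my_cache *mc = shrink->private_data;

		return mc->nr_objects;
	}

	static int my_cache_init(struct my_cache *mc, const char *name)
	{
		struct shrinker *shrink = shrinker_alloc(0, "%s-my_cache", name);

		if (!shrink)
			return -ENOMEM;
		mc->shrink = shrink;
		shrink->count_objects	= my_cache_count;
		shrink->private_data	= mc;
		return shrinker_register(shrink);
	}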
 
index 113273b214645ff5ac43f508ed2d168ccd1c1743..1ed8506c33c7385e20d775a94da34748a29e92e3 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -847,6 +847,19 @@ int bch2_trans_mark_alloc(struct btree_trans *trans,
                        return ret;
        }
 
+       /*
+        * We need to know whether we're being called from the invalidate
+        * path or not:
+        */
+
+       if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
+           old_a->cached_sectors) {
+               ret = bch2_update_cached_sectors_list(trans, new->k.p.inode,
+                                                     -((s64) old_a->cached_sectors));
+               if (ret)
+                       return ret;
+       }
+
        return 0;
 }
 
@@ -1212,7 +1225,7 @@ fsck_err:
        return ret;
 }
 
-static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_trans *trans,
+static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans,
                                              struct btree_iter *iter)
 {
        struct bch_fs *c = trans->c;
@@ -1271,24 +1284,6 @@ delete:
        goto out;
 }
 
-static int bch2_check_discard_freespace_key(struct btree_trans *trans,
-                                           struct btree_iter *iter,
-                                           struct bpos end)
-{
-       if (!btree_id_is_extents(iter->btree_id)) {
-               return __bch2_check_discard_freespace_key(trans, iter);
-       } else {
-               int ret = 0;
-
-               while (!bkey_eq(iter->pos, end) &&
-                      !(ret = btree_trans_too_many_iters(trans) ?:
-                              __bch2_check_discard_freespace_key(trans, iter)))
-                       bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
-
-               return ret;
-       }
-}
-
 /*
  * We've already checked that generation numbers in the bucket_gens btree are
  * valid for buckets that exist; this just checks for keys for nonexistent
@@ -1445,12 +1440,40 @@ bkey_err:
        ret = for_each_btree_key2(trans, iter,
                        BTREE_ID_need_discard, POS_MIN,
                        BTREE_ITER_PREFETCH, k,
-               bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
-             for_each_btree_key2(trans, iter,
-                       BTREE_ID_freespace, POS_MIN,
-                       BTREE_ITER_PREFETCH, k,
-               bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
-             for_each_btree_key_commit(trans, iter,
+               bch2_check_discard_freespace_key(trans, &iter));
+       if (ret)
+               goto err;
+
+       bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
+                            BTREE_ITER_PREFETCH);
+       while (1) {
+               bch2_trans_begin(trans);
+               k = bch2_btree_iter_peek(&iter);
+               if (!k.k)
+                       break;
+
+               ret = bkey_err(k) ?:
+                       bch2_check_discard_freespace_key(trans, &iter);
+               if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+                       ret = 0;
+                       continue;
+               }
+               if (ret) {
+                       struct printbuf buf = PRINTBUF;
+                       bch2_bkey_val_to_text(&buf, c, k);
+
+                       bch_err(c, "while checking %s", buf.buf);
+                       printbuf_exit(&buf);
+                       break;
+               }
+
+               bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
+       }
+       bch2_trans_iter_exit(trans, &iter);
+       if (ret)
+               goto err;
+
+       ret = for_each_btree_key_commit(trans, iter,
                        BTREE_ID_bucket_gens, POS_MIN,
                        BTREE_ITER_PREFETCH, k,
                        NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
@@ -1802,7 +1825,7 @@ static void bch2_do_invalidates_work(struct work_struct *work)
        unsigned i;
        int ret = 0;
 
-       ret = bch2_btree_write_buffer_flush(trans);
+       ret = bch2_btree_write_buffer_tryflush(trans);
        if (ret)
                goto err;
 
index b85c7765272f6e4ae5e8aceb5a4bbaa89c535912..eef6fa8d0f9fdf715b6df4d39c158b5824138713 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -1297,6 +1297,30 @@ out:
        return wp;
 }
 
+static noinline void
+deallocate_extra_replicas(struct bch_fs *c,
+                         struct open_buckets *ptrs,
+                         struct open_buckets *ptrs_no_use,
+                         unsigned extra_replicas)
+{
+       struct open_buckets ptrs2 = { 0 };
+       struct open_bucket *ob;
+       unsigned i;
+
+       open_bucket_for_each(c, ptrs, ob, i) {
+               unsigned d = bch_dev_bkey_exists(c, ob->dev)->mi.durability;
+
+               if (d && d <= extra_replicas) {
+                       extra_replicas -= d;
+                       ob_push(c, ptrs_no_use, ob);
+               } else {
+                       ob_push(c, &ptrs2, ob);
+               }
+       }
+
+       *ptrs = ptrs2;
+}
+
 /*
  * Get us an open_bucket we can allocate from, return with it locked:
  */
@@ -1382,6 +1406,9 @@ alloc_done:
        if (ret)
                goto err;
 
+       if (nr_effective > nr_replicas)
+               deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas);
+
        /* Free buckets we didn't use: */
        open_bucket_for_each(c, &wp->ptrs, ob, i)
                open_bucket_free_unused(c, ob);
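Note that deallocate_extra_replicas() trims by durability rather than by bucket count: with nr_replicas == 2 and nr_effective == 4 from four durability-1 buckets, for example, the first two buckets are moved to wp->ptrs and freed as unused just below, while durability-0 (cache device) buckets never count against extra_replicas and are always kept.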
index 3117ab4426a74ae8262f79ccaffc325d28ba4254..53f93f03f0885f8b27d5959041a2b8b05cf9a555 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -406,6 +406,7 @@ BCH_DEBUG_PARAMS_DEBUG()
        x(blocked_journal_max_in_flight)        \
        x(blocked_allocate)                     \
        x(blocked_allocate_open_bucket)         \
+       x(blocked_write_buffer_full)            \
        x(nocow_lock_contended)
 
 enum bch_time_stats {
@@ -640,6 +641,8 @@ struct journal_keys {
        size_t                  gap;
        size_t                  nr;
        size_t                  size;
+       atomic_t                ref;
+       bool                    initial_ref_held;
 };
 
 struct btree_trans_buf {
@@ -664,7 +667,8 @@ struct btree_trans_buf {
        x(invalidate)                                                   \
        x(delete_dead_snapshots)                                        \
        x(snapshot_delete_pagecache)                                    \
-       x(sysfs)
+       x(sysfs)                                                        \
+       x(btree_write_buffer)
 
 enum bch_write_ref {
 #define x(n) BCH_WRITE_REF_##n,
@@ -1064,6 +1068,16 @@ static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref)
 #endif
 }
 
+static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifdef BCH_WRITE_REF_DEBUG
+       return !test_bit(BCH_FS_GOING_RO, &c->flags) &&
+               atomic_long_inc_not_zero(&c->writes[ref]);
+#else
+       return percpu_ref_tryget(&c->writes);
+#endif
+}
+
 static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
 {
 #ifdef BCH_WRITE_REF_DEBUG
index 0a750953ff921b9d62d9fd1918da27d375c2c6dc..ad0f298c87c39e1420ad31b00a57b371b87af37d 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -303,6 +303,13 @@ struct bkey_i {
        struct bch_val  v;
 };
 
+#define POS_KEY(_pos)                                                  \
+((struct bkey) {                                                       \
+       .u64s           = BKEY_U64s,                                    \
+       .format         = KEY_FORMAT_CURRENT,                           \
+       .p              = _pos,                                         \
+})
+
 #define KEY(_inode, _offset, _size)                                    \
 ((struct bkey) {                                                       \
        .u64s           = BKEY_U64s,                                    \
@@ -1436,7 +1443,7 @@ struct bch_sb_field_replicas_v0 {
        struct bch_replicas_entry_v0 entries[];
 } __packed __aligned(8);
 
-struct bch_replicas_entry {
+struct bch_replicas_entry_v1 {
        __u8                    data_type;
        __u8                    nr_devs;
        __u8                    nr_required;
@@ -1448,7 +1455,7 @@ struct bch_replicas_entry {
 
 struct bch_sb_field_replicas {
        struct bch_sb_field     field;
-       struct bch_replicas_entry entries[];
+       struct bch_replicas_entry_v1 entries[];
 } __packed __aligned(8);
 
 /* BCH_SB_FIELD_quota: */
@@ -2124,7 +2131,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
        x(clock,                7)              \
        x(dev_usage,            8)              \
        x(log,                  9)              \
-       x(overwrite,            10)
+       x(overwrite,            10)             \
+       x(write_buffer_keys,    11)
 
 enum {
 #define x(f, nr)       BCH_JSET_ENTRY_##f      = nr,
@@ -2174,7 +2182,7 @@ struct jset_entry_usage {
 struct jset_entry_data_usage {
        struct jset_entry       entry;
        __le64                  v;
-       struct bch_replicas_entry r;
+       struct bch_replicas_entry_v1 r;
 } __packed;
 
 struct jset_entry_clock {
index f05881f7e1135abe30771f20f19d98693844475d..18eb325401cf647662ed0b2ec1d1d623669214ae 100644
--- a/libbcachefs/bcachefs_ioctl.h
+++ b/libbcachefs/bcachefs_ioctl.h
@@ -173,12 +173,18 @@ struct bch_ioctl_disk_set_state {
        __u64                   dev;
 };
 
+#define BCH_DATA_OPS()                 \
+       x(scrub,                0)      \
+       x(rereplicate,          1)      \
+       x(migrate,              2)      \
+       x(rewrite_old_nodes,    3)      \
+       x(drop_extra_replicas,  4)
+
 enum bch_data_ops {
-       BCH_DATA_OP_SCRUB               = 0,
-       BCH_DATA_OP_REREPLICATE         = 1,
-       BCH_DATA_OP_MIGRATE             = 2,
-       BCH_DATA_OP_REWRITE_OLD_NODES   = 3,
-       BCH_DATA_OP_NR                  = 4,
+#define x(t, n) BCH_DATA_OP_##t = n,
+       BCH_DATA_OPS()
+#undef x
+       BCH_DATA_OP_NR
 };
 
 /*
@@ -237,7 +243,7 @@ struct bch_ioctl_data_event {
 
 struct bch_replicas_usage {
        __u64                   sectors;
-       struct bch_replicas_entry r;
+       struct bch_replicas_entry_v1 r;
 } __packed;
 
 static inline struct bch_replicas_usage *
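cmd_data.c above now resolves job names through bch2_data_ops_strs (via move.h), and with the enum generated from BCH_DATA_OPS(), the string table can be generated from the same list. A sketch of its assumed shape (the real definition lives in libbcachefs/move.c and is not part of this hunk):

	const char * const bch2_data_ops_strs[] = {
	#define x(t, n) [n] = #t,
		BCH_DATA_OPS()
	#undef x
		NULL
	};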
index 84636ad034fa84cdfaecf654cc9b17eeb79bd04a..72dea90e12fa9d11b2d1ceb74edebecc3b78bc5e 100644
--- a/libbcachefs/btree_cache.c
+++ b/libbcachefs/btree_cache.c
@@ -318,8 +318,7 @@ static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b)
 static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
                                           struct shrink_control *sc)
 {
-       struct bch_fs *c = container_of(shrink, struct bch_fs,
-                                       btree_cache.shrink);
+       struct bch_fs *c = shrink->private_data;
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b, *t;
        unsigned long nr = sc->nr_to_scan;
@@ -420,8 +419,7 @@ out_nounlock:
 static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
                                            struct shrink_control *sc)
 {
-       struct bch_fs *c = container_of(shrink, struct bch_fs,
-                                       btree_cache.shrink);
+       struct bch_fs *c = shrink->private_data;
        struct btree_cache *bc = &c->btree_cache;
 
        if (bch2_btree_shrinker_disabled)
@@ -432,8 +430,7 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink,
 
 static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
 {
-       struct bch_fs *c = container_of(shrink, struct bch_fs,
-                                       btree_cache.shrink);
+       struct bch_fs *c = shrink->private_data;
        char *cbuf;
        size_t buflen = seq_buf_get_buf(s, &cbuf);
        struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
@@ -448,7 +445,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
        struct btree *b;
        unsigned i, flags;
 
-       unregister_shrinker(&bc->shrink);
+       shrinker_free(bc->shrink);
 
        /* vfree() can allocate memory: */
        flags = memalloc_nofs_save();
@@ -502,6 +499,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
 int bch2_fs_btree_cache_init(struct bch_fs *c)
 {
        struct btree_cache *bc = &c->btree_cache;
+       struct shrinker *shrink;
        unsigned i;
        int ret = 0;
 
@@ -521,13 +519,16 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
 
        mutex_init(&c->verify_lock);
 
-       bc->shrink.count_objects        = bch2_btree_cache_count;
-       bc->shrink.scan_objects         = bch2_btree_cache_scan;
-       bc->shrink.to_text              = bch2_btree_cache_shrinker_to_text;
-       bc->shrink.seeks                = 4;
-       ret = register_shrinker(&bc->shrink, "%s-btree_cache", c->name);
-       if (ret)
+       shrink = shrinker_alloc(0, "%s-btree_cache", c->name);
+       if (!shrink)
                goto err;
+       bc->shrink = shrink;
+       shrink->count_objects   = bch2_btree_cache_count;
+       shrink->scan_objects    = bch2_btree_cache_scan;
+       shrink->to_text         = bch2_btree_cache_shrinker_to_text;
+       shrink->seeks           = 4;
+       shrink->private_data    = c;
+       shrinker_register(shrink);
 
        return 0;
 err:
index c4922bd30fafa52990cffca0ab19761fa28fc97d..7e5d52f8ffd718d89b27cc4e8578d9e3ee5a7bf2 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -1287,7 +1287,7 @@ static int bch2_gc_done(struct bch_fs *c,
                }
 
                for (i = 0; i < c->replicas.nr; i++) {
-                       struct bch_replicas_entry *e =
+                       struct bch_replicas_entry_v1 *e =
                                cpu_replicas_entry(&c->replicas, i);
 
                        if (metadata_only &&
index 1f73ee0ee359bbf3c7434e2254ea2361432d968d..3c663c596b464ae3c8b3db68d2fc44a44bf19b9a 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -1358,10 +1358,9 @@ static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *
        return offset;
 }
 
-static void btree_node_read_all_replicas_done(struct closure *cl)
+static CLOSURE_CALLBACK(btree_node_read_all_replicas_done)
 {
-       struct btree_node_read_all *ra =
-               container_of(cl, struct btree_node_read_all, cl);
+       closure_type(ra, struct btree_node_read_all, cl);
        struct bch_fs *c = ra->c;
        struct btree *b = ra->b;
        struct printbuf buf = PRINTBUF;
@@ -1567,7 +1566,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
 
        if (sync) {
                closure_sync(&ra->cl);
-               btree_node_read_all_replicas_done(&ra->cl);
+               btree_node_read_all_replicas_done(&ra->cl.work);
        } else {
                continue_at(&ra->cl, btree_node_read_all_replicas_done,
                            c->io_complete_wq);
index 3128695062d9345b8e94b442db77e93c25897e0e..a52fd206f8222858264b75420b3fcb43511934f3 100644
--- a/libbcachefs/btree_iter.c
+++ b/libbcachefs/btree_iter.c
@@ -1854,19 +1854,11 @@ static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
                                              struct btree_iter *iter,
                                              struct bpos end_pos)
 {
-       struct bkey_i *k;
-
-       if (bpos_lt(iter->path->pos, iter->journal_pos))
-               iter->journal_idx = 0;
-
-       k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
-                                       iter->path->level,
-                                       iter->path->pos,
-                                       end_pos,
-                                       &iter->journal_idx);
-
-       iter->journal_pos = k ? k->k.p : end_pos;
-       return k;
+       return bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
+                                          iter->path->level,
+                                          iter->path->pos,
+                                          end_pos,
+                                          &iter->journal_idx);
 }
 
 static noinline
@@ -2874,7 +2866,8 @@ struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
        trans->fn_idx           = fn_idx;
        trans->locking_wait.task = current;
        trans->journal_replay_not_finished =
-               !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
+               unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) &&
+               atomic_inc_not_zero(&c->journal_keys.ref);
        closure_init_stack(&trans->ref);
 
        s = btree_trans_stats(trans);
@@ -2991,6 +2984,9 @@ void bch2_trans_put(struct btree_trans *trans)
                        kfree(trans->fs_usage_deltas);
        }
 
+       if (unlikely(trans->journal_replay_not_finished))
+               bch2_journal_keys_put(c);
+
        if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
                mempool_free(trans->mem, &c->btree_trans_mem_pool);
        else
index e5b989a8eb98ab330eeacadd6b8c24d72f440599..a4fec7cce911cb2878ed6ee181a8162bb057c0b0 100644
--- a/libbcachefs/btree_iter.h
+++ b/libbcachefs/btree_iter.h
@@ -445,14 +445,16 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
                                          unsigned flags,
                                          unsigned long ip)
 {
-       memset(iter, 0, sizeof(*iter));
-       iter->trans     = trans;
-       iter->btree_id  = btree_id;
-       iter->flags     = flags;
-       iter->snapshot  = pos.snapshot;
-       iter->pos       = pos;
-       iter->k.p       = pos;
-
+       iter->trans             = trans;
+       iter->update_path       = NULL;
+       iter->key_cache_path    = NULL;
+       iter->btree_id          = btree_id;
+       iter->min_depth         = 0;
+       iter->flags             = flags;
+       iter->snapshot          = pos.snapshot;
+       iter->pos               = pos;
+       iter->k                 = POS_KEY(pos);
+       iter->journal_idx       = 0;
 #ifdef CONFIG_BCACHEFS_DEBUG
        iter->ip_allocated = ip;
 #endif
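Note the memset() is gone from iterator init: every field that matters is now assigned explicitly, and iter->k is seeded with the new POS_KEY() macro from bcachefs_format.h (a bare key at pos, with zero size), rather than zeroing the whole struct.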
index 58a981bcf3aa8ca6749c41ca9e22655098adee14..7a5e0a893df924b35a6126829111c3240c847d1f 100644
--- a/libbcachefs/btree_journal_iter.c
+++ b/libbcachefs/btree_journal_iter.c
@@ -73,6 +73,7 @@ static size_t bch2_journal_key_search(struct journal_keys *keys,
        return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
 }
 
+/* Returns first non-overwritten key >= search key: */
 struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
                                           unsigned level, struct bpos pos,
                                           struct bpos end_pos, size_t *idx)
@@ -80,16 +81,32 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree
        struct journal_keys *keys = &c->journal_keys;
        unsigned iters = 0;
        struct journal_key *k;
+
+       BUG_ON(*idx > keys->nr);
 search:
        if (!*idx)
                *idx = __bch2_journal_key_search(keys, btree_id, level, pos);
 
+       while (*idx &&
+              __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
+               --(*idx);
+               iters++;
+               if (iters == 10) {
+                       *idx = 0;
+                       goto search;
+               }
+       }
+
        while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
                if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
                        return NULL;
 
-               if (__journal_key_cmp(btree_id, level, pos, k) <= 0 &&
-                   !k->overwritten)
+               if (k->overwritten) {
+                       (*idx)++;
+                       continue;
+               }
+
+               if (__journal_key_cmp(btree_id, level, pos, k) <= 0)
                        return k->k;
 
                (*idx)++;
@@ -189,10 +206,12 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
                /* Since @keys was full, there was no gap: */
                memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
                kvfree(keys->d);
-               *keys = new_keys;
+               keys->d         = new_keys.d;
+               keys->nr        = new_keys.nr;
+               keys->size      = new_keys.size;
 
                /* And now the gap is at the end: */
-               keys->gap = keys->nr;
+               keys->gap       = keys->nr;
        }
 
        journal_iters_move_gap(c, keys->gap, idx);
@@ -415,10 +434,16 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
                cmp_int(l->journal_offset, r->journal_offset);
 }
 
-void bch2_journal_keys_free(struct journal_keys *keys)
+void bch2_journal_keys_put(struct bch_fs *c)
 {
+       struct journal_keys *keys = &c->journal_keys;
        struct journal_key *i;
 
+       BUG_ON(atomic_read(&keys->ref) <= 0);
+
+       if (!atomic_dec_and_test(&keys->ref))
+               return;
+
        move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
        keys->gap = keys->nr;
 
@@ -429,6 +454,8 @@ void bch2_journal_keys_free(struct journal_keys *keys)
        kvfree(keys->d);
        keys->d = NULL;
        keys->nr = keys->gap = keys->size = 0;
+
+       bch2_journal_entries_free(c);
 }
 
 static void __journal_keys_sort(struct journal_keys *keys)
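bch2_journal_keys_peek_upto() now repairs a stale cursor itself, replacing the journal_pos tracking deleted from bch2_btree_journal_peek() above: it steps *idx back while the preceding key still sorts at or after end_pos, falling back to a full re-search after ten steps, and overwritten keys are skipped as before.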
index 5d64e7e22f262df66076bddd982d1b6cc8b4c85a..8ca4c100b2e3e413d7adbb8dd5599d9f42de6d30 100644
--- a/libbcachefs/btree_journal_iter.h
+++ b/libbcachefs/btree_journal_iter.h
@@ -49,7 +49,15 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
                                                struct bch_fs *,
                                                struct btree *);
 
-void bch2_journal_keys_free(struct journal_keys *);
+void bch2_journal_keys_put(struct bch_fs *);
+
+static inline void bch2_journal_keys_put_initial(struct bch_fs *c)
+{
+       if (c->journal_keys.initial_ref_held)
+               bch2_journal_keys_put(c);
+       c->journal_keys.initial_ref_held = false;
+}
+
 void bch2_journal_entries_free(struct bch_fs *);
 
 int bch2_journal_keys_sort(struct bch_fs *);
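Journal keys are now reference counted rather than freed at a fixed point in recovery: __bch2_trans_get() takes a ref with atomic_inc_not_zero() while replay is unfinished, bch2_trans_put() drops it, and recovery drops its own initial ref through bch2_journal_keys_put_initial(). A sketch of the pattern for a hypothetical reader:

	if (atomic_inc_not_zero(&c->journal_keys.ref)) {
		/* safe to use c->journal_keys here */
		bch2_journal_keys_put(c);	/* last put frees the keys and journal entries */
	}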
index 8e80a5b687fe04e685f61b022333befa218c0c9c..e14e9b4cd0298b70df3428c2ff0e51774f986ad4 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -646,11 +646,19 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
        if (journal_seq && ck->journal.seq != journal_seq)
                goto out;
 
+       trans->journal_res.seq = ck->journal.seq;
+
        /*
-        * Since journal reclaim depends on us making progress here, and the
-        * allocator/copygc depend on journal reclaim making progress, we need
-        * to be using alloc reserves:
+        * If we're at the end of the journal, we really want to free up space
+        * in the journal right away: we don't want to pin that old journal
+        * sequence number with a new btree node write; instead, we re-journal
+        * the update.
         */
+       if (ck->journal.seq == journal_last_seq(j))
+               commit_flags |= BCH_WATERMARK_reclaim;
+       else
+               commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
+
        ret   = bch2_btree_iter_traverse(&b_iter) ?:
                bch2_trans_update(trans, &b_iter, ck->k,
                                  BTREE_UPDATE_KEY_CACHE_RECLAIM|
@@ -659,9 +667,6 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
                bch2_trans_commit(trans, NULL, NULL,
                                  BCH_TRANS_COMMIT_no_check_rw|
                                  BCH_TRANS_COMMIT_no_enospc|
-                                 (ck->journal.seq == journal_last_seq(j)
-                                  ? BCH_WATERMARK_reclaim
-                                  : 0)|
                                  commit_flags);
 
        bch2_fs_fatal_err_on(ret &&
@@ -830,8 +835,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans,
 static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink,
                                           struct shrink_control *sc)
 {
-       struct bch_fs *c = container_of(shrink, struct bch_fs,
-                                       btree_key_cache.shrink);
+       struct bch_fs *c = shrink->private_data;
        struct btree_key_cache *bc = &c->btree_key_cache;
        struct bucket_table *tbl;
        struct bkey_cached *ck, *t;
@@ -932,8 +936,7 @@ out:
 static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink,
                                            struct shrink_control *sc)
 {
-       struct bch_fs *c = container_of(shrink, struct bch_fs,
-                                       btree_key_cache.shrink);
+       struct bch_fs *c = shrink->private_data;
        struct btree_key_cache *bc = &c->btree_key_cache;
        long nr = atomic_long_read(&bc->nr_keys) -
                atomic_long_read(&bc->nr_dirty);
@@ -953,7 +956,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
        int cpu;
 #endif
 
-       unregister_shrinker(&bc->shrink);
+       shrinker_free(bc->shrink);
 
        mutex_lock(&bc->lock);
 
@@ -1028,8 +1031,8 @@ void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c)
 
 static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink)
 {
-       struct btree_key_cache *bc =
-               container_of(shrink, struct btree_key_cache, shrink);
+       struct bch_fs *c = shrink->private_data;
+       struct btree_key_cache *bc = &c->btree_key_cache;
        char *cbuf;
        size_t buflen = seq_buf_get_buf(s, &cbuf);
        struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen);
@@ -1041,6 +1044,7 @@ static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shri
 int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
 {
        struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache);
+       struct shrinker *shrink;
 
 #ifdef __KERNEL__
        bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist);
@@ -1053,12 +1057,16 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc)
 
        bc->table_init_done = true;
 
-       bc->shrink.seeks                = 0;
-       bc->shrink.count_objects        = bch2_btree_key_cache_count;
-       bc->shrink.scan_objects         = bch2_btree_key_cache_scan;
-       bc->shrink.to_text              = bch2_btree_key_cache_shrinker_to_text;
-       if (register_shrinker(&bc->shrink, "%s-btree_key_cache", c->name))
+       shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name);
+       if (!shrink)
                return -BCH_ERR_ENOMEM_fs_btree_cache_init;
+       bc->shrink = shrink;
+       shrink->seeks           = 0;
+       shrink->count_objects   = bch2_btree_key_cache_count;
+       shrink->scan_objects    = bch2_btree_key_cache_scan;
+       shrink->to_text         = bch2_btree_key_cache_shrinker_to_text;
+       shrink->private_data    = c;
+       shrinker_register(shrink);
        return 0;
 }
 
index cfd09f5199658e2143dd1469ab5dadeb02fa249c..290e4e57df5bbcfeffb38d666aa18c89a7c1c5a6 100644
--- a/libbcachefs/btree_key_cache_types.h
+++ b/libbcachefs/btree_key_cache_types.h
@@ -17,7 +17,7 @@ struct btree_key_cache {
        struct list_head        freed_nonpcpu;
        size_t                  nr_freed_nonpcpu;
 
-       struct shrinker         shrink;
+       struct shrinker         *shrink;
        unsigned                shrink_iter;
        struct btree_key_cache_freelist __percpu *pcpu_freed;
 
index 70077efae7889bb143195821b0d039d2dca64a89..09e94cc460ef9d8d39115b305dce1437d118b763 100644 (file)
@@ -660,10 +660,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
                i->k->k.needs_whiteout = false;
        }
 
-       if (trans->nr_wb_updates &&
-           trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size)
-               return -BCH_ERR_btree_insert_need_flush_buffer;
-
        /*
         * Don't get journal reservation until after we know insert will
         * succeed:
@@ -698,14 +694,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
            bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
                return -BCH_ERR_btree_insert_need_mark_replicas;
 
-       if (trans->nr_wb_updates) {
-               EBUG_ON(flags & BCH_TRANS_COMMIT_no_journal_res);
-
-               ret = bch2_btree_insert_keys_write_buffer(trans);
-               if (ret)
-                       goto revert_fs_usage;
-       }
-
        h = trans->hooks;
        while (h) {
                ret = h->fn(trans, h);
@@ -767,7 +755,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
 
                trans_for_each_wb_update(trans, wb) {
                        entry = bch2_journal_add_entry(j, &trans->journal_res,
-                                              BCH_JSET_ENTRY_btree_keys,
+                                              BCH_JSET_ENTRY_write_buffer_keys,
                                               wb->btree, 0,
                                               wb->k.k.u64s);
                        bkey_copy((struct bkey_i *) entry->start, &wb->k);
@@ -951,30 +939,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
 
                ret = bch2_trans_relock(trans);
                break;
-       case -BCH_ERR_btree_insert_need_flush_buffer: {
-               struct btree_write_buffer *wb = &c->btree_write_buffer;
-
-               ret = 0;
-
-               if (wb->state.nr > wb->size * 3 / 4) {
-                       bch2_trans_unlock(trans);
-                       mutex_lock(&wb->flush_lock);
-
-                       if (wb->state.nr > wb->size * 3 / 4) {
-                               bch2_trans_begin(trans);
-                               ret = __bch2_btree_write_buffer_flush(trans,
-                                               flags|BCH_TRANS_COMMIT_no_check_rw, true);
-                               if (!ret) {
-                                       trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
-                                       ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
-                               }
-                       } else {
-                               mutex_unlock(&wb->flush_lock);
-                               ret = bch2_trans_relock(trans);
-                       }
-               }
-               break;
-       }
        default:
                BUG_ON(ret >= 0);
                break;
@@ -1073,20 +1037,6 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
                        goto out_reset;
        }
 
-       if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 &&
-           mutex_trylock(&c->btree_write_buffer.flush_lock)) {
-               bch2_trans_begin(trans);
-               bch2_trans_unlock(trans);
-
-               ret = __bch2_btree_write_buffer_flush(trans,
-                                       flags|BCH_TRANS_COMMIT_no_check_rw, true);
-               if (!ret) {
-                       trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
-                       ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
-               }
-               goto out;
-       }
-
        EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
 
        trans->journal_u64s             = trans->extra_journal_entries.nr;
index e58575ad104542bf1d5f5f111a5bf1ee10255ba0..14983e778756f4be403a74718cabefd6d3eeb023 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -173,7 +173,7 @@ struct btree_cache {
        unsigned                not_freed_will_make_reachable;
        unsigned                not_freed_access_bit;
        atomic_t                dirty;
-       struct shrinker         shrink;
+       struct shrinker         *shrink;
 
        /*
         * If we need to allocate memory for a new btree node and that
@@ -297,8 +297,7 @@ struct btree_iter {
        struct btree_path       *key_cache_path;
 
        enum btree_id           btree_id:8;
-       unsigned                min_depth:3;
-       unsigned                advanced:1;
+       u8                      min_depth;
 
        /* btree_iter_copy starts here: */
        u16                     flags;
@@ -315,7 +314,6 @@ struct btree_iter {
 
        /* BTREE_ITER_WITH_JOURNAL: */
        size_t                  journal_idx;
-       struct bpos             journal_pos;
 #ifdef TRACK_PATH_ALLOCATED
        unsigned long           ip_allocated;
 #endif
index 18e5a75142e9a5e95b68661965342e8b923e140e..bfe4d7975bd8738e1af5cbfb4e33f5f8ac1bc9c8 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -774,9 +774,9 @@ static void btree_interior_update_work(struct work_struct *work)
        }
 }
 
-static void btree_update_set_nodes_written(struct closure *cl)
+static CLOSURE_CALLBACK(btree_update_set_nodes_written)
 {
-       struct btree_update *as = container_of(cl, struct btree_update, cl);
+       closure_type(as, struct btree_update, cl);
        struct bch_fs *c = as->c;
 
        mutex_lock(&c->btree_interior_update_lock);
index a6bf6ed37ced60cfee4bb61c15c47c06d5ace9c7..0c2db1fab38acabd96253931bb7acd8935e7966a 100644
--- a/libbcachefs/btree_write_buffer.c
+++ b/libbcachefs/btree_write_buffer.c
 #include "btree_write_buffer.h"
 #include "error.h"
 #include "journal.h"
+#include "journal_io.h"
 #include "journal_reclaim.h"
 
-#include <linux/sort.h>
+#include <linux/prefetch.h>
 
 static int bch2_btree_write_buffer_journal_flush(struct journal *,
                                struct journal_entry_pin *, u64);
 
-static int btree_write_buffered_key_cmp(const void *_l, const void *_r)
+static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *);
+
+static inline bool __wb_key_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
 {
-       const struct btree_write_buffered_key *l = _l;
-       const struct btree_write_buffered_key *r = _r;
+       return (cmp_int(l->hi, r->hi) ?:
+               cmp_int(l->mi, r->mi) ?:
+               cmp_int(l->lo, r->lo)) >= 0;
+}
 
-       return  cmp_int(l->btree, r->btree) ?:
-               bpos_cmp(l->k.k.p, r->k.k.p) ?:
-               cmp_int(l->journal_seq, r->journal_seq) ?:
-               cmp_int(l->journal_offset, r->journal_offset);
+static inline bool wb_key_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
+{
+#ifdef CONFIG_X86_64
+       int cmp;
+
+       asm(".intel_syntax noprefix;"
+           "mov rax, [%[l]];"
+           "sub rax, [%[r]];"
+           "mov rax, [%[l] + 8];"
+           "sbb rax, [%[r] + 8];"
+           "mov rax, [%[l] + 16];"
+           "sbb rax, [%[r] + 16];"
+           ".att_syntax prefix;"
+           : "=@ccae" (cmp)
+           : [l] "r" (l), [r] "r" (r)
+           : "rax", "cc");
+
+       EBUG_ON(cmp != __wb_key_cmp(l, r));
+       return cmp;
+#else
+       return __wb_key_cmp(l, r);
+#endif
 }
 
-static int btree_write_buffered_journal_cmp(const void *_l, const void *_r)
+/* Compare excluding idx, the low 24 bits: */
+static inline bool wb_key_eq(const void *_l, const void *_r)
 {
-       const struct btree_write_buffered_key *l = _l;
-       const struct btree_write_buffered_key *r = _r;
+       const struct wb_key_ref *l = _l;
+       const struct wb_key_ref *r = _r;
 
-       return  cmp_int(l->journal_seq, r->journal_seq);
+       return !((l->hi ^ r->hi)|
+                (l->mi ^ r->mi)|
+                ((l->lo >> 24) ^ (r->lo >> 24)));
 }
 
-static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
-                                            struct btree_iter *iter,
-                                            struct btree_write_buffered_key *wb,
-                                            unsigned commit_flags,
-                                            bool *write_locked,
-                                            size_t *fast)
+static noinline void wb_sort(struct wb_key_ref *base, size_t num)
+{
+       size_t n = num, a = num / 2;
+
+       if (!a)         /* num < 2 */
+               return;
+
+       for (;;) {
+               size_t b, c, d;
+
+               if (a)                  /* Building heap: sift down --a */
+                       --a;
+               else if (--n)           /* Sorting: Extract root to --n */
+                       swap(base[0], base[n]);
+               else                    /* Sort complete */
+                       break;
+
+               /*
+                * Sift element at "a" down into heap.  This is the
+                * "bottom-up" variant, which significantly reduces
+                * calls to cmp_func(): we find the sift-down path all
+                * the way to the leaves (one compare per level), then
+                * backtrack to find where to insert the target element.
+                *
+                * Because elements tend to sift down close to the leaves,
+                * this uses fewer compares than doing two per level
+                * on the way down.  (A bit more than half as many on
+                * average, 3/4 worst-case.)
+                */
+               for (b = a; c = 2*b + 1, (d = c + 1) < n;)
+                       b = wb_key_cmp(base + c, base + d) ? c : d;
+               if (d == n)             /* Special case last leaf with no sibling */
+                       b = c;
+
+               /* Now backtrack from "b" to the correct location for "a" */
+               while (b != a && wb_key_cmp(base + a, base + b))
+                       b = (b - 1) / 2;
+               c = b;                  /* Where "a" belongs */
+               while (b != a) {        /* Shift it into place */
+                       b = (b - 1) / 2;
+                       swap(base[b], base[c]);
+               }
+       }
+}
+
+static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
+                                         struct btree_iter *iter,
+                                         struct btree_write_buffered_key *wb)
+{
+       bch2_btree_node_unlock_write(trans, iter->path, iter->path->l[0].b);
+
+       trans->journal_res.seq = wb->journal_seq;
+
+       return bch2_trans_update(trans, iter, &wb->k,
+                                BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+               bch2_trans_commit(trans, NULL, NULL,
+                                 BCH_TRANS_COMMIT_no_enospc|
+                                 BCH_TRANS_COMMIT_no_check_rw|
+                                 BCH_TRANS_COMMIT_no_journal_res|
+                                 BCH_TRANS_COMMIT_journal_reclaim);
+}
+
+static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
+                              struct btree_write_buffered_key *wb,
+                              bool *write_locked, size_t *fast)
 {
        struct bch_fs *c = trans->c;
        struct btree_path *path;
        int ret;
 
+       EBUG_ON(!wb->journal_seq);
+       EBUG_ON(!c->btree_write_buffer.flushing.pin.seq);
+       EBUG_ON(c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
+
        ret = bch2_btree_iter_traverse(iter);
        if (ret)
                return ret;
@@ -66,46 +155,14 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
                *write_locked = true;
        }
 
-       if (!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s)) {
-               bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+       if (unlikely(!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s))) {
                *write_locked = false;
-               goto trans_commit;
+               return wb_flush_one_slowpath(trans, iter, wb);
        }
 
        bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
        (*fast)++;
        return 0;
-trans_commit:
-       trans->journal_res.seq = wb->journal_seq;
-
-       return  bch2_trans_update(trans, iter, &wb->k,
-                                 BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
-               bch2_trans_commit(trans, NULL, NULL,
-                                 commit_flags|
-                                 BCH_TRANS_COMMIT_no_check_rw|
-                                 BCH_TRANS_COMMIT_no_enospc|
-                                 BCH_TRANS_COMMIT_no_journal_res|
-                                 BCH_TRANS_COMMIT_journal_reclaim);
-}
-
-static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb)
-{
-       union btree_write_buffer_state old, new;
-       u64 v = READ_ONCE(wb->state.v);
-
-       do {
-               old.v = new.v = v;
-
-               new.nr = 0;
-               new.idx++;
-       } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
-
-       while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1)
-               cpu_relax();
-
-       smp_mb();
-
-       return old;
 }
 
 /*
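The rework below replaces the old single flat buffer (the state/idx double-buffering removed above) with two darrays: incoming keys accumulate in wb->inc, filled from journal buffers via the new BCH_JSET_ENTRY_write_buffer_keys entries, and are moved in bulk to wb->flushing under wb->inc.lock, with each side holding its own journal pin; wb->sorted carries compact 24-byte wb_key_ref descriptors so wb_sort() moves small references instead of full keys.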
@@ -137,35 +194,79 @@ btree_write_buffered_insert(struct btree_trans *trans,
        return ret;
 }
 
-int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags,
-                                   bool locked)
+static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb)
+{
+       struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer);
+       struct journal *j = &c->journal;
+
+       if (!wb->inc.keys.nr)
+               return;
+
+       bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin,
+                            bch2_btree_write_buffer_journal_flush);
+
+       darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr));
+       darray_resize(&wb->sorted, wb->flushing.keys.size);
+
+       if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) {
+               swap(wb->flushing.keys, wb->inc.keys);
+               goto out;
+       }
+
+       size_t nr = min(darray_room(wb->flushing.keys),
+                       wb->sorted.size - wb->flushing.keys.nr);
+       nr = min(nr, wb->inc.keys.nr);
+
+       memcpy(&darray_top(wb->flushing.keys),
+              wb->inc.keys.data,
+              sizeof(wb->inc.keys.data[0]) * nr);
+
+       memmove(wb->inc.keys.data,
+               wb->inc.keys.data + nr,
+              sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr));
+
+       wb->flushing.keys.nr    += nr;
+       wb->inc.keys.nr         -= nr;
+out:
+       if (!wb->inc.keys.nr)
+               bch2_journal_pin_drop(j, &wb->inc.pin);
+       else
+               bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin,
+                                       bch2_btree_write_buffer_journal_flush);
+
+       if (j->watermark) {
+               spin_lock(&j->lock);
+               bch2_journal_set_watermark(j);
+               spin_unlock(&j->lock);
+       }
+
+       BUG_ON(wb->sorted.size < wb->flushing.keys.nr);
+}
+
+static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 {
        struct bch_fs *c = trans->c;
        struct journal *j = &c->journal;
        struct btree_write_buffer *wb = &c->btree_write_buffer;
-       struct journal_entry_pin pin;
-       struct btree_write_buffered_key *i, *keys;
+       struct wb_key_ref *i;
        struct btree_iter iter = { NULL };
-       size_t nr = 0, skipped = 0, fast = 0, slowpath = 0;
+       size_t skipped = 0, fast = 0, slowpath = 0;
        bool write_locked = false;
-       union btree_write_buffer_state s;
        int ret = 0;
 
-       memset(&pin, 0, sizeof(pin));
-
-       if (!locked && !mutex_trylock(&wb->flush_lock))
-               return 0;
-
-       bch2_journal_pin_copy(j, &pin, &wb->journal_pin,
-                             bch2_btree_write_buffer_journal_flush);
-       bch2_journal_pin_drop(j, &wb->journal_pin);
+       bch2_trans_unlock(trans);
+       bch2_trans_begin(trans);
 
-       s = btree_write_buffer_switch(wb);
-       keys = wb->keys[s.idx];
-       nr = s.nr;
+       mutex_lock(&wb->inc.lock);
+       move_keys_from_inc_to_flushing(wb);
+       mutex_unlock(&wb->inc.lock);
 
-       if (race_fault())
-               goto slowpath;
+       for (size_t i = 0; i < wb->flushing.keys.nr; i++) {
+               wb->sorted.data[i].idx = i;
+               wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree;
+               memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos));
+       }
+       wb->sorted.nr = wb->flushing.keys.nr;
 
        /*
         * We first sort so that we can detect and skip redundant updates, and
@@ -181,110 +282,178 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
         * If that happens, simply skip the key so we can optimistically insert
         * as many keys as possible in the fast path.
         */
-       sort(keys, nr, sizeof(keys[0]),
-            btree_write_buffered_key_cmp, NULL);
+       wb_sort(wb->sorted.data, wb->sorted.nr);
+
+       darray_for_each(wb->sorted, i) {
+               struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
+
+               for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
+                       prefetch(&wb->flushing.keys.data[n->idx]);
+
+               BUG_ON(!k->journal_seq);
+
+               if (i + 1 < &darray_top(wb->sorted) &&
+                   wb_key_eq(i, i + 1)) {
+                       struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
 
-       for (i = keys; i < keys + nr; i++) {
-               if (i + 1 < keys + nr &&
-                   i[0].btree == i[1].btree &&
-                   bpos_eq(i[0].k.k.p, i[1].k.k.p)) {
                        skipped++;
-                       i->journal_seq = 0;
+                       n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
+                       k->journal_seq = 0;
                        continue;
                }
 
                if (write_locked &&
-                   (iter.path->btree_id != i->btree ||
-                    bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) {
+                   (iter.path->btree_id != k->btree ||
+                    bpos_gt(k->k.k.p, iter.path->l[0].b->key.k.p))) {
                        bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
                        write_locked = false;
                }
 
-               if (!iter.path || iter.path->btree_id != i->btree) {
+               if (!iter.path || iter.path->btree_id != k->btree) {
                        bch2_trans_iter_exit(trans, &iter);
-                       bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p,
+                       bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p,
                                             BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS);
                }
 
-               bch2_btree_iter_set_pos(&iter, i->k.k.p);
+               bch2_btree_iter_set_pos(&iter, k->k.k.p);
                iter.path->preserve = false;
 
                do {
-                       ret = bch2_btree_write_buffer_flush_one(trans, &iter, i,
-                                               commit_flags, &write_locked, &fast);
+                       if (race_fault()) {
+                               ret = -BCH_ERR_journal_reclaim_would_deadlock;
+                               break;
+                       }
+
+                       ret = wb_flush_one(trans, &iter, k, &write_locked, &fast);
                        if (!write_locked)
                                bch2_trans_begin(trans);
                } while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
 
-               if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
+               if (!ret) {
+                       k->journal_seq = 0;
+               } else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
                        slowpath++;
-                       continue;
-               }
-               if (ret)
+                       ret = 0;
+               } else
                        break;
-
-               i->journal_seq = 0;
        }
 
        if (write_locked)
                bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
        bch2_trans_iter_exit(trans, &iter);
 
-       trace_write_buffer_flush(trans, nr, skipped, fast, wb->size);
+       if (ret)
+               goto err;
+
+       if (slowpath) {
+               /*
+                * Flush the remaining keys in the order they were present in
+                * the journal, so that we can release journal pins as we go:
+                * the fast path zeroed journal_seq for keys it flushed
+                * successfully, so we can skip those here.
+                */
+               trace_write_buffer_flush_slowpath(trans, slowpath, wb->flushing.keys.nr);
+
+               struct btree_write_buffered_key *i;
+               darray_for_each(wb->flushing.keys, i) {
+                       if (!i->journal_seq)
+                               continue;
+
+                       bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
+                                               bch2_btree_write_buffer_journal_flush);
+
+                       bch2_trans_begin(trans);
+
+                       ret = commit_do(trans, NULL, NULL,
+                                       BCH_WATERMARK_reclaim|
+                                       BCH_TRANS_COMMIT_no_check_rw|
+                                       BCH_TRANS_COMMIT_no_enospc|
+                                       BCH_TRANS_COMMIT_no_journal_res|
+                                       BCH_TRANS_COMMIT_journal_reclaim,
+                                       btree_write_buffered_insert(trans, i));
+                       if (ret)
+                               goto err;
+               }
+       }
+err:
+       bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
+       trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
+       bch2_journal_pin_drop(j, &wb->flushing.pin);
+       wb->flushing.keys.nr = 0;
+       return ret;
+}
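
The rewritten flush runs in two phases: a fast path that visits keys in
(btree, position) order so each leaf node is write-locked once per run of
keys, and a slow path, taken only for keys that would have deadlocked
against journal reclaim, which commits in journal-sequence order so the
journal pin can be released incrementally. Since keys are appended to
wb->flushing.keys journal entry by journal entry, they are already in
journal order, which is why the old sort-by-journal-seq step disappears.
A toy model of the two orderings, with simplified types and a stand-in
flush_one_pos_order() that is not the real btree update path:

        #include <stddef.h>
        #include <stdint.h>

        struct wb_key {
                uint64_t        journal_seq;    /* zeroed once flushed */
                uint64_t        pos;
        };

        /* stand-in: nonzero when the update would deadlock against
         * journal reclaim and must be retried on the slow path: */
        static int flush_one_pos_order(struct wb_key *k)
        {
                return k->pos & 1;
        }

        static void flush(struct wb_key *keys, size_t nr)
        {
                size_t slowpath = 0;

                /* phase 1: the real code walks refs sorted by position;
                 * here keys[] stands in for that: */
                for (size_t i = 0; i < nr; i++)
                        if (!flush_one_pos_order(&keys[i]))
                                keys[i].journal_seq = 0;
                        else
                                slowpath++;

                if (!slowpath)
                        return;

                /* phase 2: keys[] is already in journal order, so a
                 * forward scan can advance the journal pin as it goes: */
                for (size_t i = 0; i < nr; i++)
                        if (keys[i].journal_seq)
                                ; /* update pin to this seq, then commit */
        }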
 
-       if (slowpath)
-               goto slowpath;
+static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq)
+{
+       struct journal *j = &c->journal;
+       struct journal_buf *buf;
+       int ret = 0;
+
+       mutex_lock(&j->buf_lock);
+       while ((buf = bch2_next_write_buffer_flush_journal_buf(j, seq)))
+               if (bch2_journal_keys_to_write_buffer(c, buf)) {
+                       ret = -ENOMEM;
+                       break;
+               }
+       mutex_unlock(&j->buf_lock);
 
-       bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
-out:
-       bch2_journal_pin_drop(j, &pin);
-       mutex_unlock(&wb->flush_lock);
        return ret;
-slowpath:
-       trace_write_buffer_flush_slowpath(trans, i - keys, nr);
+}
+
+int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_write_buffer *wb = &c->btree_write_buffer;
+       int ret = 0, fetch_from_journal_err;
+
+       trace_write_buffer_flush_sync(trans, _RET_IP_);
+retry:
+       bch2_trans_unlock(trans);
+
+       bch2_journal_block_reservations(&c->journal);
+       fetch_from_journal_err = fetch_wb_keys_from_journal(c, U64_MAX);
+       bch2_journal_unblock(&c->journal);
 
        /*
-        * Now sort the rest by journal seq and bump the journal pin as we go.
-        * The slowpath zapped the seq of keys that were successfully flushed so
-        * we can skip those here.
+        * On memory allocation failure, bch2_btree_write_buffer_flush_locked()
+        * is not guaranteed to empty wb->inc:
         */
-       sort(keys, nr, sizeof(keys[0]),
-            btree_write_buffered_journal_cmp,
-            NULL);
+       mutex_lock(&wb->flushing.lock);
+       while (!ret &&
+              (wb->flushing.keys.nr || wb->inc.keys.nr))
+               ret = bch2_btree_write_buffer_flush_locked(trans);
+       mutex_unlock(&wb->flushing.lock);
 
-       commit_flags &= ~BCH_WATERMARK_MASK;
-       commit_flags |= BCH_WATERMARK_reclaim;
+       if (!ret && fetch_from_journal_err)
+               goto retry;
 
-       for (i = keys; i < keys + nr; i++) {
-               if (!i->journal_seq)
-                       continue;
+       return ret;
+}
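
bch2_btree_write_buffer_flush_sync() first blocks journal reservations so
every journal buffer holding write-buffer keys can be drained, then flushes
until both wb->flushing and wb->inc are empty; if pulling keys out of the
journal failed for lack of memory, flushing will have freed up room, so it
loops back and retries the fetch. A hedged caller-side sketch, mirroring
the ec.c change later in this commit (the wrapper function is
hypothetical):

        static int scan_with_updates_visible(struct btree_trans *trans)
        {
                /* make buffered updates visible in the btree before
                 * iterating: */
                int ret = bch2_btree_write_buffer_flush_sync(trans);
                if (ret)
                        return ret;

                /* ... iterate the btree normally ... */
                return 0;
        }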
 
-               bch2_journal_pin_update(j, i->journal_seq, &pin,
-                             bch2_btree_write_buffer_journal_flush);
+int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_write_buffer *wb = &c->btree_write_buffer;
+       int ret = 0;
 
-               ret = commit_do(trans, NULL, NULL,
-                               commit_flags|
-                               BCH_TRANS_COMMIT_no_enospc|
-                               BCH_TRANS_COMMIT_no_journal_res|
-                               BCH_TRANS_COMMIT_journal_reclaim,
-                               btree_write_buffered_insert(trans, i));
-               if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
-                       break;
+       if (mutex_trylock(&wb->flushing.lock)) {
+               ret = bch2_btree_write_buffer_flush_locked(trans);
+               mutex_unlock(&wb->flushing.lock);
        }
 
-       goto out;
+       return ret;
 }
 
-int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
+int bch2_btree_write_buffer_tryflush(struct btree_trans *trans)
 {
-       bch2_trans_unlock(trans);
-       mutex_lock(&trans->c->btree_write_buffer.flush_lock);
-       return __bch2_btree_write_buffer_flush(trans, 0, true);
-}
+       struct bch_fs *c = trans->c;
 
-int bch2_btree_write_buffer_flush(struct btree_trans *trans)
-{
-       return __bch2_btree_write_buffer_flush(trans, 0, false);
+       if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer))
+               return -BCH_ERR_erofs_no_writes;
+
+       int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans);
+       bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+       return ret;
 }
 
 static int bch2_btree_write_buffer_journal_flush(struct journal *j,
@@ -292,84 +461,195 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j,
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct btree_write_buffer *wb = &c->btree_write_buffer;
+       int ret, fetch_from_journal_err;
+
+       do {
+               fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq);
 
-       mutex_lock(&wb->flush_lock);
+               mutex_lock(&wb->flushing.lock);
+               ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans));
+               mutex_unlock(&wb->flushing.lock);
+       } while (!ret &&
+                (fetch_from_journal_err ||
+                 (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq) ||
+                 (wb->inc.pin.seq && wb->inc.pin.seq <= seq)));
 
-       return bch2_trans_run(c,
-                       __bch2_btree_write_buffer_flush(trans, BCH_TRANS_COMMIT_no_check_rw, true));
+       return ret;
 }
 
-static inline u64 btree_write_buffer_ref(int idx)
+static void bch2_btree_write_buffer_flush_work(struct work_struct *work)
 {
-       return ((union btree_write_buffer_state) {
-               .ref0 = idx == 0,
-               .ref1 = idx == 1,
-       }).v;
+       struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work);
+       struct btree_write_buffer *wb = &c->btree_write_buffer;
+       int ret;
+
+       mutex_lock(&wb->flushing.lock);
+       do {
+               ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans));
+       } while (!ret && bch2_btree_write_buffer_should_flush(c));
+       mutex_unlock(&wb->flushing.lock);
+
+       bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
 }
 
-int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans)
+int __bch2_journal_key_to_wb(struct bch_fs *c,
+                            struct journal_keys_to_wb *dst,
+                            enum btree_id btree, struct bkey_i *k)
 {
-       struct bch_fs *c = trans->c;
        struct btree_write_buffer *wb = &c->btree_write_buffer;
-       struct btree_write_buffered_key *i;
-       union btree_write_buffer_state old, new;
-       int ret = 0;
-       u64 v;
-
-       trans_for_each_wb_update(trans, i) {
-               EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
+       int ret;
+retry:
+       ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL);
+       if (!ret && dst->wb == &wb->flushing)
+               ret = darray_resize(&wb->sorted, wb->flushing.keys.size);
+
+       if (unlikely(ret)) {
+               if (dst->wb == &c->btree_write_buffer.flushing) {
+                       mutex_unlock(&dst->wb->lock);
+                       dst->wb = &c->btree_write_buffer.inc;
+                       bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin,
+                                            bch2_btree_write_buffer_journal_flush);
+                       goto retry;
+               }
 
-               i->journal_seq          = trans->journal_res.seq;
-               i->journal_offset       = trans->journal_res.offset;
+               return ret;
        }
 
-       preempt_disable();
-       v = READ_ONCE(wb->state.v);
-       do {
-               old.v = new.v = v;
+       dst->room = darray_room(dst->wb->keys);
+       if (dst->wb == &wb->flushing)
+               dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
+       BUG_ON(!dst->room);
+       BUG_ON(!dst->seq);
+
+       struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
+       wb_k->journal_seq       = dst->seq;
+       wb_k->btree             = btree;
+       bkey_copy(&wb_k->k, k);
+       dst->wb->keys.nr++;
+       dst->room--;
+       return 0;
+}
+
+void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq)
+{
+       struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+       if (mutex_trylock(&wb->flushing.lock)) {
+               mutex_lock(&wb->inc.lock);
+               move_keys_from_inc_to_flushing(wb);
+
+               /*
+                * Attempt to skip wb->inc, and add keys directly to
+                * wb->flushing, saving us a copy later:
+                */
 
-               new.v += btree_write_buffer_ref(new.idx);
-               new.nr += trans->nr_wb_updates;
-               if (new.nr > wb->size) {
-                       ret = -BCH_ERR_btree_insert_need_flush_buffer;
-                       goto out;
+               if (!wb->inc.keys.nr) {
+                       dst->wb = &wb->flushing;
+               } else {
+                       mutex_unlock(&wb->flushing.lock);
+                       dst->wb = &wb->inc;
                }
-       } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
+       } else {
+               mutex_lock(&wb->inc.lock);
+               dst->wb = &wb->inc;
+       }
 
-       memcpy(wb->keys[new.idx] + old.nr,
-              trans->wb_updates,
-              sizeof(trans->wb_updates[0]) * trans->nr_wb_updates);
+       dst->room = darray_room(dst->wb->keys);
+       if (dst->wb == &wb->flushing)
+               dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
+       dst->seq = seq;
 
-       bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin,
+       bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin,
                             bch2_btree_write_buffer_journal_flush);
+}
+
+void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst)
+{
+       struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+       if (!dst->wb->keys.nr)
+               bch2_journal_pin_drop(&c->journal, &dst->wb->pin);
+
+       if (bch2_btree_write_buffer_should_flush(c) &&
+           __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) &&
+           !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work))
+               bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+
+       if (dst->wb == &wb->flushing)
+               mutex_unlock(&wb->flushing.lock);
+       mutex_unlock(&wb->inc.lock);
+}
+
+static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
+{
+       struct journal_keys_to_wb dst;
+       struct jset_entry *entry;
+       struct bkey_i *k;
+       int ret = 0;
+
+       bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));
+
+       for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) {
+               jset_entry_for_each_key(entry, k) {
+                       ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k);
+                       if (ret)
+                               goto out;
+               }
 
-       atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter);
+               entry->type = BCH_JSET_ENTRY_btree_keys;
+       }
+
+       buf->need_flush_to_write_buffer = false;
 out:
-       preempt_enable();
+       bch2_journal_keys_to_write_buffer_end(c, &dst);
+       return ret;
+}
+
+static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size)
+{
+       if (wb->keys.size >= new_size)
+               return 0;
+
+       if (!mutex_trylock(&wb->lock))
+               return -EINTR;
+
+       int ret = darray_resize(&wb->keys, new_size);
+       mutex_unlock(&wb->lock);
        return ret;
 }
 
+int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size)
+{
+       struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+       return wb_keys_resize(&wb->flushing, new_size) ?:
+               wb_keys_resize(&wb->inc, new_size);
+}
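
The two resize calls chain with the GNU a ?: b extension, which bcachefs
uses throughout for error propagation: expressions evaluate left to right
and the first nonzero error code short-circuits out. A generic
illustration with hypothetical step functions:

        /* equivalent to: ret = step1(); if (!ret) ret = step2(); */
        int ret = step1() ?: step2();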
+
 void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
 {
        struct btree_write_buffer *wb = &c->btree_write_buffer;
 
-       BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal));
+       BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) &&
+              !bch2_journal_error(&c->journal));
 
-       kvfree(wb->keys[1]);
-       kvfree(wb->keys[0]);
+       darray_exit(&wb->sorted);
+       darray_exit(&wb->flushing.keys);
+       darray_exit(&wb->inc.keys);
 }
 
 int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
 {
        struct btree_write_buffer *wb = &c->btree_write_buffer;
 
-       mutex_init(&wb->flush_lock);
-       wb->size = c->opts.btree_write_buffer_size;
+       mutex_init(&wb->inc.lock);
+       mutex_init(&wb->flushing.lock);
+       INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work);
 
-       wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL);
-       wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL);
-       if (!wb->keys[0] || !wb->keys[1])
-               return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init;
+       /* Will be resized by journal as needed: */
+       unsigned initial_size = 1 << 16;
 
-       return 0;
+       return  darray_make_room(&wb->inc.keys, initial_size) ?:
+               darray_make_room(&wb->flushing.keys, initial_size) ?:
+               darray_make_room(&wb->sorted, initial_size);
 }
index 322df1c8304e09415238b42f14095789a146ffb3..1f645f529ed21bf7afba7b67bac3ff1b28df2b1b 100644 (file)
@@ -2,12 +2,59 @@
 #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H
 #define _BCACHEFS_BTREE_WRITE_BUFFER_H
 
-int __bch2_btree_write_buffer_flush(struct btree_trans *, unsigned, bool);
+#include "bkey.h"
+
+static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c)
+{
+       struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+       return wb->inc.keys.nr + wb->flushing.keys.nr > wb->inc.keys.size / 4;
+}
+
+static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c)
+{
+       struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+       return wb->inc.keys.nr > wb->inc.keys.size * 3 / 4;
+}
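
With the initial size of 1 << 16 keys chosen in
bch2_fs_btree_write_buffer_init(), these thresholds work out as follows: a
background flush is queued once inc plus flushing hold more than 16384
buffered keys (a quarter of the size), and the journal drops to the
reclaim watermark once wb->inc alone holds more than 49152 keys (three
quarters); both limits scale as the journal resizes the buffer.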
+
+struct btree_trans;
 int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
-int bch2_btree_write_buffer_flush(struct btree_trans *);
+int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *);
+int bch2_btree_write_buffer_tryflush(struct btree_trans *);
+
+struct journal_keys_to_wb {
+       struct btree_write_buffer_keys  *wb;
+       size_t                          room;
+       u64                             seq;
+};
+
+int __bch2_journal_key_to_wb(struct bch_fs *,
+                            struct journal_keys_to_wb *,
+                            enum btree_id, struct bkey_i *);
+
+static inline int bch2_journal_key_to_wb(struct bch_fs *c,
+                            struct journal_keys_to_wb *dst,
+                            enum btree_id btree, struct bkey_i *k)
+{
+       EBUG_ON(!dst->seq);
+
+       if (unlikely(!dst->room))
+               return __bch2_journal_key_to_wb(c, dst, btree, k);
+
+       struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
+       wb_k->journal_seq       = dst->seq;
+       wb_k->btree             = btree;
+       bkey_copy(&wb_k->k, k);
+       dst->wb->keys.nr++;
+       dst->room--;
+       return 0;
+}
 
-int bch2_btree_insert_keys_write_buffer(struct btree_trans *);
+void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64);
+void bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *);
 
+int bch2_btree_write_buffer_resize(struct bch_fs *, size_t);
 void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
 int bch2_fs_btree_write_buffer_init(struct bch_fs *);
 
index 99993ba77aeab01a63470111e84db4c2ebc5afad..9b9433de9c3686aa59255858e44411384219bafc 100644 (file)
@@ -2,43 +2,56 @@
 #ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
 #define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
 
+#include "darray.h"
 #include "journal_types.h"
 
 #define BTREE_WRITE_BUFERED_VAL_U64s_MAX       4
 #define BTREE_WRITE_BUFERED_U64s_MAX   (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX)
 
-struct btree_write_buffered_key {
-       u64                     journal_seq;
-       unsigned                journal_offset;
-       enum btree_id           btree;
-       __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
-};
-
-union btree_write_buffer_state {
+struct wb_key_ref {
+union {
        struct {
-               atomic64_t      counter;
-       };
-
-       struct {
-               u64             v;
-       };
-
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+               unsigned                        idx:24;
+               u8                              pos[sizeof(struct bpos)];
+               enum btree_id                   btree:8;
+#else
+               enum btree_id                   btree:8;
+               u8                              pos[sizeof(struct bpos)];
+               unsigned                        idx:24;
+#endif
+       } __packed;
        struct {
-               u64                     nr:23;
-               u64                     idx:1;
-               u64                     ref0:20;
-               u64                     ref1:20;
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+               u64 lo;
+               u64 mi;
+               u64 hi;
+#else
+               u64 hi;
+               u64 mi;
+               u64 lo;
+#endif
        };
 };
+};
 
-struct btree_write_buffer {
-       struct mutex                    flush_lock;
-       struct journal_entry_pin        journal_pin;
+struct btree_write_buffered_key {
+       enum btree_id                   btree:8;
+       u64                             journal_seq:56;
+       __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
+};
 
-       union btree_write_buffer_state  state;
-       size_t                          size;
+struct btree_write_buffer_keys {
+       DARRAY(struct btree_write_buffered_key) keys;
+       struct journal_entry_pin        pin;
+       struct mutex                    lock;
+};
 
-       struct btree_write_buffered_key *keys[2];
+struct btree_write_buffer {
+       DARRAY(struct wb_key_ref)       sorted;
+       struct btree_write_buffer_keys  inc;
+       struct btree_write_buffer_keys  flushing;
+       struct work_struct              flush_work;
 };
 
 #endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */
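
wb_key_ref packs a 24-bit index into the keys array together with the
btree ID and position so that two refs can be compared as a single 192-bit
integer through the lo/mi/hi words, given a suitable byte encoding of
pos[] (done by the sort setup elsewhere in the commit). A portable sketch
of the equivalent field-wise ordering; the memcmp() is valid only under
the assumption that pos[] is encoded so that byte order matches key order:

        /* order refs by (btree, position), with idx breaking ties so
         * equal positions stay in insertion (i.e. journal) order: */
        static inline int wb_key_ref_cmp_fields(const struct wb_key_ref *l,
                                                const struct wb_key_ref *r)
        {
                if (l->btree != r->btree)
                        return l->btree < r->btree ? -1 : 1;

                int c = memcmp(l->pos, r->pos, sizeof(l->pos));
                if (c)
                        return c;

                return l->idx < r->idx ? -1 : l->idx > r->idx;
        }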
index 58d8c6ffd955429d9f13207ddf04c1f687a68b2e..5bfa102a0438a33ab691fc0a73f8196b0f584cf2 100644 (file)
@@ -61,7 +61,7 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
                usage->reserved += usage->persistent_reserved[i];
 
        for (i = 0; i < c->replicas.nr; i++) {
-               struct bch_replicas_entry *e =
+               struct bch_replicas_entry_v1 *e =
                        cpu_replicas_entry(&c->replicas, i);
 
                fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
@@ -214,7 +214,7 @@ void bch2_fs_usage_to_text(struct printbuf *out,
        }
 
        for (i = 0; i < c->replicas.nr; i++) {
-               struct bch_replicas_entry *e =
+               struct bch_replicas_entry_v1 *e =
                        cpu_replicas_entry(&c->replicas, i);
 
                prt_printf(out, "\t");
@@ -345,7 +345,7 @@ static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
 
 static inline int __update_replicas(struct bch_fs *c,
                                    struct bch_fs_usage *fs_usage,
-                                   struct bch_replicas_entry *r,
+                                   struct bch_replicas_entry_v1 *r,
                                    s64 sectors)
 {
        int idx = bch2_replicas_entry_idx(c, r);
@@ -359,7 +359,7 @@ static inline int __update_replicas(struct bch_fs *c,
 }
 
 static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
-                       struct bch_replicas_entry *r, s64 sectors,
+                       struct bch_replicas_entry_v1 *r, s64 sectors,
                        unsigned journal_seq, bool gc)
 {
        struct bch_fs_usage *fs_usage;
@@ -453,9 +453,9 @@ int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more)
                                __replicas_deltas_realloc(trans, more, _gfp));
 }
 
-static inline int update_replicas_list(struct btree_trans *trans,
-                                       struct bch_replicas_entry *r,
-                                       s64 sectors)
+int bch2_update_replicas_list(struct btree_trans *trans,
+                        struct bch_replicas_entry_v1 *r,
+                        s64 sectors)
 {
        struct replicas_delta_list *d;
        struct replicas_delta *n;
@@ -481,14 +481,13 @@ static inline int update_replicas_list(struct btree_trans *trans,
        return 0;
 }
 
-static inline int update_cached_sectors_list(struct btree_trans *trans,
-                                             unsigned dev, s64 sectors)
+int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 sectors)
 {
        struct bch_replicas_padded r;
 
        bch2_replicas_entry_cached(&r.e, dev);
 
-       return update_replicas_list(trans, &r.e, sectors);
+       return bch2_update_replicas_list(trans, &r.e, sectors);
 }
 
 int bch2_mark_alloc(struct btree_trans *trans,
@@ -580,23 +579,6 @@ int bch2_mark_alloc(struct btree_trans *trans,
        }
        percpu_up_read(&c->mark_lock);
 
-       /*
-        * need to know if we're getting called from the invalidate path or
-        * not:
-        */
-
-       if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
-           old_a->cached_sectors) {
-               ret = update_cached_sectors(c, new, ca->dev_idx,
-                                           -((s64) old_a->cached_sectors),
-                                           journal_seq, gc);
-               if (ret) {
-                       bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors",
-                                           __func__);
-                       return ret;
-               }
-       }
-
        if (new_a->data_type == BCH_DATA_free &&
            (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk))
                closure_wake_up(&c->freelist_wait);
@@ -1470,7 +1452,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
 
        bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
        r.e.data_type = data_type;
-       ret = update_replicas_list(trans, &r.e, sectors);
+       ret = bch2_update_replicas_list(trans, &r.e, sectors);
 err:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
@@ -1513,8 +1495,8 @@ static int __trans_mark_extent(struct btree_trans *trans,
 
                if (p.ptr.cached) {
                        if (!stale) {
-                               ret = update_cached_sectors_list(trans, p.ptr.dev,
-                                                                disk_sectors);
+                               ret = bch2_update_cached_sectors_list(trans, p.ptr.dev,
+                                                                     disk_sectors);
                                if (ret)
                                        return ret;
                        }
@@ -1532,7 +1514,7 @@ static int __trans_mark_extent(struct btree_trans *trans,
        }
 
        if (r.e.nr_devs)
-               ret = update_replicas_list(trans, &r.e, dirty_sectors);
+               ret = bch2_update_replicas_list(trans, &r.e, dirty_sectors);
 
        return ret;
 }
@@ -1669,7 +1651,7 @@ int bch2_trans_mark_stripe(struct btree_trans *trans,
                s64 sectors = le16_to_cpu(new_s->sectors);
 
                bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new));
-               ret = update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
+               ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
                if (ret)
                        return ret;
        }
@@ -1678,7 +1660,7 @@ int bch2_trans_mark_stripe(struct btree_trans *trans,
                s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
 
                bch2_bkey_to_replicas(&r.e, old);
-               ret = update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
+               ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
                if (ret)
                        return ret;
        }
index 21f6cb356921f1e3b1f9df59fbdae7309f3931fa..5574b62e0553074a706678208efe8ad58fa264db 100644 (file)
@@ -315,6 +315,9 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
                            : c->usage[journal_seq & JOURNAL_BUF_MASK]);
 }
 
+int bch2_update_replicas_list(struct btree_trans *,
+                        struct bch_replicas_entry_v1 *, s64);
+int bch2_update_cached_sectors_list(struct btree_trans *, unsigned, s64);
 int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned);
 
 void bch2_fs_usage_initialize(struct bch_fs *);
index 4bb88aefed121f275582df94e3cea9dcdec7c58c..de3d82de9d290a07ff09d27d93bf5c98bfad7309 100644 (file)
@@ -444,7 +444,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
        dst_end = (void *) arg->replicas + replica_entries_bytes;
 
        for (i = 0; i < c->replicas.nr; i++) {
-               struct bch_replicas_entry *src_e =
+               struct bch_replicas_entry_v1 *src_e =
                        cpu_replicas_entry(&c->replicas, i);
 
                /* check that we have enough space for one replicas entry */
index f41889093a2c7eacaa1723667fc7bb2af5d0f3aa..c36bfc627a985d68213cceb9f8f0b1f4cb5c5e2b 100644 (file)
@@ -95,7 +95,6 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
                                unsigned long io_until,
                                unsigned long cpu_timeout)
 {
-       bool kthread = (current->flags & PF_KTHREAD) != 0;
        struct io_clock_wait wait;
 
        wait.io_timer.expire    = io_until;
@@ -111,7 +110,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
 
        while (1) {
                set_current_state(TASK_INTERRUPTIBLE);
-               if (kthread && kthread_should_stop())
+               if (kthread_should_stop())
                        break;
 
                if (wait.expired)
index aae07be1d911dee203ae440d77d415875a3d5256..4c900c8532688fd7be902585ce531a822577804f 100644 (file)
@@ -9,10 +9,12 @@ int __bch2_darray_resize(darray_void *d, size_t element_size, size_t new_size, g
        if (new_size > d->size) {
                new_size = roundup_pow_of_two(new_size);
 
-               void *data = krealloc_array(d->data, new_size, element_size, gfp);
+               void *data = kvmalloc_array(new_size, element_size, gfp);
                if (!data)
                        return -ENOMEM;
 
+               memcpy(data, d->data, d->size * element_size);
+               kvfree(d->data);
                d->data = data;
                d->size = new_size;
        }
index 43ea21ad9ea338931e0cb7a54d13bf9f50874b77..6157c53d5bf044516f2373c972275b5df215d26b 100644 (file)
@@ -92,7 +92,7 @@ do {                                                                  \
 
 #define darray_exit(_d)                                                        \
 do {                                                                   \
-       kfree((_d)->data);                                              \
+       kvfree((_d)->data);                                             \
        darray_init(_d);                                                \
 } while (0)
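
Two consequences of moving darrays from krealloc_array()/kfree() to
kvmalloc_array()/kvfree(): large backing stores can fall back to vmalloc,
which the write buffer now needs since it is resized to track the journal
buffer size, and growth is always allocate-copy-free, so pointers into a
darray never survive a resize. A short sketch of the resulting rule, where
d is some DARRAY(...) instance:

        elem = &darray_top(d);

        /* any growth may move the entire backing store: */
        ret = darray_make_room(&d, 1);
        if (ret)
                return ret;

        /* 'elem' may now dangle - recompute it after the resize: */
        elem = &darray_top(d);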
 
index c730f0933d29a9f63aec199385914fad6d083762..2a02bf00b67fba12d6024b42291f301f60aa4253 100644 (file)
@@ -1005,7 +1005,7 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
        unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
        int ret = 0;
 
-       ret = bch2_btree_write_buffer_flush(trans);
+       ret = bch2_btree_write_buffer_flush_sync(trans);
        if (ret)
                goto err;
 
index e2b02a82de321bb4612e79eb1034ed997e7f3f0f..976426da3a124aaeb7edd70747cd71e547558224 100644 (file)
@@ -5,7 +5,7 @@
 #include "bcachefs_format.h"
 
 struct bch_replicas_padded {
-       struct bch_replicas_entry       e;
+       struct bch_replicas_entry_v1    e;
        u8                              pad[BCH_BKEY_PTRS_MAX];
 };
 
index e5c3262cc3032d33e561b21a4a380568ae38f917..e42b45293bbd6b0590bc53454180f76a8038c8e8 100644 (file)
        x(BCH_ERR_btree_insert_fail,    btree_insert_need_mark_replicas)        \
        x(BCH_ERR_btree_insert_fail,    btree_insert_need_journal_res)          \
        x(BCH_ERR_btree_insert_fail,    btree_insert_need_journal_reclaim)      \
-       x(BCH_ERR_btree_insert_fail,    btree_insert_need_flush_buffer)         \
        x(0,                            backpointer_to_overwritten_btree_node)  \
        x(0,                            lock_fail_root_changed)                 \
        x(0,                            journal_reclaim_would_deadlock)         \
index 5b42a76c4796f90062bb86e2914d0301e52cf7d0..9a479e4de6b36a71d1bc4b3c1ef62d8787098179 100644 (file)
@@ -35,9 +35,9 @@ static void bio_check_or_release(struct bio *bio, bool check_dirty)
        }
 }
 
-static void bch2_dio_read_complete(struct closure *cl)
+static CLOSURE_CALLBACK(bch2_dio_read_complete)
 {
-       struct dio_read *dio = container_of(cl, struct dio_read, cl);
+       closure_type(dio, struct dio_read, cl);
 
        dio->req->ki_complete(dio->req, dio->ret);
        bio_check_or_release(&dio->rbio.bio, dio->should_dirty);
@@ -325,9 +325,9 @@ static noinline int bch2_dio_write_copy_iov(struct dio_write *dio)
        return 0;
 }
 
-static void bch2_dio_write_flush_done(struct closure *cl)
+static CLOSURE_CALLBACK(bch2_dio_write_flush_done)
 {
-       struct dio_write *dio = container_of(cl, struct dio_write, op.cl);
+       closure_type(dio, struct dio_write, op.cl);
        struct bch_fs *c = dio->op.c;
 
        closure_debug_destroy(cl);
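
These hunks are part of the tree-wide conversion to the new closure API:
callbacks now receive the closure's embedded work_struct instead of the
closure itself, which is why direct invocations elsewhere in this commit
pass &op->cl.work. A simplified sketch of what the two macros are assumed
to expand to (the real definitions live in include/linux/closure.h):

        #define CLOSURE_CALLBACK(name)                                  \
                void name(struct work_struct *ws)

        #define closure_type(name, _type, member)                       \
                struct closure *cl =                                    \
                        container_of(ws, struct closure, work);         \
                _type *name = container_of(cl, _type, member)

The hidden 'cl' binding is why calls like closure_debug_destroy(cl) keep
working inside converted callbacks without declaring cl themselves.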
index b0e8144ec5500cd37a2d35f71f399c1ebe424d53..31f40e587a4f3ca76d5aa73eb3de53ac5214f548 100644 (file)
@@ -861,7 +861,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
            abs(pos_src - pos_dst) < len)
                return -EINVAL;
 
-       bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
+       lock_two_nondirectories(&src->v, &dst->v);
+       bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
 
        inode_dio_wait(&src->v);
        inode_dio_wait(&dst->v);
@@ -914,7 +915,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
                ret = bch2_flush_inode(c, dst);
 err:
        bch2_quota_reservation_put(c, dst, &quota_res);
-       bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
+       bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
+       unlock_two_nondirectories(&src->v, &dst->v);
 
        return bch2_err_class(ret);
 }
index d7c1b05aa438568c4b7becceb00e20853ad87d3f..5a39bcb597a33d42826a16a98da394de3fe23660 100644 (file)
@@ -453,35 +453,33 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
 static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
                                struct bch_ioctl_subvolume arg)
 {
-       struct filename *name;
        struct path path;
        struct inode *dir;
-       struct dentry *victim;
        int ret = 0;
 
        if (arg.flags)
                return -EINVAL;
 
-       name = getname((const char __user *)(unsigned long)arg.dst_ptr);
-       victim = filename_path_locked(arg.dirfd, name, &path);
-       putname(name);
-       if (IS_ERR(victim))
-               return PTR_ERR(victim);
+       ret = user_path_at(arg.dirfd,
+                       (const char __user *)(unsigned long)arg.dst_ptr,
+                       LOOKUP_FOLLOW, &path);
+       if (ret)
+               return ret;
 
-       if (victim->d_sb->s_fs_info != c) {
+       if (path.dentry->d_sb->s_fs_info != c) {
                ret = -EXDEV;
                goto err;
        }
 
-       dir = d_inode(path.dentry);
-       ret = __bch2_unlink(dir, victim, true);
-       if (!ret) {
-               fsnotify_rmdir(dir, victim);
-               d_delete(victim);
-       }
-       inode_unlock(dir);
+       dir = path.dentry->d_parent->d_inode;
+
+       ret = __bch2_unlink(dir, path.dentry, true);
+       if (ret)
+               goto err;
+
+       fsnotify_rmdir(dir, path.dentry);
+       d_delete(path.dentry);
 err:
-       dput(victim);
        path_put(&path);
        return ret;
 }
index f76d403ccb766d479b280c8faae33cd98f001548..0d0a37cad2d42b25433cd15b43bb52100244e9ee 100644 (file)
@@ -1667,8 +1667,7 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
                if (!first)
                        seq_putc(seq, ':');
                first = false;
-               seq_puts(seq, "/dev/");
-               seq_puts(seq, ca->name);
+               seq_puts(seq, ca->disk_sb.sb_name);
        }
 
        return 0;
@@ -1901,7 +1900,7 @@ got_sb:
                sb->s_flags     |= SB_POSIXACL;
 #endif
 
-       sb->s_shrink.seeks = 0;
+       sb->s_shrink->seeks = 0;
 
        vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
        ret = PTR_ERR_OR_ZERO(vinode);
index 5edf1d4b9e6bdfa9a992bf895727228c79de4267..c3af7225ff693ec9c5af06502e22f3fbc8354fd5 100644 (file)
@@ -77,9 +77,8 @@ static inline int ptrcmp(void *l, void *r)
 }
 
 enum bch_inode_lock_op {
-       INODE_LOCK              = (1U << 0),
-       INODE_PAGECACHE_BLOCK   = (1U << 1),
-       INODE_UPDATE_LOCK       = (1U << 2),
+       INODE_PAGECACHE_BLOCK   = (1U << 0),
+       INODE_UPDATE_LOCK       = (1U << 1),
 };
 
 #define bch2_lock_inodes(_locks, ...)                                  \
@@ -91,8 +90,6 @@ do {                                                                  \
                                                                        \
        for (i = 1; i < ARRAY_SIZE(a); i++)                             \
                if (a[i] != a[i - 1]) {                                 \
-                       if ((_locks) & INODE_LOCK)                      \
-                               down_write_nested(&a[i]->v.i_rwsem, i); \
                        if ((_locks) & INODE_PAGECACHE_BLOCK)           \
                                bch2_pagecache_block_get(a[i]);\
                        if ((_locks) & INODE_UPDATE_LOCK)                       \
@@ -109,8 +106,6 @@ do {                                                                        \
                                                                        \
        for (i = 1; i < ARRAY_SIZE(a); i++)                             \
                if (a[i] != a[i - 1]) {                                 \
-                       if ((_locks) & INODE_LOCK)                      \
-                               up_write(&a[i]->v.i_rwsem);             \
                        if ((_locks) & INODE_PAGECACHE_BLOCK)           \
                                bch2_pagecache_block_put(a[i]);\
                        if ((_locks) & INODE_UPDATE_LOCK)                       \
index cc90279fdf4ee85c8d6db9f8108ce188c9fd19aa..b1c89a4821f5e176f4b7b96710f9fc8de0a236e5 100644 (file)
@@ -826,6 +826,18 @@ fsck_err:
        goto out;
 }
 
+static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
+       int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
+
+       bch2_trans_iter_exit(trans, &iter);
+       return ret;
+}
+
 static int check_inode(struct btree_trans *trans,
                       struct btree_iter *iter,
                       struct bkey_s_c k,
@@ -890,6 +902,17 @@ static int check_inode(struct btree_trans *trans,
                return 0;
        }
 
+       if (u.bi_flags & BCH_INODE_unlinked &&
+           c->sb.version >= bcachefs_metadata_version_deleted_inodes) {
+               ret = check_inode_deleted_list(trans, k.k->p);
+               if (ret < 0)
+                       return ret;
+
+               fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list,
+                           "inode %llu:%u unlinked, but not on deleted list",
+                           u.bi_inum, k.k->p.snapshot);
+               ret = 0;
+       }
+
        if (u.bi_flags & BCH_INODE_unlinked &&
            (!c->sb.clean ||
             fsck_err(c, inode_unlinked_but_clean,
index b9d6dbf3a54b26bacc211b3d4779fddc68460af6..a2c96f7c193f96bec34e041bfa0abbff5cf7620f 100644 (file)
@@ -1157,10 +1157,6 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
 again:
        need_another_pass = false;
 
-       ret = bch2_btree_write_buffer_flush_sync(trans);
-       if (ret)
-               goto err;
-
        /*
         * Weird transaction restart handling here because on successful delete,
         * bch2_inode_rm_snapshot() will return a nested transaction restart,
@@ -1191,8 +1187,12 @@ again:
        }
        bch2_trans_iter_exit(trans, &iter);
 
-       if (!ret && need_another_pass)
+       if (!ret && need_another_pass) {
+               ret = bch2_btree_write_buffer_flush_sync(trans);
+               if (ret)
+                       goto err;
                goto again;
+       }
 err:
        bch2_trans_put(trans);
 
index 75376f040e4b4e29d6abe94a9352d5413e0972a5..d6bd8f788d3a7e35113847a3ac02557ecbcc0770 100644 (file)
@@ -580,9 +580,9 @@ static inline void wp_update_state(struct write_point *wp, bool running)
        __wp_update_state(wp, state);
 }
 
-static void bch2_write_index(struct closure *cl)
+static CLOSURE_CALLBACK(bch2_write_index)
 {
-       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+       closure_type(op, struct bch_write_op, cl);
        struct write_point *wp = op->wp;
        struct workqueue_struct *wq = index_update_wq(op);
        unsigned long flags;
@@ -1208,9 +1208,9 @@ static void __bch2_nocow_write_done(struct bch_write_op *op)
                bch2_nocow_write_convert_unwritten(op);
 }
 
-static void bch2_nocow_write_done(struct closure *cl)
+static CLOSURE_CALLBACK(bch2_nocow_write_done)
 {
-       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+       closure_type(op, struct bch_write_op, cl);
 
        __bch2_nocow_write_done(op);
        bch2_write_done(cl);
@@ -1363,7 +1363,7 @@ err:
                op->insert_keys.top = op->insert_keys.keys;
        } else if (op->flags & BCH_WRITE_SYNC) {
                closure_sync(&op->cl);
-               bch2_nocow_write_done(&op->cl);
+               bch2_nocow_write_done(&op->cl.work);
        } else {
                /*
                 * XXX
@@ -1566,9 +1566,9 @@ err:
  * If op->discard is true, instead of inserting the data it invalidates the
  * region of the cache represented by op->bio and op->inode.
  */
-void bch2_write(struct closure *cl)
+CLOSURE_CALLBACK(bch2_write)
 {
-       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+       closure_type(op, struct bch_write_op, cl);
        struct bio *bio = &op->wbio.bio;
        struct bch_fs *c = op->c;
        unsigned data_len;
index 9323167229eeae8900b65733f022dae6047448b5..6c276a48f95dc2051f22dbfe00e4181319f1ee76 100644 (file)
@@ -90,8 +90,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
        op->devs_need_flush     = NULL;
 }
 
-void bch2_write(struct closure *);
-
+CLOSURE_CALLBACK(bch2_write);
 void bch2_write_point_do_index_updates(struct work_struct *);
 
 static inline struct bch_write_bio *wbio_init(struct bio *bio)
index 7d448136434bd8f2b63674298387e84c214753bb..86b148d9bea343ddc4c5bcf7bee26fdaf866209d 100644 (file)
@@ -10,6 +10,7 @@
 #include "bkey_methods.h"
 #include "btree_gc.h"
 #include "btree_update.h"
+#include "btree_write_buffer.h"
 #include "buckets.h"
 #include "error.h"
 #include "journal.h"
@@ -147,6 +148,7 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
                bch2_journal_reclaim_fast(j);
        if (write)
                closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
+       wake_up(&j->wait);
 }
 
 /*
@@ -184,6 +186,8 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val)
        /* Close out old buffer: */
        buf->data->u64s         = cpu_to_le32(old.cur_entry_offset);
 
+       trace_journal_entry_close(c, vstruct_bytes(buf->data));
+
        sectors = vstruct_blocks_plus(buf->data, c->block_bits,
                                      buf->u64s_reserved) << c->block_bits;
        BUG_ON(sectors > buf->sectors);
@@ -328,6 +332,7 @@ static int journal_entry_open(struct journal *j)
        buf->must_flush = false;
        buf->separate_flush = false;
        buf->flush_time = 0;
+       buf->need_flush_to_write_buffer = true;
 
        memset(buf->data, 0, sizeof(*buf->data));
        buf->data->seq  = cpu_to_le64(journal_cur_seq(j));
@@ -764,6 +769,75 @@ void bch2_journal_block(struct journal *j)
        journal_quiesce(j);
 }
 
+/*
+ * XXX: ideally this would not close the current journal entry, but we have
+ * no other way of avoiding a race with res_get() - checking j->blocked
+ * alone would race.
+ */
+static bool journal_reservations_stopped(struct journal *j)
+{
+       union journal_res_state s;
+
+       journal_entry_close(j);
+
+       s.v = atomic64_read_acquire(&j->reservations.counter);
+
+       return  s.buf0_count == 0 &&
+               s.buf1_count == 0 &&
+               s.buf2_count == 0 &&
+               s.buf3_count == 0;
+}
+
+void bch2_journal_block_reservations(struct journal *j)
+{
+       spin_lock(&j->lock);
+       j->blocked++;
+       spin_unlock(&j->lock);
+
+       wait_event(j->wait, journal_reservations_stopped(j));
+}
+
+static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq)
+{
+       spin_lock(&j->lock);
+       max_seq = min(max_seq, journal_cur_seq(j));
+
+       for (u64 seq = journal_last_unwritten_seq(j);
+            seq <= max_seq;
+            seq++) {
+               unsigned idx = seq & JOURNAL_BUF_MASK;
+               struct journal_buf *buf = j->buf + idx;
+               union journal_res_state s;
+
+               if (!buf->need_flush_to_write_buffer)
+                       continue;
+
+               if (seq == journal_cur_seq(j))
+                       __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+
+               s.v = atomic64_read_acquire(&j->reservations.counter);
+
+               if (journal_state_count(s, idx)) {
+                       spin_unlock(&j->lock);
+                       return ERR_PTR(-EAGAIN);
+               }
+
+               spin_unlock(&j->lock);
+               return buf;
+       }
+
+       spin_unlock(&j->lock);
+       return NULL;
+}
+
+struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq)
+{
+       struct journal_buf *ret;
+
+       wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN));
+       return ret;
+}
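
ERR_PTR(-EAGAIN) serves as the "not quiesced yet" sentinel here: the
unlocked helper returns it while some reader still holds a reservation on
the target buffer, and wait_event() simply re-evaluates the expression on
every wakeup until the helper yields either a buffer or NULL. A generic
model of the idiom, with hypothetical names:

        struct thing *t;

        /* valid pointer = ready, NULL = nothing to do,
         * ERR_PTR(-EAGAIN) = still busy, re-check on next wakeup: */
        wait_event(waitq, (t = try_get_thing()) != ERR_PTR(-EAGAIN));
        if (t)
                process(t);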
+
 /* allocate journal on a device: */
 
 static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
@@ -1215,6 +1289,7 @@ int bch2_fs_journal_init(struct journal *j)
        static struct lock_class_key res_key;
        unsigned i;
 
+       mutex_init(&j->buf_lock);
        spin_lock_init(&j->lock);
        spin_lock_init(&j->err_lock);
        init_waitqueue_head(&j->wait);
index c85d01cf49484984d08d20a2159f84b2506f96a1..b5185f97af0f12100300d6366e0228a62971b0dc 100644 (file)
@@ -259,7 +259,7 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u
 {
        union journal_res_state s;
 
-       s.v = atomic64_sub_return(((union journal_res_state) {
+       s.v = atomic64_sub_return_release(((union journal_res_state) {
                                    .buf0_count = idx == 0,
                                    .buf1_count = idx == 1,
                                    .buf2_count = idx == 2,
@@ -427,6 +427,8 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
 
 void bch2_journal_unblock(struct journal *);
 void bch2_journal_block(struct journal *);
+void bch2_journal_block_reservations(struct journal *);
+struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq);
 
 void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
 void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
index 109c1157eba1d0c18aa510b94ac134356324e8af..4ec5d5d38abca8d85d8bc023f3f5eea1dc751415 100644 (file)
@@ -4,6 +4,7 @@
 #include "alloc_foreground.h"
 #include "btree_io.h"
 #include "btree_update_interior.h"
+#include "btree_write_buffer.h"
 #include "buckets.h"
 #include "checksum.h"
 #include "disk_groups.h"
@@ -713,6 +714,22 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs
        journal_entry_btree_keys_to_text(out, c, entry);
 }
 
+static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
+                               struct jset *jset,
+                               struct jset_entry *entry,
+                               unsigned version, int big_endian,
+                               enum bkey_invalid_flags flags)
+{
+       return journal_entry_btree_keys_validate(c, jset, entry,
+                               version, big_endian, READ);
+}
+
+static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c,
+                                           struct jset_entry *entry)
+{
+       journal_entry_btree_keys_to_text(out, c, entry);
+}
+
 struct jset_entry_ops {
        int (*validate)(struct bch_fs *, struct jset *,
                        struct jset_entry *, unsigned, int,
@@ -1025,10 +1042,9 @@ next_block:
        return 0;
 }
 
-static void bch2_journal_read_device(struct closure *cl)
+static CLOSURE_CALLBACK(bch2_journal_read_device)
 {
-       struct journal_device *ja =
-               container_of(cl, struct journal_device, read);
+       closure_type(ja, struct journal_device, read);
        struct bch_dev *ca = container_of(ja, struct bch_dev, journal);
        struct bch_fs *c = ca->fs;
        struct journal_list *jlist =
@@ -1494,6 +1510,8 @@ done:
 
 static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
 {
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
        /* we aren't holding j->lock: */
        unsigned new_size = READ_ONCE(j->buf_size_want);
        void *new_buf;
@@ -1501,6 +1519,11 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
        if (buf->buf_size >= new_size)
                return;
 
+       size_t btree_write_buffer_size = new_size / 64;
+
+       if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
+               return;
+
        new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
        if (!new_buf)
                return;
@@ -1520,9 +1543,9 @@ static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
        return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK);
 }
 
-static void journal_write_done(struct closure *cl)
+static CLOSURE_CALLBACK(journal_write_done)
 {
-       struct journal *j = container_of(cl, struct journal, io);
+       closure_type(j, struct journal, io);
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_buf *w = journal_last_unwritten_buf(j);
        struct bch_replicas_padded replicas;
@@ -1590,6 +1613,7 @@ static void journal_write_done(struct closure *cl)
        } while ((v = atomic64_cmpxchg(&j->reservations.counter,
                                       old.v, new.v)) != old.v);
 
+       bch2_journal_reclaim_fast(j);
        bch2_journal_space_available(j);
 
        track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
@@ -1641,9 +1665,9 @@ static void journal_write_endio(struct bio *bio)
        percpu_ref_put(&ca->io_ref);
 }
 
-static void do_journal_write(struct closure *cl)
+static CLOSURE_CALLBACK(do_journal_write)
 {
-       struct journal *j = container_of(cl, struct journal, io);
+       closure_type(j, struct journal, io);
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        struct journal_buf *w = journal_last_unwritten_buf(j);
@@ -1693,9 +1717,11 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct jset_entry *start, *end, *i, *next, *prev = NULL;
        struct jset *jset = w->data;
+       struct journal_keys_to_wb wb = { NULL };
        unsigned sectors, bytes, u64s;
-       bool validate_before_checksum = false;
        unsigned long btree_roots_have = 0;
+       bool validate_before_checksum = false;
+       u64 seq = le64_to_cpu(jset->seq);
        int ret;
 
        /*
@@ -1723,9 +1749,28 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
                 * to c->btree_roots we have to get any missing btree roots and
                 * add them to this journal entry:
                 */
-               if (i->type == BCH_JSET_ENTRY_btree_root) {
+               switch (i->type) {
+               case BCH_JSET_ENTRY_btree_root:
                        bch2_journal_entry_to_btree_root(c, i);
                        __set_bit(i->btree_id, &btree_roots_have);
+                       break;
+               case BCH_JSET_ENTRY_write_buffer_keys:
+                       EBUG_ON(!w->need_flush_to_write_buffer);
+
+                       if (!wb.wb)
+                               bch2_journal_keys_to_write_buffer_start(c, &wb, seq);
+
+                       struct bkey_i *k;
+                       jset_entry_for_each_key(i, k) {
+                               ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
+                               if (ret) {
+                                       bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer");
+                                       bch2_journal_keys_to_write_buffer_end(c, &wb);
+                                       return ret;
+                               }
+                       }
+                       i->type = BCH_JSET_ENTRY_btree_keys;
+                       break;
                }
 
                /* Can we merge with previous entry? */
@@ -1748,6 +1793,10 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
                        memmove_u64s_down(prev, i, jset_u64s(u64s));
        }
 
+       if (wb.wb)
+               bch2_journal_keys_to_write_buffer_end(c, &wb);
+       w->need_flush_to_write_buffer = false;
+
        prev = prev ? vstruct_next(prev) : jset->start;
        jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
 
@@ -1755,8 +1804,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
 
        end     = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
 
-       bch2_journal_super_entries_add_common(c, &end,
-                               le64_to_cpu(jset->seq));
+       bch2_journal_super_entries_add_common(c, &end, seq);
        u64s    = (u64 *) end - (u64 *) start;
        BUG_ON(u64s > j->entry_u64s_reserved);
 
@@ -1779,7 +1827,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
        SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
 
        if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
-               j->last_empty_seq = le64_to_cpu(jset->seq);
+               j->last_empty_seq = seq;
 
        if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
                validate_before_checksum = true;
@@ -1838,7 +1886,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
            (!w->must_flush &&
             (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
             test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
-                    w->noflush = true;
+               w->noflush = true;
                SET_JSET_NO_FLUSH(w->data, true);
                w->data->last_seq       = 0;
                w->last_seq             = 0;
@@ -1853,9 +1901,9 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
        return 0;
 }
 
-void bch2_journal_write(struct closure *cl)
+CLOSURE_CALLBACK(bch2_journal_write)
 {
-       struct journal *j = container_of(cl, struct journal, io);
+       closure_type(j, struct journal, io);
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bch_dev *ca;
        struct journal_buf *w = journal_last_unwritten_buf(j);
@@ -1875,9 +1923,11 @@ void bch2_journal_write(struct closure *cl)
        if (ret)
                goto err;
 
+       mutex_lock(&j->buf_lock);
        journal_buf_realloc(j, w);
 
        ret = bch2_journal_write_prep(j, w);
+       mutex_unlock(&j->buf_lock);
        if (ret)
                goto err;
 
index a88d097b13f1294a5ca1f3c30ebba5282ef56da3..c035e7c108e19012e6e4e1f708136dec27b5387c 100644 (file)
@@ -60,6 +60,6 @@ void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *,
 
 int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
 
-void bch2_journal_write(struct closure *);
+CLOSURE_CALLBACK(bch2_journal_write);
 
 #endif /* _BCACHEFS_JOURNAL_IO_H */
index 8fa05bedb7dff8f36084fafc37106fe9915579a0..2aa4c0c6bbba480da77dd41fe59843f0d4f1c0f9 100644 (file)
@@ -3,6 +3,7 @@
 #include "bcachefs.h"
 #include "btree_key_cache.h"
 #include "btree_update.h"
+#include "btree_write_buffer.h"
 #include "buckets.h"
 #include "errcode.h"
 #include "error.h"
@@ -50,20 +51,23 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
        return available;
 }
 
-static inline void journal_set_watermark(struct journal *j)
+void bch2_journal_set_watermark(struct journal *j)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        bool low_on_space = j->space[journal_space_clean].total * 4 <=
                j->space[journal_space_total].total;
        bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4;
-       unsigned watermark = low_on_space || low_on_pin
+       bool low_on_wb = bch2_btree_write_buffer_must_wait(c);
+       unsigned watermark = low_on_space || low_on_pin || low_on_wb
                ? BCH_WATERMARK_reclaim
                : BCH_WATERMARK_stripe;
 
        if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space],
                               &j->low_on_space_start, low_on_space) ||
            track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin],
-                              &j->low_on_pin_start, low_on_pin))
+                              &j->low_on_pin_start, low_on_pin) ||
+           track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full],
+                              &j->write_buffer_full_start, low_on_wb))
                trace_and_count(c, journal_full, c);
 
        swap(watermark, j->watermark);
@@ -230,7 +234,7 @@ void bch2_journal_space_available(struct journal *j)
        else
                clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
 
-       journal_set_watermark(j);
+       bch2_journal_set_watermark(j);
 out:
        j->cur_entry_sectors    = !ret ? j->space[journal_space_discarded].next_entry : 0;
        j->cur_entry_error      = ret;
@@ -303,6 +307,7 @@ void bch2_journal_reclaim_fast(struct journal *j)
         * all btree nodes got written out
         */
        while (!fifo_empty(&j->pin) &&
+              j->pin.front <= j->seq_ondisk &&
               !atomic_read(&fifo_peek_front(&j->pin).count)) {
                j->pin.front++;
                popped = true;
@@ -635,7 +640,6 @@ static u64 journal_seq_to_flush(struct journal *j)
 static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
-       bool kthread = (current->flags & PF_KTHREAD) != 0;
        u64 seq_to_flush;
        size_t min_nr, min_key_cache, nr_flushed;
        unsigned flags;
@@ -651,7 +655,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked)
        flags = memalloc_noreclaim_save();
 
        do {
-               if (kthread && kthread_should_stop())
+               if (kthread_should_stop())
                        break;
 
                if (bch2_journal_error(j)) {
index 7b15d682a0f51d28c47f7d881edb1b08ca24d10c..ec84c334528177e8c865ebdbf9b9d7e265270718 100644 (file)
@@ -16,6 +16,7 @@ static inline void journal_reclaim_kick(struct journal *j)
 unsigned bch2_journal_dev_buckets_available(struct journal *,
                                            struct journal_device *,
                                            enum journal_space_from);
+void bch2_journal_set_watermark(struct journal *);
 void bch2_journal_space_available(struct journal *);
 
 static inline bool journal_pin_active(struct journal_entry_pin *pin)
index 2427cce64fed93388214c3de8b6446875eaf01b6..85c543af60e5143a3a8546650906b09242026ee8 100644 (file)
@@ -36,6 +36,7 @@ struct journal_buf {
        bool                    noflush;        /* write has already been kicked off, and was noflush */
        bool                    must_flush;     /* something wants a flush */
        bool                    separate_flush;
+       bool                    need_flush_to_write_buffer;
 };
 
 /*
@@ -181,6 +182,12 @@ struct journal {
         */
        darray_u64              early_journal_entries;
 
+       /*
+        * Protects journal_buf->data when accessed without a journal
+        * reservation: synchronizes the btree write buffer code with the
+        * journal write path:
+        */
+       struct mutex            buf_lock;
        /*
         * Two journal entries -- one is currently open for new entries, the
         * other is possibly being written out.
@@ -271,6 +278,7 @@ struct journal {
        u64                     low_on_space_start;
        u64                     low_on_pin_start;
        u64                     max_in_flight_start;
+       u64                     write_buffer_full_start;
 
        struct bch2_time_stats  *flush_write_time;
        struct bch2_time_stats  *noflush_write_time;
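
buf_lock pairs with the mutex_lock()/mutex_unlock() added around journal_buf_realloc() and bch2_journal_write_prep() in the write path above; the btree write buffer side is expected to take the same mutex before reading keys back out of a journal buffer it holds no reservation on. A hedged sketch of that consumer side (the function name is hypothetical):

	static void flush_buf_to_write_buffer(struct journal *j, struct journal_buf *w)
	{
		mutex_lock(&j->buf_lock);
		/* safe to walk w->data here: the write path cannot realloc or
		 * checksum the buffer out from under us while we hold buf_lock */
		mutex_unlock(&j->buf_lock);
	}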
index 4f7d1758d8a97588a2e73a7397ea9880a356b414..cf36f2b0738f40284d099b9e654d1f78463eae71 100644 (file)
 #include <linux/ioprio.h>
 #include <linux/kthread.h>
 
+const char * const bch2_data_ops_strs[] = {
+#define x(t, n, ...) [n] = #t,
+       BCH_DATA_OPS()
+#undef x
+       NULL
+};
+
 static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k)
 {
        if (trace_move_extent_enabled()) {
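
The bch2_data_ops_strs[] table above is generated with the usual x-macro pattern: each list entry supplies a name and an index, and [n] = #t stringifies the name at its enum slot. A hedged reconstruction of the shape of BCH_DATA_OPS() (the real list lives in bcachefs_ioctl.h; the entries here are inferred from the ops used later in this diff):

	#define BCH_DATA_OPS()			\
		x(scrub,		0)	\
		x(rereplicate,		1)	\
		x(migrate,		2)	\
		x(rewrite_old_nodes,	3)	\
		x(drop_extra_replicas,	4)

	enum bch_data_ops {
	#define x(t, n, ...) BCH_DATA_OP_##t = n,
		BCH_DATA_OPS()
	#undef x
		BCH_DATA_OP_NR
	};

So bch2_data_ops_strs[BCH_DATA_OP_rereplicate] yields "rereplicate", which bch2_data_job() below uses to name its move stats.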
@@ -163,12 +170,17 @@ void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
                atomic_read(&ctxt->write_sectors) != sectors_pending);
 }
 
+static void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
+{
+       move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
+       closure_sync(&ctxt->cl);
+}
+
 void bch2_moving_ctxt_exit(struct moving_context *ctxt)
 {
        struct bch_fs *c = ctxt->trans->c;
 
-       move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
-       closure_sync(&ctxt->cl);
+       bch2_moving_ctxt_flush_all(ctxt);
 
        EBUG_ON(atomic_read(&ctxt->write_sectors));
        EBUG_ON(atomic_read(&ctxt->write_ios));
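
bch2_moving_ctxt_flush_all() factors out the "drain all reads, then sync the closure" sequence so it can run before every long sleep, not just at context exit. A hedged usage sketch mirroring the copygc wait added below:

	static void wait_for_copygc(struct moving_context *ctxt, struct bch_fs *c)
	{
		/* never park with reads in flight; they pin buckets copygc may want */
		bch2_moving_ctxt_flush_all(ctxt);

		wait_event_killable(c->copygc_running_wq,
				    !c->copygc_running ||
				    kthread_should_stop());
	}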
@@ -216,7 +228,7 @@ void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
        trace_move_data(c, stats);
 }
 
-void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
+void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
 {
        memset(stats, 0, sizeof(*stats));
        stats->data_type = BCH_DATA_user;
@@ -484,8 +496,8 @@ int bch2_move_ratelimit(struct moving_context *ctxt)
        struct bch_fs *c = ctxt->trans->c;
        u64 delay;
 
-       if (ctxt->wait_on_copygc && !c->copygc_running) {
-               bch2_trans_unlock_long(ctxt->trans);
+       if (ctxt->wait_on_copygc && c->copygc_running) {
+               bch2_moving_ctxt_flush_all(ctxt);
                wait_event_killable(c->copygc_running_wq,
                                    !c->copygc_running ||
                                    kthread_should_stop());
@@ -503,7 +515,7 @@ int bch2_move_ratelimit(struct moving_context *ctxt)
                        set_current_state(TASK_INTERRUPTIBLE);
                }
 
-               if ((current->flags & PF_KTHREAD) && kthread_should_stop()) {
+               if (kthread_should_stop()) {
                        __set_current_state(TASK_RUNNING);
                        return 1;
                }
@@ -512,7 +524,7 @@ int bch2_move_ratelimit(struct moving_context *ctxt)
                        schedule_timeout(delay);
 
                if (unlikely(freezing(current))) {
-                       move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
+                       bch2_moving_ctxt_flush_all(ctxt);
                        try_to_freeze();
                }
        } while (delay);
@@ -721,11 +733,10 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
        bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
        fragmentation = a->fragmentation_lru;
 
-       ret = bch2_btree_write_buffer_flush(trans);
-       if (ret) {
-               bch_err_msg(c, ret, "flushing btree write buffer");
+       ret = bch2_btree_write_buffer_tryflush(trans);
+       bch_err_msg(c, ret, "flushing btree write buffer");
+       if (ret)
                goto err;
-       }
 
        while (!(ret = bch2_move_ratelimit(ctxt))) {
                bch2_trans_begin(trans);
@@ -856,18 +867,17 @@ typedef bool (*move_btree_pred)(struct bch_fs *, void *,
                                struct data_update_opts *);
 
 static int bch2_move_btree(struct bch_fs *c,
-                          enum btree_id start_btree_id, struct bpos start_pos,
-                          enum btree_id end_btree_id,   struct bpos end_pos,
+                          struct bbpos start,
+                          struct bbpos end,
                           move_btree_pred pred, void *arg,
                           struct bch_move_stats *stats)
 {
-       bool kthread = (current->flags & PF_KTHREAD) != 0;
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
        struct moving_context ctxt;
        struct btree_trans *trans;
        struct btree_iter iter;
        struct btree *b;
-       enum btree_id id;
+       enum btree_id btree;
        struct data_update_opts data_opts;
        int ret = 0;
 
@@ -878,26 +888,26 @@ static int bch2_move_btree(struct bch_fs *c,
 
        stats->data_type = BCH_DATA_btree;
 
-       for (id = start_btree_id;
-            id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
-            id++) {
-               stats->pos = BBPOS(id, POS_MIN);
+       for (btree = start.btree;
+            btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
+            btree++) {
+               stats->pos = BBPOS(btree, POS_MIN);
 
-               if (!bch2_btree_id_root(c, id)->b)
+               if (!bch2_btree_id_root(c, btree)->b)
                        continue;
 
-               bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
+               bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
                                          BTREE_ITER_PREFETCH);
 retry:
                ret = 0;
                while (bch2_trans_begin(trans),
                       (b = bch2_btree_iter_peek_node(&iter)) &&
                       !(ret = PTR_ERR_OR_ZERO(b))) {
-                       if (kthread && kthread_should_stop())
+                       if (kthread_should_stop())
                                break;
 
-                       if ((cmp_int(id, end_btree_id) ?:
-                            bpos_cmp(b->key.k.p, end_pos)) > 0)
+                       if ((cmp_int(btree, end.btree) ?:
+                            bpos_cmp(b->key.k.p, end.pos)) > 0)
                                break;
 
                        stats->pos = BBPOS(iter.btree_id, iter.pos);
@@ -918,7 +928,7 @@ next:
 
                bch2_trans_iter_exit(trans, &iter);
 
-               if (kthread && kthread_should_stop())
+               if (kthread_should_stop())
                        break;
        }
 
@@ -1034,8 +1044,8 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
        int ret;
 
        ret = bch2_move_btree(c,
-                             0,                POS_MIN,
-                             BTREE_ID_NR,      SPOS_MAX,
+                             BBPOS_MIN,
+                             BBPOS_MAX,
                              rewrite_old_nodes_pred, c, stats);
        if (!ret) {
                mutex_lock(&c->sb_lock);
@@ -1050,71 +1060,101 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
        return ret;
 }
 
+static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
+                            struct bkey_s_c k,
+                            struct bch_io_opts *io_opts,
+                            struct data_update_opts *data_opts)
+{
+       unsigned durability = bch2_bkey_durability(c, k);
+       unsigned replicas = bkey_is_btree_ptr(k.k)
+               ? c->opts.metadata_replicas
+               : io_opts->data_replicas;
+       const union bch_extent_entry *entry;
+       struct extent_ptr_decoded p;
+       unsigned i = 0;
+
+       bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
+               unsigned d = bch2_extent_ptr_durability(c, &p);
+
+               if (d && durability - d >= replicas) {
+                       data_opts->kill_ptrs |= BIT(i);
+                       durability -= d;
+               }
+
+               i++;
+       }
+
+       return data_opts->kill_ptrs != 0;
+}
+
+static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
+                                  struct btree *b,
+                                  struct bch_io_opts *io_opts,
+                                  struct data_update_opts *data_opts)
+{
+       return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
+}
+
 int bch2_data_job(struct bch_fs *c,
                  struct bch_move_stats *stats,
                  struct bch_ioctl_data op)
 {
+       struct bbpos start      = BBPOS(op.start_btree, op.start_pos);
+       struct bbpos end        = BBPOS(op.end_btree, op.end_pos);
        int ret = 0;
 
+       if (op.op >= BCH_DATA_OP_NR)
+               return -EINVAL;
+
+       bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);
+
        switch (op.op) {
-       case BCH_DATA_OP_REREPLICATE:
-               bch2_move_stats_init(stats, "rereplicate");
+       case BCH_DATA_OP_rereplicate:
                stats->data_type = BCH_DATA_journal;
                ret = bch2_journal_flush_device_pins(&c->journal, -1);
-
-               ret = bch2_move_btree(c,
-                                     op.start_btree,   op.start_pos,
-                                     op.end_btree,     op.end_pos,
+               ret = bch2_move_btree(c, start, end,
                                      rereplicate_btree_pred, c, stats) ?: ret;
-               ret = bch2_replicas_gc2(c) ?: ret;
-
-               ret = bch2_move_data(c,
-                                    (struct bbpos) { op.start_btree,   op.start_pos },
-                                    (struct bbpos) { op.end_btree,     op.end_pos },
+               ret = bch2_move_data(c, start, end,
                                     NULL,
                                     stats,
                                     writepoint_hashed((unsigned long) current),
                                     true,
                                     rereplicate_pred, c) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;
-
-               bch2_move_stats_exit(stats, c);
                break;
-       case BCH_DATA_OP_MIGRATE:
+       case BCH_DATA_OP_migrate:
                if (op.migrate.dev >= c->sb.nr_devices)
                        return -EINVAL;
 
-               bch2_move_stats_init(stats, "migrate");
                stats->data_type = BCH_DATA_journal;
                ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
-
-               ret = bch2_move_btree(c,
-                                     op.start_btree,   op.start_pos,
-                                     op.end_btree,     op.end_pos,
+               ret = bch2_move_btree(c, start, end,
                                      migrate_btree_pred, &op, stats) ?: ret;
-               ret = bch2_replicas_gc2(c) ?: ret;
-
-               ret = bch2_move_data(c,
-                                    (struct bbpos) { op.start_btree,   op.start_pos },
-                                    (struct bbpos) { op.end_btree,     op.end_pos },
+               ret = bch2_move_data(c, start, end,
                                     NULL,
                                     stats,
                                     writepoint_hashed((unsigned long) current),
                                     true,
                                     migrate_pred, &op) ?: ret;
                ret = bch2_replicas_gc2(c) ?: ret;
-
-               bch2_move_stats_exit(stats, c);
                break;
-       case BCH_DATA_OP_REWRITE_OLD_NODES:
-               bch2_move_stats_init(stats, "rewrite_old_nodes");
+       case BCH_DATA_OP_rewrite_old_nodes:
                ret = bch2_scan_old_btree_nodes(c, stats);
-               bch2_move_stats_exit(stats, c);
+               break;
+       case BCH_DATA_OP_drop_extra_replicas:
+               ret = bch2_move_btree(c, start, end,
+                               drop_extra_replicas_btree_pred, c, stats) ?: ret;
+               ret = bch2_move_data(c, start, end, NULL, stats,
+                               writepoint_hashed((unsigned long) current),
+                               true,
+                               drop_extra_replicas_pred, c) ?: ret;
+               ret = bch2_replicas_gc2(c) ?: ret;
                break;
        default:
                ret = -EINVAL;
        }
 
+       bch2_move_stats_exit(stats, c);
        return ret;
 }
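
The new drop_extra_replicas_pred() walks an extent's pointers and marks one for removal whenever the durability remaining after dropping it still meets the replicas target. A toy re-run of that loop with made-up durabilities (the real code operates on bkeys, not arrays):

	#include <stdio.h>

	int main(void)
	{
		unsigned durability_per_ptr[] = { 1, 1, 1 };	/* three single-device ptrs */
		unsigned durability = 3, replicas = 2, kill_ptrs = 0;

		for (unsigned i = 0; i < 3; i++) {
			unsigned d = durability_per_ptr[i];

			if (d && durability - d >= replicas) {
				kill_ptrs |= 1U << i;
				durability -= d;
			}
		}

		printf("kill_ptrs = %x\n", kill_ptrs);	/* 1: only ptr 0 is dropped */
		return 0;
	}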
 
index 07cf9d42643b4fe537b6db513285efc1f65bd366..cedde6ee99d0c88ceb86525e4e8fb53e643385ef 100644 (file)
@@ -56,6 +56,8 @@ do {                                                                  \
 typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c,
                             struct bch_io_opts *, struct data_update_opts *);
 
+extern const char * const bch2_data_ops_strs[];
+
 void bch2_moving_ctxt_exit(struct moving_context *);
 void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
                           struct bch_ratelimit *, struct bch_move_stats *,
@@ -130,7 +132,7 @@ int bch2_data_job(struct bch_fs *,
 
 void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *);
 void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *);
-void bch2_move_stats_init(struct bch_move_stats *, char *);
+void bch2_move_stats_init(struct bch_move_stats *, const char *);
 
 void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
 
index 0a0576326c5b2d433fcd4aace513379972f57152..e884324bd2fa3e742e17c46522ba47b921b8e21a 100644 (file)
@@ -153,8 +153,11 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
 
        move_buckets_wait(ctxt, buckets_in_flight, false);
 
-       ret = bch2_btree_write_buffer_flush(trans);
-       if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()",
+       ret = bch2_btree_write_buffer_tryflush(trans);
+       if (bch2_err_matches(ret, EROFS))
+               return ret;
+
+       if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_tryflush()",
                                 __func__, bch2_err_str(ret)))
                return ret;
 
index 8526f177450a56900c907a2e4cba3950fe5f9e00..b7f9990c58485d6416d45a138f0223f3d08fef17 100644 (file)
@@ -233,11 +233,6 @@ enum fsck_err_opts {
          OPT_BOOL(),                                                   \
          BCH2_NO_SB_OPT,               true,                           \
          NULL,         "Stash pointer to in memory btree node in btree ptr")\
-       x(btree_write_buffer_size, u32,                                 \
-         OPT_FS|OPT_MOUNT,                                             \
-         OPT_UINT(16, (1U << 20) - 1),                                 \
-         BCH2_NO_SB_OPT,               1U << 13,                       \
-         NULL,         "Number of btree write buffer entries")         \
        x(gc_reserve_percent,           u8,                             \
          OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                      \
          OPT_UINT(5, 21),                                              \
index 83fc121ff3c44d3012a1d5770536549cbc479971..a94f2b5ed055f747a27b1cca56ed4ac233e80345 100644 (file)
@@ -159,6 +159,8 @@ static int bch2_journal_replay(struct bch_fs *c)
                        goto err;
        }
 
+       BUG_ON(!atomic_read(&keys->ref));
+
        /*
         * First, attempt to replay keys in sorted order. This is more
         * efficient - better locality of btree access -  but some might fail if
@@ -218,14 +220,15 @@ static int bch2_journal_replay(struct bch_fs *c)
        bch2_trans_put(trans);
        trans = NULL;
 
+       if (!c->opts.keep_journal)
+               bch2_journal_keys_put_initial(c);
+
        replay_now_at(j, j->replay_journal_seq_end);
        j->replay_journal_seq = 0;
 
        bch2_journal_set_replay_done(j);
-       bch2_journal_flush_all_pins(j);
-       ret = bch2_journal_error(j);
 
-       if (keys->nr && !ret)
+       if (keys->nr)
                bch2_journal_log_msg(c, "journal replay finished");
 err:
        if (trans)
@@ -935,8 +938,12 @@ use_clean:
 
                bch2_move_stats_init(&stats, "recovery");
 
-               bch_info(c, "scanning for old btree nodes");
-               ret =   bch2_fs_read_write(c) ?:
+               struct printbuf buf = PRINTBUF;
+               bch2_version_to_text(&buf, c->sb.version_min);
+               bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf);
+               printbuf_exit(&buf);
+
+               ret =   bch2_fs_read_write_early(c) ?:
                        bch2_scan_old_btree_nodes(c, &stats);
                if (ret)
                        goto err;
@@ -953,10 +960,8 @@ out:
        bch2_flush_fsck_errs(c);
 
        if (!c->opts.keep_journal &&
-           test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) {
-               bch2_journal_keys_free(&c->journal_keys);
-               bch2_journal_entries_free(c);
-       }
+           test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+               bch2_journal_keys_put_initial(c);
        kfree(clean);
 
        if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) {
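
Several hunks here replace bch2_journal_keys_free()/bch2_journal_entries_free() with bch2_journal_keys_put_initial(): the journal keys are now refcounted, the filesystem starts out holding one "initial" reference (set in bch2_fs_alloc() further down in this diff), and that reference is dropped exactly once. A sketch of what the helper presumably looks like, inferred from the initial_ref_held flag:

	void bch2_journal_keys_put_initial(struct bch_fs *c)
	{
		if (c->journal_keys.initial_ref_held) {
			c->journal_keys.initial_ref_held = false;
			bch2_journal_keys_put(c);	/* frees the keys on the last ref */
		}
	}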
index 1c3ae13bfced1d8ce9eeee118cb6e9fe1552e7a5..820f99898a16e8f89f99e2923201dcbf6c87c958 100644 (file)
@@ -11,7 +11,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
 
 /* Replicas tracking - in memory: */
 
-static void verify_replicas_entry(struct bch_replicas_entry *e)
+static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
 {
 #ifdef CONFIG_BCACHEFS_DEBUG
        unsigned i;
@@ -26,7 +26,7 @@ static void verify_replicas_entry(struct bch_replicas_entry *e)
 #endif
 }
 
-void bch2_replicas_entry_sort(struct bch_replicas_entry *e)
+void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
 {
        bubble_sort(e->devs, e->nr_devs, u8_cmp);
 }
@@ -53,7 +53,7 @@ static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
 }
 
 void bch2_replicas_entry_to_text(struct printbuf *out,
-                                struct bch_replicas_entry *e)
+                                struct bch_replicas_entry_v1 *e)
 {
        unsigned i;
 
@@ -71,7 +71,7 @@ void bch2_replicas_entry_to_text(struct printbuf *out,
 void bch2_cpu_replicas_to_text(struct printbuf *out,
                               struct bch_replicas_cpu *r)
 {
-       struct bch_replicas_entry *e;
+       struct bch_replicas_entry_v1 *e;
        bool first = true;
 
        for_each_cpu_replicas_entry(r, e) {
@@ -84,7 +84,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out,
 }
 
 static void extent_to_replicas(struct bkey_s_c k,
-                              struct bch_replicas_entry *r)
+                              struct bch_replicas_entry_v1 *r)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
@@ -104,7 +104,7 @@ static void extent_to_replicas(struct bkey_s_c k,
 }
 
 static void stripe_to_replicas(struct bkey_s_c k,
-                              struct bch_replicas_entry *r)
+                              struct bch_replicas_entry_v1 *r)
 {
        struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
        const struct bch_extent_ptr *ptr;
@@ -117,7 +117,7 @@ static void stripe_to_replicas(struct bkey_s_c k,
                r->devs[r->nr_devs++] = ptr->dev;
 }
 
-void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
+void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
                           struct bkey_s_c k)
 {
        e->nr_devs = 0;
@@ -142,7 +142,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
        bch2_replicas_entry_sort(e);
 }
 
-void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
+void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
                              enum bch_data_type data_type,
                              struct bch_devs_list devs)
 {
@@ -164,7 +164,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
 
 static struct bch_replicas_cpu
 cpu_replicas_add_entry(struct bch_replicas_cpu *old,
-                      struct bch_replicas_entry *new_entry)
+                      struct bch_replicas_entry_v1 *new_entry)
 {
        unsigned i;
        struct bch_replicas_cpu new = {
@@ -194,7 +194,7 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old,
 }
 
 static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
-                                      struct bch_replicas_entry *search)
+                                      struct bch_replicas_entry_v1 *search)
 {
        int idx, entry_size = replicas_entry_bytes(search);
 
@@ -212,7 +212,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
 }
 
 int bch2_replicas_entry_idx(struct bch_fs *c,
-                           struct bch_replicas_entry *search)
+                           struct bch_replicas_entry_v1 *search)
 {
        bch2_replicas_entry_sort(search);
 
@@ -220,13 +220,13 @@ int bch2_replicas_entry_idx(struct bch_fs *c,
 }
 
 static bool __replicas_has_entry(struct bch_replicas_cpu *r,
-                                struct bch_replicas_entry *search)
+                                struct bch_replicas_entry_v1 *search)
 {
        return __replicas_entry_idx(r, search) >= 0;
 }
 
 bool bch2_replicas_marked(struct bch_fs *c,
-                         struct bch_replicas_entry *search)
+                         struct bch_replicas_entry_v1 *search)
 {
        bool marked;
 
@@ -343,7 +343,7 @@ err:
 static unsigned reserve_journal_replicas(struct bch_fs *c,
                                     struct bch_replicas_cpu *r)
 {
-       struct bch_replicas_entry *e;
+       struct bch_replicas_entry_v1 *e;
        unsigned journal_res_u64s = 0;
 
        /* nr_inodes: */
@@ -368,7 +368,7 @@ static unsigned reserve_journal_replicas(struct bch_fs *c,
 
 noinline
 static int bch2_mark_replicas_slowpath(struct bch_fs *c,
-                               struct bch_replicas_entry *new_entry)
+                               struct bch_replicas_entry_v1 *new_entry)
 {
        struct bch_replicas_cpu new_r, new_gc;
        int ret = 0;
@@ -433,7 +433,7 @@ err:
        goto out;
 }
 
-int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
+int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
 {
        return likely(bch2_replicas_marked(c, r))
                ? 0 : bch2_mark_replicas_slowpath(c, r);
@@ -484,7 +484,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
 
 int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
 {
-       struct bch_replicas_entry *e;
+       struct bch_replicas_entry_v1 *e;
        unsigned i = 0;
 
        lockdep_assert_held(&c->replicas_gc_lock);
@@ -559,7 +559,7 @@ retry:
        }
 
        for (i = 0; i < c->replicas.nr; i++) {
-               struct bch_replicas_entry *e =
+               struct bch_replicas_entry_v1 *e =
                        cpu_replicas_entry(&c->replicas, i);
 
                if (e->data_type == BCH_DATA_journal ||
@@ -590,7 +590,7 @@ retry:
 }
 
 int bch2_replicas_set_usage(struct bch_fs *c,
-                           struct bch_replicas_entry *r,
+                           struct bch_replicas_entry_v1 *r,
                            u64 sectors)
 {
        int ret, idx = bch2_replicas_entry_idx(c, r);
@@ -623,7 +623,7 @@ static int
 __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
                                   struct bch_replicas_cpu *cpu_r)
 {
-       struct bch_replicas_entry *e, *dst;
+       struct bch_replicas_entry_v1 *e, *dst;
        unsigned nr = 0, entry_size = 0, idx = 0;
 
        for_each_replicas_entry(sb_r, e) {
@@ -661,7 +661,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
                nr++;
        }
 
-       entry_size += sizeof(struct bch_replicas_entry) -
+       entry_size += sizeof(struct bch_replicas_entry_v1) -
                sizeof(struct bch_replicas_entry_v0);
 
        cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
@@ -672,7 +672,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
        cpu_r->entry_size       = entry_size;
 
        for_each_replicas_entry(sb_r, e) {
-               struct bch_replicas_entry *dst =
+               struct bch_replicas_entry_v1 *dst =
                        cpu_replicas_entry(cpu_r, idx++);
 
                dst->data_type  = e->data_type;
@@ -716,7 +716,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
 {
        struct bch_sb_field_replicas_v0 *sb_r;
        struct bch_replicas_entry_v0 *dst;
-       struct bch_replicas_entry *src;
+       struct bch_replicas_entry_v1 *src;
        size_t bytes;
 
        bytes = sizeof(struct bch_sb_field_replicas);
@@ -754,7 +754,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
                                            struct bch_replicas_cpu *r)
 {
        struct bch_sb_field_replicas *sb_r;
-       struct bch_replicas_entry *dst, *src;
+       struct bch_replicas_entry_v1 *dst, *src;
        bool need_v1 = false;
        size_t bytes;
 
@@ -805,7 +805,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
                      memcmp, NULL);
 
        for (i = 0; i < cpu_r->nr; i++) {
-               struct bch_replicas_entry *e =
+               struct bch_replicas_entry_v1 *e =
                        cpu_replicas_entry(cpu_r, i);
 
                if (e->data_type >= BCH_DATA_NR) {
@@ -835,7 +835,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
                        }
 
                if (i + 1 < cpu_r->nr) {
-                       struct bch_replicas_entry *n =
+                       struct bch_replicas_entry_v1 *n =
                                cpu_replicas_entry(cpu_r, i + 1);
 
                        BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);
@@ -872,7 +872,7 @@ static void bch2_sb_replicas_to_text(struct printbuf *out,
                                     struct bch_sb_field *f)
 {
        struct bch_sb_field_replicas *r = field_to_type(f, replicas);
-       struct bch_replicas_entry *e;
+       struct bch_replicas_entry_v1 *e;
        bool first = true;
 
        for_each_replicas_entry(r, e) {
@@ -934,7 +934,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
 bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
                           unsigned flags, bool print)
 {
-       struct bch_replicas_entry *e;
+       struct bch_replicas_entry_v1 *e;
        bool ret = true;
 
        percpu_down_read(&c->mark_lock);
@@ -994,7 +994,7 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
        replicas_v0 = bch2_sb_field_get(sb, replicas_v0);
 
        if (replicas) {
-               struct bch_replicas_entry *r;
+               struct bch_replicas_entry_v1 *r;
 
                for_each_replicas_entry(replicas, r)
                        for (i = 0; i < r->nr_devs; i++)
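
The long run of replicas.c changes is a mechanical rename of struct bch_replicas_entry to bch_replicas_entry_v1, clearing the unsuffixed name for a future entry format. For reference, the renamed struct as defined in bcachefs_format.h (copied here for context, not part of this hunk):

	struct bch_replicas_entry_v1 {
		__u8		data_type;
		__u8		nr_devs;
		__u8		nr_required;
		__u8		devs[];
	} __packed;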
index 4887675a86f09c7a3942f3eae33d76179fe3c7bc..b2bb12a9b5335fbb25a95d339ceecb0a28f06433 100644 (file)
@@ -6,26 +6,26 @@
 #include "eytzinger.h"
 #include "replicas_types.h"
 
-void bch2_replicas_entry_sort(struct bch_replicas_entry *);
+void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
 void bch2_replicas_entry_to_text(struct printbuf *,
-                                struct bch_replicas_entry *);
+                                struct bch_replicas_entry_v1 *);
 void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
 
-static inline struct bch_replicas_entry *
+static inline struct bch_replicas_entry_v1 *
 cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
 {
        return (void *) r->entries + r->entry_size * i;
 }
 
 int bch2_replicas_entry_idx(struct bch_fs *,
-                           struct bch_replicas_entry *);
+                           struct bch_replicas_entry_v1 *);
 
-void bch2_devlist_to_replicas(struct bch_replicas_entry *,
+void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *,
                              enum bch_data_type,
                              struct bch_devs_list);
-bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *);
+bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *);
 int bch2_mark_replicas(struct bch_fs *,
-                      struct bch_replicas_entry *);
+                      struct bch_replicas_entry_v1 *);
 
 static inline struct replicas_delta *
 replicas_delta_next(struct replicas_delta *d)
@@ -35,9 +35,9 @@ replicas_delta_next(struct replicas_delta *d)
 
 int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *);
 
-void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c);
+void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c);
 
-static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
+static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e,
                                              unsigned dev)
 {
        e->data_type    = BCH_DATA_cached;
@@ -57,7 +57,7 @@ int bch2_replicas_gc_start(struct bch_fs *, unsigned);
 int bch2_replicas_gc2(struct bch_fs *);
 
 int bch2_replicas_set_usage(struct bch_fs *,
-                           struct bch_replicas_entry *,
+                           struct bch_replicas_entry_v1 *,
                            u64);
 
 #define for_each_cpu_replicas_entry(_r, _i)                            \
index 5cfff489bbc34860e9e2a833617f9298653b255a..030324078bbab10eda32082a6443af297ea00871 100644 (file)
@@ -5,12 +5,12 @@
 struct bch_replicas_cpu {
        unsigned                nr;
        unsigned                entry_size;
-       struct bch_replicas_entry *entries;
+       struct bch_replicas_entry_v1 *entries;
 };
 
 struct replicas_delta {
        s64                     delta;
-       struct bch_replicas_entry r;
+       struct bch_replicas_entry_v1 r;
 } __packed;
 
 struct replicas_delta_list {
index e151ada1c8bd2db23e31bc1f6f027815585e8ab2..fedc9e102eddad07cbf4743ec1d9122c20e19798 100644 (file)
@@ -235,7 +235,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
        }
 
        for (i = 0; i < c->replicas.nr; i++) {
-               struct bch_replicas_entry *e =
+               struct bch_replicas_entry_v1 *e =
                        cpu_replicas_entry(&c->replicas, i);
                struct jset_entry_data_usage *u =
                        container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
index 5a09a53966be195128baf4d5a35205e6eff4b479..57bce14c9d71447ad832bd4089f5f3329f5c925f 100644 (file)
        x(root_inode_not_dir,                                   240)    \
        x(dir_loop,                                             241)    \
        x(hash_table_key_duplicate,                             242)    \
-       x(hash_table_key_wrong_offset,                          243)
+       x(hash_table_key_wrong_offset,                          243)    \
+       x(unlinked_inode_not_on_deleted_list,                   244)
 
 enum bch_sb_error_id {
 #define x(t, n) BCH_FSCK_ERR_##t = n,
index f37d5ad2c6ffdd458a841796a18d940d26762a41..3a494c5d12478595c76bebc89fd15b517c5ed6d0 100644 (file)
@@ -324,7 +324,7 @@ bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
 }
 EXPORT_SYMBOL_GPL(six_relock_ip);
 
-#ifdef CONFIG_LOCK_SPIN_ON_OWNER
+#ifdef CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN
 
 static inline bool six_owner_running(struct six_lock *lock)
 {
index f4cad903f4d69da7776825f50bf561a1980a02a0..fa8534782183163841874f8d3f972530017b7735 100644 (file)
@@ -166,6 +166,7 @@ void bch2_free_super(struct bch_sb_handle *sb)
        if (!IS_ERR_OR_NULL(sb->bdev))
                blkdev_put(sb->bdev, sb->holder);
        kfree(sb->holder);
+       kfree(sb->sb_name);
 
        kfree(sb->sb);
        memset(sb, 0, sizeof(*sb));
@@ -657,12 +658,13 @@ reread:
        return 0;
 }
 
-int bch2_read_super(const char *path, struct bch_opts *opts,
-                   struct bch_sb_handle *sb)
+int __bch2_read_super(const char *path, struct bch_opts *opts,
+                   struct bch_sb_handle *sb, bool ignore_notbchfs_msg)
 {
        u64 offset = opt_get(*opts, sb);
        struct bch_sb_layout layout;
        struct printbuf err = PRINTBUF;
+       struct printbuf err2 = PRINTBUF;
        __le64 *i;
        int ret;
 #ifndef __KERNEL__
@@ -675,6 +677,10 @@ retry:
        if (!sb->holder)
                return -ENOMEM;
 
+       sb->sb_name = kstrdup(path, GFP_KERNEL);
+       if (!sb->sb_name)
+               return -ENOMEM;
+
 #ifndef __KERNEL__
        if (opt_get(*opts, direct_io) == false)
                sb->mode |= BLK_OPEN_BUFFERED;
@@ -721,8 +727,14 @@ retry:
        if (opt_defined(*opts, sb))
                goto err;
 
-       printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s\n",
+       prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n",
               path, err.buf);
+       if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg)
+               printk(KERN_INFO "%s", err2.buf);
+       else
+               printk(KERN_ERR "%s", err2.buf);
+
+       printbuf_exit(&err2);
        printbuf_reset(&err);
 
        /*
@@ -798,6 +810,20 @@ err_no_print:
        goto out;
 }
 
+int bch2_read_super(const char *path, struct bch_opts *opts,
+                   struct bch_sb_handle *sb)
+{
+       return __bch2_read_super(path, opts, sb, false);
+}
+
+/* provide a silenced version for mount.bcachefs */
+
+int bch2_read_super_silent(const char *path, struct bch_opts *opts,
+                   struct bch_sb_handle *sb)
+{
+       return __bch2_read_super(path, opts, sb, true);
+}
+
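
bch2_read_super_silent() exists for the mount helper, which probes every device it is handed; a device that simply is not bcachefs should log at KERN_INFO rather than KERN_ERR. A hedged caller sketch (the function name is hypothetical):

	static int probe_device(const char *path, struct bch_opts *opts)
	{
		struct bch_sb_handle sb;
		int ret = bch2_read_super_silent(path, opts, &sb);

		if (ret)
			return ret;	/* invalid_sb_magic was logged at INFO, not ERR */

		/* ... inspect sb.sb ... */
		bch2_free_super(&sb);
		return 0;
	}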
 /* write superblock: */
 
 static void write_super_endio(struct bio *bio)
index f5abd102bff7502bd2f142dfde8487c82f8aed29..e6f40a05431933e146c68376c4c87eacce38402e 100644 (file)
@@ -74,6 +74,7 @@ void bch2_free_super(struct bch_sb_handle *);
 int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
 
 int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
+int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *);
 int bch2_write_super(struct bch_fs *);
 void __bch2_check_set_feature(struct bch_fs *, unsigned);
 
index bb9451082e872ca6752e085ca1c884563821a157..91f757173ef0ae5cdbb96e0f2cef9570f23ab57e 100644 (file)
@@ -314,7 +314,8 @@ void bch2_fs_read_only(struct bch_fs *c)
                BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
                BUG_ON(atomic_read(&c->btree_cache.dirty));
                BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
-               BUG_ON(c->btree_write_buffer.state.nr);
+               BUG_ON(c->btree_write_buffer.inc.keys.nr);
+               BUG_ON(c->btree_write_buffer.flushing.keys.nr);
 
                bch_verbose(c, "marking filesystem clean");
                bch2_fs_mark_clean(c);
@@ -504,8 +505,8 @@ static void __bch2_fs_free(struct bch_fs *c)
        bch2_io_clock_exit(&c->io_clock[WRITE]);
        bch2_io_clock_exit(&c->io_clock[READ]);
        bch2_fs_compress_exit(c);
-       bch2_journal_keys_free(&c->journal_keys);
-       bch2_journal_entries_free(c);
+       bch2_journal_keys_put_initial(c);
+       BUG_ON(atomic_read(&c->journal_keys.ref));
        bch2_fs_btree_write_buffer_exit(c);
        percpu_free_rwsem(&c->mark_lock);
        free_percpu(c->online_reserved);
@@ -704,6 +705,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 
        init_rwsem(&c->gc_lock);
        mutex_init(&c->gc_gens_lock);
+       atomic_set(&c->journal_keys.ref, 1);
+       c->journal_keys.initial_ref_held = true;
 
        for (i = 0; i < BCH_TIME_STAT_NR; i++)
                bch2_time_stats_init(&c->times[i]);
index 7dda4985b99fe6cfdde52c6df869e3df446d48d0..9c1fd4ca2b103478c9d943a2063f64825b8550cb 100644 (file)
@@ -5,6 +5,7 @@
 struct bch_sb_handle {
        struct bch_sb           *sb;
        struct block_device     *bdev;
+       char                    *sb_name;
        struct bio              *bio;
        void                    *holder;
        size_t                  buffer_size;
index 8df45da5a9bf9b8a795623afdedd69ca99589701..264c46b456c2af470dfdc9d5745fe676efa0795d 100644 (file)
@@ -496,7 +496,7 @@ STORE(bch2_fs)
 
                sc.gfp_mask = GFP_KERNEL;
                sc.nr_to_scan = strtoul_or_return(buf);
-               c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
+               c->btree_cache.shrink->scan_objects(c->btree_cache.shrink, &sc);
        }
 
        if (attr == &sysfs_btree_wakeup)
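
The "." to "->" change reflects the new shrinker API (see the linux/shrinker.c stub at the end of this diff): shrinkers are now heap-allocated via shrinker_alloc() and activated with shrinker_register(). A hedged registration sketch; the callback names are assumptions, not taken from this diff:

	static int btree_cache_shrinker_init(struct bch_fs *c)
	{
		struct shrinker *s = shrinker_alloc(0, "bcachefs-btree-cache");

		if (!s)
			return -ENOMEM;

		s->count_objects = bch2_btree_cache_count;	/* assumed callbacks */
		s->scan_objects  = bch2_btree_cache_scan;
		s->private_data  = c;
		shrinker_register(s);
		c->btree_cache.shrink = s;
		return 0;
	}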
index 09a530325dd05e43d18f7e81a3d64a3fbd95c6ea..7b24e7fe3b5332c984282d737c1088c6a31bc09e 100644 (file)
@@ -188,6 +188,25 @@ DEFINE_EVENT(bch_fs, journal_entry_full,
        TP_ARGS(c)
 );
 
+TRACE_EVENT(journal_entry_close,
+       TP_PROTO(struct bch_fs *c, unsigned bytes),
+       TP_ARGS(c, bytes),
+
+       TP_STRUCT__entry(
+               __field(dev_t,          dev                     )
+               __field(u32,            bytes                   )
+       ),
+
+       TP_fast_assign(
+               __entry->dev                    = c->dev;
+               __entry->bytes                  = bytes;
+       ),
+
+       TP_printk("%d,%d entry bytes %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->bytes)
+);
+
 DEFINE_EVENT(bio, journal_write,
        TP_PROTO(struct bio *bio),
        TP_ARGS(bio)
@@ -1313,21 +1332,38 @@ TRACE_EVENT(write_buffer_flush,
                  __entry->nr, __entry->size, __entry->skipped, __entry->fast)
 );
 
+TRACE_EVENT(write_buffer_flush_sync,
+       TP_PROTO(struct btree_trans *trans, unsigned long caller_ip),
+       TP_ARGS(trans, caller_ip),
+
+       TP_STRUCT__entry(
+               __array(char,                   trans_fn, 32    )
+               __field(unsigned long,          caller_ip       )
+       ),
+
+       TP_fast_assign(
+               strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+               __entry->caller_ip              = caller_ip;
+       ),
+
+       TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
+);
+
 TRACE_EVENT(write_buffer_flush_slowpath,
-       TP_PROTO(struct btree_trans *trans, size_t nr, size_t size),
-       TP_ARGS(trans, nr, size),
+       TP_PROTO(struct btree_trans *trans, size_t slowpath, size_t total),
+       TP_ARGS(trans, slowpath, total),
 
        TP_STRUCT__entry(
-               __field(size_t,         nr              )
-               __field(size_t,         size            )
+               __field(size_t,         slowpath        )
+               __field(size_t,         total           )
        ),
 
        TP_fast_assign(
-               __entry->nr     = nr;
-               __entry->size   = size;
+               __entry->slowpath       = slowpath;
+               __entry->total          = total;
        ),
 
-       TP_printk("%zu/%zu", __entry->nr, __entry->size)
+       TP_printk("%zu/%zu", __entry->slowpath, __entry->total)
 );
 
 #endif /* _TRACE_BCACHEFS_H */
index f86c9eeafb35ad9da21ebddda8a182ea27970ff8..c16540552d61bc14121b034a9d6e302045ff0dc5 100644 (file)
@@ -36,7 +36,7 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
                        closure_debug_destroy(cl);
 
                        if (destructor)
-                               destructor(cl);
+                               destructor(&cl->work);
 
                        if (parent)
                                closure_put(parent);
@@ -108,8 +108,9 @@ struct closure_syncer {
        int                     done;
 };
 
-static void closure_sync_fn(struct closure *cl)
+static CLOSURE_CALLBACK(closure_sync_fn)
 {
+       struct closure *cl = container_of(ws, struct closure, work);
        struct closure_syncer *s = cl->s;
        struct task_struct *p;
 
index dfefad40b7a5de06b3d12c8790ab5fd3b77f8b43..eae2df6a8b60bcad0173514b1bf0516886ebd98f 100644 (file)
 static LIST_HEAD(shrinker_list);
 static DEFINE_MUTEX(shrinker_lock);
 
-int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
+struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...)
+{
+       return calloc(1, sizeof(struct shrinker));
+}
+
+int shrinker_register(struct shrinker *shrinker)
 {
        mutex_lock(&shrinker_lock);
        list_add_tail(&shrinker->list, &shrinker_list);