Update bcachefs sources to 3913e0cac3 bcachefs: Journal space calculation fix
author    Kent Overstreet <kent.overstreet@gmail.com>
          Mon, 31 May 2021 19:05:33 +0000 (15:05 -0400)
committer Kent Overstreet <kent.overstreet@gmail.com>
          Mon, 31 May 2021 19:05:33 +0000 (15:05 -0400)
33 files changed:
.bcachefs_revision
include/trace/events/bcachefs.h
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bset.c
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_io.h
libbcachefs/btree_iter.c
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_interior.h
libbcachefs/buckets.c
libbcachefs/fs-common.c
libbcachefs/fs-ioctl.c
libbcachefs/fs.c
libbcachefs/inode.c
libbcachefs/inode.h
libbcachefs/io.c
libbcachefs/io.h
libbcachefs/journal.c
libbcachefs/journal_io.c
libbcachefs/journal_reclaim.c
libbcachefs/journal_seq_blacklist.c
libbcachefs/journal_types.h
libbcachefs/move.c
libbcachefs/movinggc.c
libbcachefs/opts.h
libbcachefs/recovery.c
libbcachefs/reflink.c
libbcachefs/reflink.h
libbcachefs/super-io.c
libbcachefs/super.c
libbcachefs/sysfs.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 8da505a4f642f843f74eed55010caea7c46bd16f..93876ae2f84bf15ced7a00a878e9d6fd63adc2d9 100644
@@ -1 +1 @@
-ac3ab6a511717db1644ded49a6f417304abba048
+3913e0cac34e0993ab6dde67a2dec1ea485a2e28
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index c79338c8ebf7cbfd44cc6a26de1c8fcc7ce29efb..7c90ba01510a756d0a0744d78492ccb515b4174a 100644
@@ -49,14 +49,14 @@ DECLARE_EVENT_CLASS(bch_fs,
        TP_ARGS(c),
 
        TP_STRUCT__entry(
-               __array(char,           uuid,   16 )
+               __field(dev_t,          dev                     )
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+               __entry->dev            = c->dev;
        ),
 
-       TP_printk("%pU", __entry->uuid)
+       TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
 );
 
 DECLARE_EVENT_CLASS(bio,
@@ -131,7 +131,7 @@ TRACE_EVENT(journal_reclaim_start,
                btree_key_cache_dirty, btree_key_cache_total),
 
        TP_STRUCT__entry(
-               __array(char,           uuid,   16              )
+               __field(dev_t,          dev                     )
                __field(u64,            min_nr                  )
                __field(u64,            prereserved             )
                __field(u64,            prereserved_total       )
@@ -142,7 +142,7 @@ TRACE_EVENT(journal_reclaim_start,
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+               __entry->dev                    = c->dev;
                __entry->min_nr                 = min_nr;
                __entry->prereserved            = prereserved;
                __entry->prereserved_total      = prereserved_total;
@@ -152,8 +152,8 @@ TRACE_EVENT(journal_reclaim_start,
                __entry->btree_key_cache_total  = btree_key_cache_total;
        ),
 
-       TP_printk("%pU min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
-                 __entry->uuid,
+       TP_printk("%d,%d min %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->min_nr,
                  __entry->prereserved,
                  __entry->prereserved_total,
@@ -168,16 +168,18 @@ TRACE_EVENT(journal_reclaim_finish,
        TP_ARGS(c, nr_flushed),
 
        TP_STRUCT__entry(
-               __array(char,           uuid,   16 )
-               __field(u64,            nr_flushed )
+               __field(dev_t,          dev                     )
+               __field(u64,            nr_flushed              )
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
-               __entry->nr_flushed = nr_flushed;
+               __entry->dev            = c->dev;
+               __entry->nr_flushed     = nr_flushed;
        ),
 
-       TP_printk("%pU flushed %llu", __entry->uuid, __entry->nr_flushed)
+       TP_printk("%d%d flushed %llu",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->nr_flushed)
 );
 
 /* bset.c: */
@@ -194,7 +196,7 @@ DECLARE_EVENT_CLASS(btree_node,
        TP_ARGS(c, b),
 
        TP_STRUCT__entry(
-               __array(char,           uuid,           16      )
+               __field(dev_t,          dev                     )
                __field(u8,             level                   )
                __field(u8,             id                      )
                __field(u64,            inode                   )
@@ -202,15 +204,16 @@ DECLARE_EVENT_CLASS(btree_node,
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+               __entry->dev            = c->dev;
                __entry->level          = b->c.level;
                __entry->id             = b->c.btree_id;
                __entry->inode          = b->key.k.p.inode;
                __entry->offset         = b->key.k.p.offset;
        ),
 
-       TP_printk("%pU  %u id %u %llu:%llu",
-                 __entry->uuid, __entry->level, __entry->id,
+       TP_printk("%d,%d  %u id %u %llu:%llu",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->level, __entry->id,
                  __entry->inode, __entry->offset)
 );
 
@@ -254,32 +257,17 @@ DEFINE_EVENT(btree_node, btree_node_reap,
        TP_ARGS(c, b)
 );
 
-DECLARE_EVENT_CLASS(btree_node_cannibalize_lock,
-       TP_PROTO(struct bch_fs *c),
-       TP_ARGS(c),
-
-       TP_STRUCT__entry(
-               __array(char,                   uuid,   16      )
-       ),
-
-       TP_fast_assign(
-               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
-       ),
-
-       TP_printk("%pU", __entry->uuid)
-);
-
-DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock_fail,
+DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock_fail,
        TP_PROTO(struct bch_fs *c),
        TP_ARGS(c)
 );
 
-DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock,
+DEFINE_EVENT(bch_fs, btree_node_cannibalize_lock,
        TP_PROTO(struct bch_fs *c),
        TP_ARGS(c)
 );
 
-DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize,
+DEFINE_EVENT(bch_fs, btree_node_cannibalize,
        TP_PROTO(struct bch_fs *c),
        TP_ARGS(c)
 );
@@ -294,18 +282,19 @@ TRACE_EVENT(btree_reserve_get_fail,
        TP_ARGS(c, required, cl),
 
        TP_STRUCT__entry(
-               __array(char,                   uuid,   16      )
+               __field(dev_t,          dev                     )
                __field(size_t,                 required        )
                __field(struct closure *,       cl              )
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+               __entry->dev            = c->dev;
                __entry->required = required;
                __entry->cl = cl;
        ),
 
-       TP_printk("%pU required %zu by %p", __entry->uuid,
+       TP_printk("%d,%d required %zu by %p",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->required, __entry->cl)
 );
 
@@ -483,19 +472,20 @@ TRACE_EVENT(move_data,
        TP_ARGS(c, sectors_moved, keys_moved),
 
        TP_STRUCT__entry(
-               __array(char,           uuid,   16      )
+               __field(dev_t,          dev                     )
                __field(u64,            sectors_moved   )
                __field(u64,            keys_moved      )
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+               __entry->dev                    = c->dev;
                __entry->sectors_moved = sectors_moved;
                __entry->keys_moved = keys_moved;
        ),
 
-       TP_printk("%pU sectors_moved %llu keys_moved %llu",
-               __entry->uuid, __entry->sectors_moved, __entry->keys_moved)
+       TP_printk("%d,%d sectors_moved %llu keys_moved %llu",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->sectors_moved, __entry->keys_moved)
 );
 
 TRACE_EVENT(copygc,
@@ -507,7 +497,7 @@ TRACE_EVENT(copygc,
                buckets_moved, buckets_not_moved),
 
        TP_STRUCT__entry(
-               __array(char,           uuid,   16              )
+               __field(dev_t,          dev                     )
                __field(u64,            sectors_moved           )
                __field(u64,            sectors_not_moved       )
                __field(u64,            buckets_moved           )
@@ -515,17 +505,39 @@ TRACE_EVENT(copygc,
        ),
 
        TP_fast_assign(
-               memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
+               __entry->dev                    = c->dev;
                __entry->sectors_moved          = sectors_moved;
                __entry->sectors_not_moved      = sectors_not_moved;
                __entry->buckets_moved          = buckets_moved;
                __entry->buckets_not_moved      = buckets_not_moved;
        ),
 
-       TP_printk("%pU sectors moved %llu remain %llu buckets moved %llu remain %llu",
-               __entry->uuid,
-               __entry->sectors_moved, __entry->sectors_not_moved,
-               __entry->buckets_moved, __entry->buckets_not_moved)
+       TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->sectors_moved, __entry->sectors_not_moved,
+                 __entry->buckets_moved, __entry->buckets_not_moved)
+);
+
+TRACE_EVENT(copygc_wait,
+       TP_PROTO(struct bch_fs *c,
+                u64 wait_amount, u64 until),
+       TP_ARGS(c, wait_amount, until),
+
+       TP_STRUCT__entry(
+               __field(dev_t,          dev                     )
+               __field(u64,            wait_amount             )
+               __field(u64,            until                   )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = c->dev;
+               __entry->wait_amount    = wait_amount;
+               __entry->until          = until;
+       ),
+
+       TP_printk("%d,%u waiting for %llu sectors until %llu",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->wait_amount, __entry->until)
 );
 
 TRACE_EVENT(trans_get_iter,
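
A note on the change running through this file: every tracepoint that previously logged a 16-byte filesystem UUID (__array(char, uuid, 16), printed with %pU) now logs the dev_t assigned in fs.c further down, printed as "major,minor". That shrinks each ring-buffer entry and matches how block-layer tracepoints identify devices. A minimal sketch of the decode side, assuming only the standard <linux/kdev_t.h> helpers (the function name is hypothetical):

    #include <linux/kdev_t.h>
    #include <linux/printk.h>

    /* MAJOR()/MINOR() unpack the dev_t that TP_fast_assign stored from
     * c->dev; this mirrors the "%d,%d" TP_printk format used above */
    static void show_fs_dev_sketch(dev_t dev)
    {
            printk(KERN_INFO "bcachefs on %d,%d\n", MAJOR(dev), MINOR(dev));
    }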
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 24aa2cc7d965f929ebbe951c5b34359f58299fc5..8be95d81180f0bd6a95871cc9286613644c69f18 100644
@@ -263,7 +263,10 @@ do {                                                                       \
        BCH_DEBUG_PARAM(verify_btree_ondisk,                            \
                "Reread btree nodes at various points to verify the "   \
                "mergesort in the read path against modifications "     \
-               "done in memory")
+               "done in memory")                                       \
+       BCH_DEBUG_PARAM(verify_all_btree_replicas,                      \
+               "When reading btree nodes, read all replicas and "      \
+               "compare them")
 
 /* Parameters that should only be compiled in in debug mode: */
 #define BCH_DEBUG_PARAMS_DEBUG()                                       \
@@ -387,6 +390,14 @@ struct gc_pos {
        unsigned                level;
 };
 
+struct reflink_gc {
+       u64             offset;
+       u32             size;
+       u32             refcount;
+};
+
+typedef GENRADIX(struct reflink_gc) reflink_gc_table;
+
 struct io_count {
        u64                     sectors[2][BCH_DATA_NR];
 };
@@ -564,6 +575,7 @@ struct bch_fs {
        int                     minor;
        struct device           *chardev;
        struct super_block      *vfs_sb;
+       dev_t                   dev;
        char                    name[40];
 
        /* ro/rw, add/remove/resize devices: */
@@ -623,6 +635,7 @@ struct bch_fs {
 
        /* BTREE CACHE */
        struct bio_set          btree_bio;
+       struct workqueue_struct *io_complete_wq;
 
        struct btree_root       btree_roots[BTREE_ID_NR];
        struct mutex            btree_root_lock;
@@ -660,7 +673,8 @@ struct bch_fs {
 
        struct btree_key_cache  btree_key_cache;
 
-       struct workqueue_struct *wq;
+       struct workqueue_struct *btree_update_wq;
+       struct workqueue_struct *btree_error_wq;
        /* copygc needs its own workqueue for index updates.. */
        struct workqueue_struct *copygc_wq;
 
@@ -799,6 +813,9 @@ struct bch_fs {
 
        /* REFLINK */
        u64                     reflink_hint;
+       reflink_gc_table        reflink_gc_table;
+       size_t                  reflink_gc_nr;
+       size_t                  reflink_gc_idx;
 
        /* VFS IO PATH - fs-io.c */
        struct bio_set          writepage_bioset;
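
On the reflink_gc_table type introduced above: GENRADIX() (from the kernel's <linux/generic-radix-tree.h>) declares a radix tree of fixed-size entries addressed by integer index, which lets GC build a flat, offset-ordered snapshot of every indirect extent without one large allocation. A sketch of the append pattern, mirroring the genradix_ptr_alloc() calls in the btree_gc.c hunks below (the helper name is hypothetical):

    #include <linux/generic-radix-tree.h>

    static int reflink_gc_append_sketch(reflink_gc_table *tbl, size_t *nr,
                                        u64 offset, u32 size)
    {
            /* allocates slot *nr on demand, growing the tree as needed */
            struct reflink_gc *r = genradix_ptr_alloc(tbl, (*nr)++, GFP_KERNEL);

            if (!r)
                    return -ENOMEM;

            r->offset       = offset;       /* end position of the extent */
            r->size         = size;
            r->refcount     = 0;            /* recomputed during mark */
            return 0;
    }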
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index d640a3115adc0cda2d7d68077031bedfcc0abc4d..79c0876aab8bdd26767b1e44da0a1a2cf1f79083 100644
@@ -1344,6 +1344,7 @@ LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES,     struct bch_sb, flags[2],  4, 64);
 
 LE64_BITMASK(BCH_SB_ERASURE_CODE,      struct bch_sb, flags[3],  0, 16);
 LE64_BITMASK(BCH_SB_METADATA_TARGET,   struct bch_sb, flags[3], 16, 28);
+LE64_BITMASK(BCH_SB_SHARD_INUMS,       struct bch_sb, flags[3], 28, 29);
 
 /*
  * Features:
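
BCH_SB_SHARD_INUMS above claims bit 28 of flags[3] (the half-open range [28, 29), i.e. a single-bit option). LE64_BITMASK() generates a getter plus a SET_ accessor over the little-endian field; roughly equivalent, as a sketch (the real macro emits BCH_SB_SHARD_INUMS() and SET_BCH_SB_SHARD_INUMS(); the _sketch suffixes here avoid clashing with them):

    static inline u64 BCH_SB_SHARD_INUMS_sketch(const struct bch_sb *sb)
    {
            return (le64_to_cpu(sb->flags[3]) >> 28) & 1;
    }

    static inline void SET_BCH_SB_SHARD_INUMS_sketch(struct bch_sb *sb, u64 v)
    {
            u64 f = le64_to_cpu(sb->flags[3]);

            f &= ~(1ULL << 28);
            f |= (v & 1) << 28;
            sb->flags[3] = cpu_to_le64(f);
    }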
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index 26203a5da5046a928a5d0aa4a1316792a4b45bb7..8a149e21d0b47beaf8903a8f123ba24057ce6670 100644
@@ -1193,13 +1193,11 @@ static struct bkey_packed *bset_search_write_set(const struct btree *b,
 
 static inline void prefetch_four_cachelines(void *p)
 {
-#if (CONFIG_X86_64 && !defined(__clang__))
-       asm(".intel_syntax noprefix;"
-           "prefetcht0 [%0 - 127 + 64 * 0];"
-           "prefetcht0 [%0 - 127 + 64 * 1];"
-           "prefetcht0 [%0 - 127 + 64 * 2];"
-           "prefetcht0 [%0 - 127 + 64 * 3];"
-           ".att_syntax prefix;"
+#if CONFIG_X86_64
+       asm("prefetcht0 (-127 + 64 * 0)(%0);"
+           "prefetcht0 (-127 + 64 * 1)(%0);"
+           "prefetcht0 (-127 + 64 * 2)(%0);"
+           "prefetcht0 (-127 + 64 * 3)(%0);"
            :
            : "r" (p + 127));
 #else
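
The rewrite above keeps the four prefetcht0 instructions but switches the inline asm from Intel to AT&T operand syntax, which lets the !defined(__clang__) guard go away (clang rejects the .intel_syntax toggling the old version relied on). For reference, a compiler-portable sketch of the same idea using a builtin both gcc and clang support — not what the patch does, since the patch keeps the explicit prefetcht0 hint:

    static inline void prefetch_four_cachelines_sketch(void *p)
    {
            /* net addresses match the asm above: p + 64 * n */
            __builtin_prefetch(p + 64 * 0);
            __builtin_prefetch(p + 64 * 1);
            __builtin_prefetch(p + 64 * 2);
            __builtin_prefetch(p + 64 * 3);
    }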
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index e28292e0aa89bcd774c411241bac67d93bc17dc5..b03432c13fbb1bd0ab4b3ff5c7b73dc13369c2a8 100644
@@ -23,6 +23,7 @@
 #include "keylist.h"
 #include "move.h"
 #include "recovery.h"
+#include "reflink.h"
 #include "replicas.h"
 #include "super-io.h"
 
@@ -1282,6 +1283,201 @@ static int bch2_gc_start(struct bch_fs *c,
        return 0;
 }
 
+static int bch2_gc_reflink_done_initial_fn(struct bch_fs *c, struct bkey_s_c k)
+{
+       struct reflink_gc *r;
+       const __le64 *refcount = bkey_refcount_c(k);
+       char buf[200];
+       int ret = 0;
+
+       if (!refcount)
+               return 0;
+
+       r = genradix_ptr(&c->reflink_gc_table, c->reflink_gc_idx++);
+
+       if (!r ||
+           r->offset != k.k->p.offset ||
+           r->size != k.k->size) {
+               bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
+               return -EINVAL;
+       }
+
+       if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
+                       "reflink key has wrong refcount:\n"
+                       "  %s\n"
+                       "  should be %u",
+                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
+                       r->refcount)) {
+               struct bkey_i *new;
+
+               new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
+               if (!new) {
+                       ret = -ENOMEM;
+                       goto fsck_err;
+               }
+
+               bkey_reassemble(new, k);
+
+               if (!r->refcount) {
+                       new->k.type = KEY_TYPE_deleted;
+                       new->k.size = 0;
+               } else {
+                       *bkey_refcount(new) = cpu_to_le64(r->refcount);
+               }
+
+               ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new);
+               if (ret)
+                       kfree(new);
+       }
+fsck_err:
+       return ret;
+}
+
+static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
+                               bool metadata_only)
+{
+       struct btree_trans trans;
+       struct btree_iter *iter;
+       struct bkey_s_c k;
+       struct reflink_gc *r;
+       size_t idx = 0;
+       char buf[200];
+       int ret = 0;
+
+       if (metadata_only)
+               return 0;
+
+       if (initial) {
+               c->reflink_gc_idx = 0;
+
+               ret = bch2_btree_and_journal_walk(c, BTREE_ID_reflink,
+                               bch2_gc_reflink_done_initial_fn);
+               goto out;
+       }
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
+                          BTREE_ITER_PREFETCH, k, ret) {
+               const __le64 *refcount = bkey_refcount_c(k);
+
+               if (!refcount)
+                       continue;
+
+               r = genradix_ptr(&c->reflink_gc_table, idx);
+               if (!r ||
+                   r->offset != k.k->p.offset ||
+                   r->size != k.k->size) {
+                       bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
+                       ret = -EINVAL;
+                       break;
+               }
+
+               if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
+                               "reflink key has wrong refcount:\n"
+                               "  %s\n"
+                               "  should be %u",
+                               (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
+                               r->refcount)) {
+                       struct bkey_i *new;
+
+                       new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
+                       if (!new) {
+                               ret = -ENOMEM;
+                               break;
+                       }
+
+                       bkey_reassemble(new, k);
+
+                       if (!r->refcount)
+                               new->k.type = KEY_TYPE_deleted;
+                       else
+                               *bkey_refcount(new) = cpu_to_le64(r->refcount);
+
+                       ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+                                       __bch2_btree_insert(&trans, BTREE_ID_reflink, new));
+                       kfree(new);
+
+                       if (ret)
+                               break;
+               }
+       }
+fsck_err:
+       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_exit(&trans);
+out:
+       genradix_free(&c->reflink_gc_table);
+       c->reflink_gc_nr = 0;
+       return ret;
+}
+
+static int bch2_gc_reflink_start_initial_fn(struct bch_fs *c, struct bkey_s_c k)
+{
+       struct reflink_gc *r;
+       const __le64 *refcount = bkey_refcount_c(k);
+
+       if (!refcount)
+               return 0;
+
+       r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
+                              GFP_KERNEL);
+       if (!r)
+               return -ENOMEM;
+
+       r->offset       = k.k->p.offset;
+       r->size         = k.k->size;
+       r->refcount     = 0;
+       return 0;
+}
+
+static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
+                                bool metadata_only)
+{
+       struct btree_trans trans;
+       struct btree_iter *iter;
+       struct bkey_s_c k;
+       struct reflink_gc *r;
+       int ret;
+
+       if (metadata_only)
+               return 0;
+
+       genradix_free(&c->reflink_gc_table);
+       c->reflink_gc_nr = 0;
+
+       if (initial)
+               return bch2_btree_and_journal_walk(c, BTREE_ID_reflink,
+                               bch2_gc_reflink_start_initial_fn);
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
+                          BTREE_ITER_PREFETCH, k, ret) {
+               const __le64 *refcount = bkey_refcount_c(k);
+
+               if (!refcount)
+                       continue;
+
+               r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
+                                      GFP_KERNEL);
+               if (!r) {
+                       ret = -ENOMEM;
+                       break;
+               }
+
+               r->offset       = k.k->p.offset;
+               r->size         = k.k->size;
+               r->refcount     = 0;
+       }
+       bch2_trans_iter_put(&trans, iter);
+
+       bch2_trans_exit(&trans);
+       return ret;
+}
+
 /**
  * bch2_gc - walk _all_ references to buckets, and recompute them:
  *
@@ -1316,7 +1512,8 @@ int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
        closure_wait_event(&c->btree_interior_update_wait,
                           !bch2_btree_interior_updates_nr_pending(c));
 again:
-       ret = bch2_gc_start(c, metadata_only);
+       ret   = bch2_gc_start(c, metadata_only) ?:
+               bch2_gc_reflink_start(c, initial, metadata_only);
        if (ret)
                goto out;
 
@@ -1378,7 +1575,8 @@ out:
                bch2_journal_block(&c->journal);
 
                percpu_down_write(&c->mark_lock);
-               ret = bch2_gc_done(c, initial, metadata_only);
+               ret   = bch2_gc_reflink_done(c, initial, metadata_only) ?:
+                       bch2_gc_done(c, initial, metadata_only);
 
                bch2_journal_unblock(&c->journal);
        } else {
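
Taken together, the reflink GC added in this file is a three-phase protocol: bch2_gc_reflink_start() snapshots every indirect extent into reflink_gc_table with refcount zero, the mark path (see the buckets.c hunks below) recomputes refcounts while walking reflink pointers, and bch2_gc_reflink_done() compares the recomputed counts against the on-disk ones, deleting dead keys and rewriting stale refcounts under fsck_err_on(). The done-phase decision, condensed into a sketch (names with a _sketch suffix are hypothetical):

    enum reflink_repair_sketch { REFLINK_OK, REFLINK_DELETE, REFLINK_REWRITE };

    static enum reflink_repair_sketch
    reflink_check_one_sketch(u32 recomputed, u64 ondisk_refcount)
    {
            if (recomputed == ondisk_refcount)
                    return REFLINK_OK;      /* consistent */
            return recomputed
                    ? REFLINK_REWRITE       /* fix the stored refcount */
                    : REFLINK_DELETE;       /* no references left */
    }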
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 094285bd1cc202d5f5306ab48520c79d8067a981..47cfd8a08f91f18f17f35b4b1af77c5f04cb8c29 100644
@@ -521,7 +521,7 @@ enum btree_validate_ret {
                                                                        \
        switch (write) {                                                \
        case READ:                                                      \
-               bch_err(c, "%s", _buf2);                                        \
+               bch_err(c, "%s", _buf2);                                \
                                                                        \
                switch (type) {                                         \
                case BTREE_ERR_FIXABLE:                                 \
@@ -815,6 +815,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
        bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
                BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
        unsigned u64s;
+       unsigned nonblacklisted_written = 0;
        int ret, retry_read = 0, write = READ;
 
        b->version_ondisk = U16_MAX;
@@ -934,15 +935,31 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
                sort_iter_add(iter,
                              vstruct_idx(i, whiteout_u64s),
                              vstruct_last(i));
+
+               nonblacklisted_written = b->written;
        }
 
        for (bne = write_block(b);
             bset_byte_offset(b, bne) < btree_bytes(c);
             bne = (void *) bne + block_bytes(c))
-               btree_err_on(bne->keys.seq == b->data->keys.seq,
+               btree_err_on(bne->keys.seq == b->data->keys.seq &&
+                            !bch2_journal_seq_is_blacklisted(c,
+                                       le64_to_cpu(bne->keys.journal_seq),
+                                       true),
                             BTREE_ERR_WANT_RETRY, c, ca, b, NULL,
                             "found bset signature after last bset");
 
+       /*
+        * Blacklisted bsets are those that were written after the most recent
+        * (flush) journal write. Since there wasn't a flush, they may not have
+        * made it to all devices - which means we shouldn't write new bsets
+        * after them, as that could leave a gap and then reads from that device
+        * wouldn't find all the bsets in that btree node - which means it's
+        * important that we start writing new bsets after the most recent _non_
+        * blacklisted bset:
+        */
+       b->written = nonblacklisted_written;
+
        sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
        sorted->keys.u64s = 0;
 
@@ -1027,8 +1044,8 @@ static void btree_node_read_work(struct work_struct *work)
        struct btree_read_bio *rb =
                container_of(work, struct btree_read_bio, work);
        struct bch_fs *c        = rb->c;
+       struct btree *b         = rb->b;
        struct bch_dev *ca      = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
-       struct btree *b         = rb->bio.bi_private;
        struct bio *bio         = &rb->bio;
        struct bch_io_failures failed = { .nr = 0 };
        char buf[200];
@@ -1101,7 +1118,263 @@ static void btree_node_read_endio(struct bio *bio)
                bch2_latency_acct(ca, rb->start_time, READ);
        }
 
-       queue_work(system_unbound_wq, &rb->work);
+       queue_work(c->io_complete_wq, &rb->work);
+}
+
+struct btree_node_read_all {
+       struct closure          cl;
+       struct bch_fs           *c;
+       struct btree            *b;
+       unsigned                nr;
+       void                    *buf[BCH_REPLICAS_MAX];
+       struct bio              *bio[BCH_REPLICAS_MAX];
+       int                     err[BCH_REPLICAS_MAX];
+};
+
+static unsigned btree_node_sectors_written(struct bch_fs *c, void *data)
+{
+       struct btree_node *bn = data;
+       struct btree_node_entry *bne;
+       unsigned offset = 0;
+
+       if (le64_to_cpu(bn->magic) != bset_magic(c))
+               return 0;
+
+       while (offset < c->opts.btree_node_size) {
+               if (!offset) {
+                       offset += vstruct_sectors(bn, c->block_bits);
+               } else {
+                       bne = data + (offset << 9);
+                       if (bne->keys.seq != bn->keys.seq)
+                               break;
+                       offset += vstruct_sectors(bne, c->block_bits);
+               }
+       }
+
+       return offset;
+}
+
+static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data)
+{
+       struct btree_node *bn = data;
+       struct btree_node_entry *bne;
+
+       if (!offset)
+               return false;
+
+       while (offset < c->opts.btree_node_size) {
+               bne = data + (offset << 9);
+               if (bne->keys.seq == bn->keys.seq)
+                       return true;
+               offset++;
+       }
+
+       return false;
+}
+
+static void btree_node_read_all_replicas_done(struct closure *cl)
+{
+       struct btree_node_read_all *ra =
+               container_of(cl, struct btree_node_read_all, cl);
+       struct bch_fs *c = ra->c;
+       struct btree *b = ra->b;
+       bool have_good_copy = false;
+       bool dump_bset_maps = false;
+       bool have_retry = false;
+       int ret = 0, write = READ;
+       unsigned i, written, written2;
+       __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2
+               ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0;
+
+       for (i = 0; i < ra->nr; i++) {
+               if (ra->err[i])
+                       continue;
+
+               if (!have_good_copy) {
+                       memcpy(b->data, ra->buf[i], btree_bytes(c));
+                       have_good_copy = true;
+                       written = btree_node_sectors_written(c, b->data);
+               }
+
+               /* Try to get the right btree node: */
+               if (have_good_copy &&
+                   seq &&
+                   b->data->keys.seq != seq &&
+                   ((struct btree_node *) ra->buf[i])->keys.seq == seq) {
+                       memcpy(b->data, ra->buf[i], btree_bytes(c));
+                       written = btree_node_sectors_written(c, b->data);
+               }
+
+               written2 = btree_node_sectors_written(c, ra->buf[i]);
+               if (btree_err_on(written2 != written, BTREE_ERR_FIXABLE, c, NULL, b, NULL,
+                                "btree node sectors written mismatch: %u != %u",
+                                written, written2) ||
+                   btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]),
+                                BTREE_ERR_FIXABLE, c, NULL, b, NULL,
+                                "found bset signature after last bset") ||
+                   btree_err_on(memcmp(b->data, ra->buf[i], written << 9),
+                                BTREE_ERR_FIXABLE, c, NULL, b, NULL,
+                                "btree node replicas content mismatch"))
+                       dump_bset_maps = true;
+
+               if (written2 > written) {
+                       written = written2;
+                       memcpy(b->data, ra->buf[i], btree_bytes(c));
+               }
+       }
+fsck_err:
+       if (dump_bset_maps) {
+               for (i = 0; i < ra->nr; i++) {
+                       char buf[200];
+                       struct printbuf out = PBUF(buf);
+                       struct btree_node *bn = ra->buf[i];
+                       struct btree_node_entry *bne = NULL;
+                       unsigned offset = 0, sectors;
+                       bool gap = false;
+
+                       if (ra->err[i])
+                               continue;
+
+                       while (offset < c->opts.btree_node_size) {
+                               if (!offset) {
+                                       sectors = vstruct_sectors(bn, c->block_bits);
+                               } else {
+                                       bne = ra->buf[i] + (offset << 9);
+                                       if (bne->keys.seq != bn->keys.seq)
+                                               break;
+                                       sectors = vstruct_sectors(bne, c->block_bits);
+                               }
+
+                               pr_buf(&out, " %u-%u", offset, offset + sectors);
+                               if (bne && bch2_journal_seq_is_blacklisted(c,
+                                                       le64_to_cpu(bne->keys.journal_seq), false))
+                                       pr_buf(&out, "*");
+                               offset += sectors;
+                       }
+
+                       while (offset < c->opts.btree_node_size) {
+                               bne = ra->buf[i] + (offset << 9);
+                               if (bne->keys.seq == bn->keys.seq) {
+                                       if (!gap)
+                                               pr_buf(&out, " GAP");
+                                       gap = true;
+
+                                       sectors = vstruct_sectors(bne, c->block_bits);
+                                       pr_buf(&out, " %u-%u", offset, offset + sectors);
+                                       if (bch2_journal_seq_is_blacklisted(c,
+                                                       le64_to_cpu(bne->keys.journal_seq), false))
+                                               pr_buf(&out, "*");
+                               }
+                               offset++;
+                       }
+
+                       bch_err(c, "replica %u:%s", i, buf);
+               }
+       }
+
+       if (have_good_copy)
+               bch2_btree_node_read_done(c, NULL, b, false);
+       else
+               set_btree_node_read_error(b);
+
+       for (i = 0; i < ra->nr; i++) {
+               mempool_free(ra->buf[i], &c->btree_bounce_pool);
+               bio_put(ra->bio[i]);
+       }
+
+       closure_debug_destroy(&ra->cl);
+       kfree(ra);
+
+       clear_btree_node_read_in_flight(b);
+       wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
+}
+
+static void btree_node_read_all_replicas_endio(struct bio *bio)
+{
+       struct btree_read_bio *rb =
+               container_of(bio, struct btree_read_bio, bio);
+       struct bch_fs *c        = rb->c;
+       struct btree_node_read_all *ra = rb->ra;
+
+       if (rb->have_ioref) {
+               struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+               bch2_latency_acct(ca, rb->start_time, READ);
+       }
+
+       ra->err[rb->idx] = bio->bi_status;
+       closure_put(&ra->cl);
+}
+
+/*
+ * XXX This allocates multiple times from the same mempools, and can deadlock
+ * under sufficient memory pressure (but is only a debug path)
+ */
+static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync)
+{
+       struct bkey_s_c k = bkey_i_to_s_c(&b->key);
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const union bch_extent_entry *entry;
+       struct extent_ptr_decoded pick;
+       struct btree_node_read_all *ra;
+       unsigned i;
+
+       ra = kzalloc(sizeof(*ra), GFP_NOFS);
+       if (!ra)
+               return -ENOMEM;
+
+       closure_init(&ra->cl, NULL);
+       ra->c   = c;
+       ra->b   = b;
+       ra->nr  = bch2_bkey_nr_ptrs(k);
+
+       for (i = 0; i < ra->nr; i++) {
+               ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
+               ra->bio[i] = bio_alloc_bioset(GFP_NOFS, buf_pages(ra->buf[i],
+                                                                 btree_bytes(c)),
+                                             &c->btree_bio);
+       }
+
+       i = 0;
+       bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) {
+               struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+               struct btree_read_bio *rb =
+                       container_of(ra->bio[i], struct btree_read_bio, bio);
+               rb->c                   = c;
+               rb->b                   = b;
+               rb->ra                  = ra;
+               rb->start_time          = local_clock();
+               rb->have_ioref          = bch2_dev_get_ioref(ca, READ);
+               rb->idx                 = i;
+               rb->pick                = pick;
+               rb->bio.bi_opf          = REQ_OP_READ|REQ_SYNC|REQ_META;
+               rb->bio.bi_iter.bi_sector = pick.ptr.offset;
+               rb->bio.bi_end_io       = btree_node_read_all_replicas_endio;
+               bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c));
+
+               if (rb->have_ioref) {
+                       this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
+                                    bio_sectors(&rb->bio));
+                       bio_set_dev(&rb->bio, ca->disk_sb.bdev);
+
+                       closure_get(&ra->cl);
+                       submit_bio(&rb->bio);
+               } else {
+                       ra->err[i] = BLK_STS_REMOVED;
+               }
+
+               i++;
+       }
+
+       if (sync) {
+               closure_sync(&ra->cl);
+               btree_node_read_all_replicas_done(&ra->cl);
+       } else {
+               continue_at(&ra->cl, btree_node_read_all_replicas_done,
+                           c->io_complete_wq);
+       }
+
+       return 0;
 }
 
 void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
@@ -1117,6 +1390,12 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
        btree_pos_to_text(&PBUF(buf), c, b);
        trace_btree_read(c, b);
 
+       set_btree_node_read_in_flight(b);
+
+       if (bch2_verify_all_btree_replicas &&
+           !btree_node_read_all_replicas(c, b, sync))
+               return;
+
        ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
                                         NULL, &pick);
        if (bch2_fs_fatal_err_on(ret <= 0, c,
@@ -1133,6 +1412,8 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
                               &c->btree_bio);
        rb = container_of(bio, struct btree_read_bio, bio);
        rb->c                   = c;
+       rb->b                   = b;
+       rb->ra                  = NULL;
        rb->start_time          = local_clock();
        rb->have_ioref          = bch2_dev_get_ioref(ca, READ);
        rb->pick                = pick;
@@ -1140,11 +1421,8 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
        bio->bi_opf             = REQ_OP_READ|REQ_SYNC|REQ_META;
        bio->bi_iter.bi_sector  = pick.ptr.offset;
        bio->bi_end_io          = btree_node_read_endio;
-       bio->bi_private         = b;
        bch2_bio_map(bio, b->data, btree_bytes(c));
 
-       set_btree_node_read_in_flight(b);
-
        if (rb->have_ioref) {
                this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
                             bio_sectors(bio));
@@ -1153,7 +1431,6 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
                if (sync) {
                        submit_bio_wait(bio);
 
-                       bio->bi_private = b;
                        btree_node_read_work(&rb->work);
                } else {
                        submit_bio(bio);
@@ -1164,8 +1441,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
                if (sync)
                        btree_node_read_work(&rb->work);
                else
-                       queue_work(system_unbound_wq, &rb->work);
-
+                       queue_work(c->io_complete_wq, &rb->work);
        }
 }
 
@@ -1332,7 +1608,7 @@ static void btree_node_write_work(struct work_struct *work)
                bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio);
                spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
 
-               queue_work(c->wq, &c->btree_write_error_work);
+               queue_work(c->btree_error_wq, &c->btree_write_error_work);
                return;
        }
 
@@ -1371,7 +1647,7 @@ static void btree_node_write_endio(struct bio *bio)
                        container_of(orig, struct btree_write_bio, wbio);
 
                INIT_WORK(&wb->work, btree_node_write_work);
-               queue_work(system_unbound_wq, &wb->work);
+               queue_work(c->io_complete_wq, &wb->work);
        }
 }
 
@@ -1441,6 +1717,10 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
                        return;
 
                if (old & (1 << BTREE_NODE_write_in_flight)) {
+                       /*
+                        * XXX waiting on btree writes with btree locks held -
+                        * this can deadlock, and we hit the write error path
+                        */
                        btree_node_wait_on_io(b);
                        continue;
                }
@@ -1631,7 +1911,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
        atomic64_add(sectors_to_write, &c->btree_writes_sectors);
 
        INIT_WORK(&wbio->work, btree_write_submit);
-       schedule_work(&wbio->work);
+       queue_work(c->io_complete_wq, &wbio->work);
        return;
 err:
        set_btree_node_noevict(b);
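
For context on the verify_all_btree_replicas knob wired in above: BCH_DEBUG_PARAM entries expand, roughly, to a global boolean plus a module parameter, which is what lets the all-replicas read path be toggled at runtime. A sketch of the assumed expansion (the actual macro lives elsewhere in the tree and may differ in detail):

    bool bch2_verify_all_btree_replicas;
    module_param_named(verify_all_btree_replicas,
                       bch2_verify_all_btree_replicas, bool, 0644);
    MODULE_PARM_DESC(verify_all_btree_replicas,
                     "When reading btree nodes, read all replicas and compare them");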
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index cadcf7f886d73759167ce8f177e0e55723ebf9a5..abbc4675964ab4d44d4639ac8d6f27de854bd257 100644
@@ -13,6 +13,7 @@ struct bch_fs;
 struct btree_write;
 struct btree;
 struct btree_iter;
+struct btree_node_read_all;
 
 static inline bool btree_node_dirty(struct btree *b)
 {
@@ -33,8 +34,11 @@ static inline void clear_btree_node_dirty(struct bch_fs *c, struct btree *b)
 
 struct btree_read_bio {
        struct bch_fs           *c;
+       struct btree            *b;
+       struct btree_node_read_all *ra;
        u64                     start_time;
        unsigned                have_ioref:1;
+       unsigned                idx:7;
        struct extent_ptr_decoded       pick;
        struct work_struct      work;
        struct bio              bio;
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 7f86a39b5e6014eb9bddeb29963c769e1a228467..bdb068e9d2636fa2b05f70deddff34d47a49c96f 100644
@@ -2260,6 +2260,7 @@ static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c)
 void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
                     unsigned expected_nr_iters,
                     size_t expected_mem_bytes)
+       __acquires(&c->btree_trans_barrier)
 {
        memset(trans, 0, sizeof(*trans));
        trans->c                = c;
@@ -2292,6 +2293,7 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
 }
 
 int bch2_trans_exit(struct btree_trans *trans)
+       __releases(&c->btree_trans_barrier)
 {
        struct bch_fs *c = trans->c;
 
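
The __acquires()/__releases() annotations added to bch2_trans_init() and bch2_trans_exit() are sparse context markers: they declare that a transaction holds c->btree_trans_barrier (an SRCU read lock) from init to exit, so the static checker can flag unbalanced paths. They compile away outside sparse; the standard kernel definitions are roughly:

    #ifdef __CHECKER__
    # define __acquires(x) __attribute__((context(x, 0, 1)))
    # define __releases(x) __attribute__((context(x, 1, 0)))
    #else
    # define __acquires(x)
    # define __releases(x)
    #endif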
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index bee7ee690cf15b3264ed7d6be214deacc23121a7..b0484c7acb79b8f7c0b19aca16732411328ef562 100644
@@ -550,6 +550,22 @@ static void btree_update_nodes_written(struct btree_update *as)
 
        BUG_ON(!journal_pin_active(&as->journal));
 
+       /*
+        * Wait for any in flight writes to finish before we free the old nodes
+        * on disk:
+        */
+       for (i = 0; i < as->nr_old_nodes; i++) {
+               struct btree_node *bn = READ_ONCE(as->old_nodes[i]->data);
+
+               /*
+                * This is technically a use after free, but it's just a read -
+                * but it might cause problems in userspace where freeing the
+                * buffer may unmap it:
+                */
+               if (bn && bn->keys.seq == as->old_nodes_seq[i])
+                       btree_node_wait_on_io(as->old_nodes[i]);
+       }
+
        /*
         * We did an update to a parent node where the pointers we added pointed
         * to child nodes that weren't written yet: now, the child nodes have
@@ -889,13 +905,9 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
 
        btree_update_will_delete_key(as, &b->key);
 
-       /*
-        * XXX: Waiting on io with btree node locks held, we don't want to be
-        * doing this. We can't have btree writes happening after the space has
-        * been freed, but we really only need to block before
-        * btree_update_nodes_written_trans() happens.
-        */
-       btree_node_wait_on_io(b);
+       as->old_nodes[as->nr_old_nodes] = b;
+       as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
+       as->nr_old_nodes++;
 }
 
 void bch2_btree_update_done(struct btree_update *as)
@@ -908,7 +920,8 @@ void bch2_btree_update_done(struct btree_update *as)
 
        bch2_btree_reserve_put(as);
 
-       continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq);
+       continue_at(&as->cl, btree_update_set_nodes_written,
+                   as->c->btree_interior_update_worker);
 }
 
 struct btree_update *
@@ -1826,7 +1839,10 @@ void async_btree_node_rewrite_work(struct work_struct *work)
 
 void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
 {
-       struct async_btree_rewrite *a = kmalloc(sizeof(*a), GFP_NOFS);
+       struct async_btree_rewrite *a;
+
+       if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags))
+               return;
 
        if (!percpu_ref_tryget(&c->writes))
                return;
@@ -1844,7 +1860,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
        a->seq          = b->data->keys.seq;
 
        INIT_WORK(&a->work, async_btree_node_rewrite_work);
-       queue_work(system_long_wq, &a->work);
+       queue_work(c->btree_interior_update_worker, &a->work);
 }
 
 static void __bch2_btree_node_update_key(struct bch_fs *c,
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index 7eef3dbb6ef178ee0c9082b6699ffe36469fb35e..7ed67b47e1b9377bf960c15de653074e3a786c74 100644
@@ -92,6 +92,10 @@ struct btree_update {
        struct btree                    *new_nodes[BTREE_UPDATE_NODES_MAX];
        unsigned                        nr_new_nodes;
 
+       struct btree                    *old_nodes[BTREE_UPDATE_NODES_MAX];
+       __le64                          old_nodes_seq[BTREE_UPDATE_NODES_MAX];
+       unsigned                        nr_old_nodes;
+
        open_bucket_idx_t               open_buckets[BTREE_UPDATE_NODES_MAX *
                                                     BCH_REPLICAS_MAX];
        open_bucket_idx_t               nr_open_buckets;
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index cbd295e494bd6c4d55da2897f037cffff2a9175b..d07085a2fd1b23bfe1af4d1b4534d25e7cf3191d 100644
@@ -14,6 +14,7 @@
 #include "ec.h"
 #include "error.h"
 #include "movinggc.h"
+#include "reflink.h"
 #include "replicas.h"
 
 #include <linux/preempt.h>
@@ -1072,6 +1073,124 @@ static int bch2_mark_stripe(struct bch_fs *c,
        return 0;
 }
 
+static int __reflink_p_frag_references(struct bkey_s_c_reflink_p p,
+                                      u64 p_start, u64 p_end,
+                                      u64 v_start, u64 v_end)
+{
+       if (p_start == p_end)
+               return false;
+
+       p_start += le64_to_cpu(p.v->idx);
+       p_end   += le64_to_cpu(p.v->idx);
+
+       if (p_end <= v_start)
+               return false;
+       if (p_start >= v_end)
+               return false;
+       return true;
+}
+
+static int reflink_p_frag_references(struct bkey_s_c_reflink_p p,
+                                    u64 start, u64 end,
+                                    struct bkey_s_c k)
+{
+       return __reflink_p_frag_references(p, start, end,
+                                          bkey_start_offset(k.k),
+                                          k.k->p.offset);
+}
+
+static int __bch2_mark_reflink_p(struct bch_fs *c,
+                       struct bkey_s_c_reflink_p p,
+                       u64 idx, unsigned sectors,
+                       unsigned front_frag,
+                       unsigned back_frag,
+                       unsigned flags,
+                       size_t *r_idx)
+{
+       struct reflink_gc *r;
+       int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+       int frags_referenced;
+
+       while (1) {
+               if (*r_idx >= c->reflink_gc_nr)
+                       goto not_found;
+               r = genradix_ptr(&c->reflink_gc_table, *r_idx);
+               BUG_ON(!r);
+
+               if (r->offset > idx)
+                       break;
+               (*r_idx)++;
+       }
+
+       frags_referenced =
+               __reflink_p_frag_references(p, 0, front_frag,
+                                           r->offset - r->size, r->offset) +
+               __reflink_p_frag_references(p, back_frag, p.k->size,
+                                           r->offset - r->size, r->offset);
+
+       if (frags_referenced == 2) {
+               BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE_SPLIT));
+               add = -add;
+       } else if (frags_referenced == 1) {
+               BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE));
+               add = 0;
+       }
+
+       BUG_ON((s64) r->refcount + add < 0);
+
+       r->refcount += add;
+       return min_t(u64, sectors, r->offset - idx);
+not_found:
+       bch2_fs_inconsistent(c,
+               "%llu:%llu len %u points to nonexistent indirect extent %llu",
+               p.k->p.inode, p.k->p.offset, p.k->size, idx);
+       bch2_inconsistent_error(c);
+       return -EIO;
+}
+
+static int bch2_mark_reflink_p(struct bch_fs *c,
+                              struct bkey_s_c_reflink_p p, unsigned offset,
+                              s64 sectors, unsigned flags)
+{
+       u64 idx = le64_to_cpu(p.v->idx) + offset;
+       struct reflink_gc *ref;
+       size_t l, r, m;
+       unsigned front_frag, back_frag;
+       s64 ret = 0;
+
+       if (sectors < 0)
+               sectors = -sectors;
+
+       BUG_ON(offset + sectors > p.k->size);
+
+       front_frag = offset;
+       back_frag = offset + sectors;
+
+       l = 0;
+       r = c->reflink_gc_nr;
+       while (l < r) {
+               m = l + (r - l) / 2;
+
+               ref = genradix_ptr(&c->reflink_gc_table, m);
+               if (ref->offset <= idx)
+                       l = m + 1;
+               else
+                       r = m;
+       }
+
+       while (sectors) {
+               ret = __bch2_mark_reflink_p(c, p, idx, sectors,
+                               front_frag, back_frag, flags, &l);
+               if (ret < 0)
+                       return ret;
+
+               idx     += ret;
+               sectors -= ret;
+       }
+
+       return 0;
+}
+
 static int bch2_mark_key_locked(struct bch_fs *c,
                   struct bkey_s_c old,
                   struct bkey_s_c new,
@@ -1127,6 +1246,10 @@ static int bch2_mark_key_locked(struct bch_fs *c,
                fs_usage->persistent_reserved[replicas - 1]     += sectors;
                break;
        }
+       case KEY_TYPE_reflink_p:
+               ret = bch2_mark_reflink_p(c, bkey_s_c_to_reflink_p(k),
+                                         offset, sectors, flags);
+               break;
        }
 
        preempt_enable();
@@ -1689,35 +1812,6 @@ static int bch2_trans_mark_stripe(struct btree_trans *trans,
        return ret;
 }
 
-static __le64 *bkey_refcount(struct bkey_i *k)
-{
-       switch (k->k.type) {
-       case KEY_TYPE_reflink_v:
-               return &bkey_i_to_reflink_v(k)->v.refcount;
-       case KEY_TYPE_indirect_inline_data:
-               return &bkey_i_to_indirect_inline_data(k)->v.refcount;
-       default:
-               return NULL;
-       }
-}
-
-static bool reflink_p_frag_references(struct bkey_s_c_reflink_p p,
-                                     u64 start, u64 end,
-                                     struct bkey_s_c k)
-{
-       if (start == end)
-               return false;
-
-       start   += le64_to_cpu(p.v->idx);
-       end     += le64_to_cpu(p.v->idx);
-
-       if (end <= bkey_start_offset(k.k))
-               return false;
-       if (start >= k.k->p.offset)
-               return false;
-       return true;
-}
-
 static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
                        struct bkey_s_c_reflink_p p,
                        u64 idx, unsigned sectors,
@@ -1731,6 +1825,7 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
        struct bkey_i *n;
        __le64 *refcount;
        int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+       int frags_referenced;
        s64 ret;
 
        ret = trans_get_key(trans, BTREE_ID_reflink,
@@ -1738,18 +1833,20 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
        if (ret < 0)
                return ret;
 
-       if (reflink_p_frag_references(p, 0, front_frag, k) &&
-           reflink_p_frag_references(p, back_frag, p.k->size, k)) {
+       sectors = min_t(u64, sectors, k.k->p.offset - idx);
+
+       frags_referenced =
+               reflink_p_frag_references(p, 0, front_frag, k) +
+               reflink_p_frag_references(p, back_frag, p.k->size, k);
+
+       if (frags_referenced == 2) {
                BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE_SPLIT));
                add = -add;
-       } else if (reflink_p_frag_references(p, 0, front_frag, k) ||
-                  reflink_p_frag_references(p, back_frag, p.k->size, k)) {
+       } else if (frags_referenced == 1) {
                BUG_ON(!(flags & BTREE_TRIGGER_OVERWRITE));
                goto out;
        }
 
-       sectors = min_t(u64, sectors, k.k->p.offset - idx);
-
        n = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
        ret = PTR_ERR_OR_ZERO(n);
        if (ret)
@@ -1804,14 +1901,13 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
                ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors,
                                        front_frag, back_frag, flags);
                if (ret < 0)
-                       break;
+                       return ret;
 
-               idx += ret;
-               sectors = max_t(s64, 0LL, sectors - ret);
-               ret = 0;
+               idx     += ret;
+               sectors -= ret;
        }
 
-       return ret;
+       return 0;
 }
 
 int bch2_trans_mark_key(struct btree_trans *trans,
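
The l/r loop added in bch2_mark_reflink_p() above is a lower-bound binary search over reflink_gc_table, which bch2_gc_reflink_start() filled in offset order: it finds the first entry whose end offset exceeds idx, and __bch2_mark_reflink_p() then advances one entry per call. The same search in generic form (a sketch; the real code indexes the genradix, and r->offset is the extent's end position):

    /* first index i with ends[i] > idx, or n if there is none */
    static size_t reflink_lower_bound_sketch(const u64 *ends, size_t n, u64 idx)
    {
            size_t l = 0, r = n;

            while (l < r) {
                    size_t m = l + (r - l) / 2;

                    if (ends[m] <= idx)
                            l = m + 1;
                    else
                            r = m;
            }
            return l;
    }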
diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c
index 08c6af886df7b651c8346e855ba923655838b714..00a63fecb976e6da858c1adc64de69d40fcf3710 100644
@@ -23,6 +23,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
        struct btree_iter *inode_iter = NULL;
        struct bch_hash_info hash = bch2_hash_info_init(c, new_inode);
        u64 now = bch2_current_time(c);
+       u64 cpu = raw_smp_processor_id();
        u64 dir_offset = 0;
        int ret;
 
@@ -36,7 +37,7 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum,
        if (!name)
                new_inode->bi_flags |= BCH_INODE_UNLINKED;
 
-       inode_iter = bch2_inode_create(trans, new_inode, U32_MAX);
+       inode_iter = bch2_inode_create(trans, new_inode, U32_MAX, cpu);
        ret = PTR_ERR_OR_ZERO(inode_iter);
        if (ret)
                goto err;
diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c
index eb871634eeae772883292972de80e453dd811c92..d8cc32e043df83d3d22d1ae55aded1160e3585d2 100644
@@ -13,6 +13,9 @@
 #include <linux/mount.h>
 
 #define FS_IOC_GOINGDOWN            _IOR('X', 125, __u32)
+#define FSOP_GOING_FLAGS_DEFAULT       0x0     /* going down */
+#define FSOP_GOING_FLAGS_LOGFLUSH      0x1     /* flush log but not data */
+#define FSOP_GOING_FLAGS_NOLOGFLUSH    0x2     /* don't flush log nor data */
 
 struct flags_set {
        unsigned                mask;
@@ -247,11 +250,54 @@ err1:
        return ret;
 }
 
+static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
+{
+       u32 flags;
+       int ret = 0;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       if (get_user(flags, arg))
+               return -EFAULT;
+
+       bch_notice(c, "shutdown by ioctl type %u", flags);
+
+       down_write(&c->vfs_sb->s_umount);
+
+       switch (flags) {
+       case FSOP_GOING_FLAGS_DEFAULT:
+               ret = freeze_bdev(c->vfs_sb->s_bdev);
+               if (ret)
+                       goto err;
+
+               bch2_journal_flush(&c->journal);
+               c->vfs_sb->s_flags |= SB_RDONLY;
+               bch2_fs_emergency_read_only(c);
+               thaw_bdev(c->vfs_sb->s_bdev);
+               break;
+
+       case FSOP_GOING_FLAGS_LOGFLUSH:
+               bch2_journal_flush(&c->journal);
+               fallthrough;
+
+       case FSOP_GOING_FLAGS_NOLOGFLUSH:
+               c->vfs_sb->s_flags |= SB_RDONLY;
+               bch2_fs_emergency_read_only(c);
+               break;
+       default:
+               ret = -EINVAL;
+               break;
+       }
+err:
+       up_write(&c->vfs_sb->s_umount);
+       return ret;
+}
+
 long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
        struct bch_inode_info *inode = file_bch_inode(file);
-       struct super_block *sb = inode->v.i_sb;
-       struct bch_fs *c = sb->s_fs_info;
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
 
        switch (cmd) {
        case FS_IOC_GETFLAGS:
@@ -276,15 +322,7 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg)
                return -ENOTTY;
 
        case FS_IOC_GOINGDOWN:
-               if (!capable(CAP_SYS_ADMIN))
-                       return -EPERM;
-
-               down_write(&sb->s_umount);
-               sb->s_flags |= SB_RDONLY;
-               if (bch2_fs_emergency_read_only(c))
-                       bch_err(c, "emergency read only due to ioctl");
-               up_write(&sb->s_umount);
-               return 0;
+               return bch2_ioc_goingdown(c, (u32 __user *) arg);
 
        default:
                return bch2_fs_ioctl(c, cmd, (void __user *) arg);
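
The expanded FS_IOC_GOINGDOWN handler mirrors the XFS goingdown ABI (_IOR('X', 125, __u32)) and its three flag values. A hedged userspace sketch of invoking it — constants copied from the patch above, error handling minimal:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/types.h>

    #define FS_IOC_GOINGDOWN            _IOR('X', 125, __u32)
    #define FSOP_GOING_FLAGS_DEFAULT    0x0  /* freeze, flush, read-only */
    #define FSOP_GOING_FLAGS_LOGFLUSH   0x1  /* flush journal, not data */
    #define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2  /* flush nothing */

    int main(int argc, char **argv)
    {
            __u32 flags = FSOP_GOING_FLAGS_DEFAULT;
            int fd = open(argv[1], O_RDONLY);

            if (fd < 0 || ioctl(fd, FS_IOC_GOINGDOWN, &flags)) {
                    perror("goingdown");
                    return 1;
            }
            close(fd);
            return 0;
    }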
diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c
index 25a9fc14fd0a3e53e02212f26c9c9152356ac2c9..e8a329c9561fd514cf185abb0e80744e99e1afdd 100644
@@ -1578,6 +1578,8 @@ got_sb:
                break;
        }
 
+       c->dev = sb->s_dev;
+
 #ifdef CONFIG_BCACHEFS_POSIX_ACL
        if (c->opts.acl)
                sb->s_flags     |= SB_POSIXACL;
diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c
index c5892e42aaec24718e902018d5047283d33dfabb..6b43a9716cf0bc867601471e3be7e6d7a371ae0b 100644
@@ -472,23 +472,28 @@ static inline u32 bkey_generation(struct bkey_s_c k)
 
 struct btree_iter *bch2_inode_create(struct btree_trans *trans,
                                     struct bch_inode_unpacked *inode_u,
-                                    u32 snapshot)
+                                    u32 snapshot, u64 cpu)
 {
        struct bch_fs *c = trans->c;
        struct btree_iter *iter = NULL;
        struct bkey_s_c k;
        u64 min, max, start, pos, *hint;
        int ret = 0;
+       unsigned bits = (c->opts.inodes_32bit ? 31 : 63);
 
-       u64 cpu = raw_smp_processor_id();
-       unsigned bits = (c->opts.inodes_32bit
-               ? 31 : 63) - c->inode_shard_bits;
+       if (c->opts.shard_inode_numbers) {
+               bits -= c->inode_shard_bits;
 
-       min = (cpu << bits);
-       max = (cpu << bits) | ~(ULLONG_MAX << bits);
+               min = (cpu << bits);
+               max = (cpu << bits) | ~(ULLONG_MAX << bits);
 
-       min = max_t(u64, min, BLOCKDEV_INODE_MAX);
-       hint = c->unused_inode_hints + cpu;
+               min = max_t(u64, min, BLOCKDEV_INODE_MAX);
+               hint = c->unused_inode_hints + cpu;
+       } else {
+               min = BLOCKDEV_INODE_MAX;
+               max = ~(ULLONG_MAX << bits);
+               hint = c->unused_inode_hints;
+       }
 
        start = READ_ONCE(*hint);
 
index 558d5464095d2506daf02bd614fab45f86b556a3..2cb081ae44d92f1c0c589528836397f4a9e967f9 100644 (file)
@@ -70,7 +70,7 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *,
                     struct bch_inode_unpacked *);
 
 struct btree_iter *bch2_inode_create(struct btree_trans *,
-                                    struct bch_inode_unpacked *, u32);
+                                    struct bch_inode_unpacked *, u32, u64);
 
 int bch2_inode_rm(struct bch_fs *, u64, bool);
 
index 9b6aece794f2c9cf3ec74fe633a69ac4a385696f..157b2a0fc58240a8ada5e0b2c15bc052ee736f25 100644 (file)
@@ -1439,7 +1439,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
        bch2_migrate_read_done(&op->write, rbio);
 
        closure_init(cl, NULL);
-       closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
+       closure_call(&op->write.op.cl, bch2_write, c->btree_update_wq, cl);
        closure_return_with_destructor(cl, promote_done);
 }
 
@@ -1822,6 +1822,13 @@ static void __bch2_read_endio(struct work_struct *work)
        if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
                goto csum_err;
 
+       /*
+        * XXX
+        * We need to rework the narrow_crcs path to deliver the read completion
+        * first, and then punt to a different workqueue, otherwise we're
+        * holding up reads while doing btree updates, which is bad for
+        * memory reclaim.
+        */
        if (unlikely(rbio->narrow_crcs))
                bch2_rbio_narrow_crcs(rbio);
 
index 144dc9346c026c36371d483d6aac02b37a877639..bc0a0bd6f849438a82474c7e3ce2b331f6950be7 100644 (file)
@@ -58,7 +58,7 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
 {
        return op->alloc_reserve == RESERVE_MOVINGGC
                ? op->c->copygc_wq
-               : op->c->wq;
+               : op->c->btree_update_wq;
 }
 
 int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
index 52efa463d9f78ef09ff4515a3acdbcecc4da6ed2..af5386d959c879cdc49e86f9d12e41c56a661588 100644 (file)
@@ -118,7 +118,9 @@ void bch2_journal_halt(struct journal *j)
 
 void __bch2_journal_buf_put(struct journal *j)
 {
-       closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
+       closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
 }
 
 /*
@@ -304,7 +306,7 @@ static int journal_entry_open(struct journal *j)
                                       j->res_get_blocked_start);
        j->res_get_blocked_start = 0;
 
-       mod_delayed_work(system_freezable_wq,
+       mod_delayed_work(c->io_complete_wq,
                         &j->write_work,
                         msecs_to_jiffies(j->write_delay_ms));
        journal_wake(j);
@@ -805,10 +807,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                long b;
 
                if (new_fs) {
-                       percpu_down_read(&c->mark_lock);
                        b = bch2_bucket_alloc_new_fs(ca);
                        if (b < 0) {
-                               percpu_up_read(&c->mark_lock);
                                ret = -ENOSPC;
                                goto err;
                        }
@@ -825,7 +825,8 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                        b = sector_to_bucket(ca, ob->ptr.offset);
                }
 
-               spin_lock(&c->journal.lock);
+               if (c)
+                       spin_lock(&c->journal.lock);
 
                /*
                 * XXX
@@ -852,14 +853,14 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                if (pos <= ja->cur_idx)
                        ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
 
-               spin_unlock(&c->journal.lock);
+               if (c)
+                       spin_unlock(&c->journal.lock);
 
                if (new_fs) {
                        bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal,
                                                  ca->mi.bucket_size,
                                                  gc_phase(GC_PHASE_SB),
                                                  0);
-                       percpu_up_read(&c->mark_lock);
                } else {
                        ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL,
                                bch2_trans_mark_metadata_bucket(&trans, ca,
index 635cceb4dd21ddd8ddd55b7130e0f325eaa057d9..2da6839fcdc0cda49a9e6b48680af6b0d55d314d 100644 (file)
@@ -834,7 +834,7 @@ static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
        unsigned i;
 
        for (i = 0; i < j->nr_ptrs; i++) {
-               struct bch_dev *ca = c->devs[j->ptrs[i].dev];
+               struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev);
                u64 offset;
 
                div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset);
@@ -1233,8 +1233,6 @@ static void journal_write_done(struct closure *cl)
        struct journal *j = container_of(cl, struct journal, io);
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_buf *w = journal_last_unwritten_buf(j);
-       struct bch_devs_list devs =
-               bch2_bkey_devs(bkey_i_to_s_c(&w->key));
        struct bch_replicas_padded replicas;
        union journal_res_state old, new;
        u64 v, seq;
@@ -1242,11 +1240,12 @@ static void journal_write_done(struct closure *cl)
 
        bch2_time_stats_update(j->write_time, j->write_start_time);
 
-       if (!devs.nr) {
+       if (!w->devs_written.nr) {
                bch_err(c, "unable to write journal to sufficient devices");
                err = -EIO;
        } else {
-               bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs);
+               bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
+                                        w->devs_written);
                if (bch2_mark_replicas(c, &replicas.e))
                        err = -EIO;
        }
@@ -1258,7 +1257,7 @@ static void journal_write_done(struct closure *cl)
        seq = le64_to_cpu(w->data->seq);
 
        if (seq >= j->pin.front)
-               journal_seq_pin(j, seq)->devs = devs;
+               journal_seq_pin(j, seq)->devs = w->devs_written;
 
        j->seq_ondisk           = seq;
        if (err && (!j->err_seq || seq < j->err_seq))
@@ -1296,27 +1295,27 @@ static void journal_write_done(struct closure *cl)
        journal_wake(j);
 
        if (test_bit(JOURNAL_NEED_WRITE, &j->flags))
-               mod_delayed_work(system_freezable_wq, &j->write_work, 0);
+               mod_delayed_work(c->io_complete_wq, &j->write_work, 0);
        spin_unlock(&j->lock);
 
        if (new.unwritten_idx != new.idx &&
            !journal_state_count(new, new.unwritten_idx))
-               closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL);
+               closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
 }
 
 static void journal_write_endio(struct bio *bio)
 {
        struct bch_dev *ca = bio->bi_private;
        struct journal *j = &ca->fs->journal;
+       struct journal_buf *w = journal_last_unwritten_buf(j);
+       unsigned long flags;
 
-       if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write error: %s",
+       if (bch2_dev_io_err_on(bio->bi_status, ca, "error writing journal entry %llu: %s",
+                              le64_to_cpu(w->data->seq),
                               bch2_blk_status_to_str(bio->bi_status)) ||
            bch2_meta_write_fault("journal")) {
-               struct journal_buf *w = journal_last_unwritten_buf(j);
-               unsigned long flags;
-
                spin_lock_irqsave(&j->err_lock, flags);
-               bch2_bkey_drop_device(bkey_i_to_s(&w->key), ca->dev_idx);
+               bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
                spin_unlock_irqrestore(&j->err_lock, flags);
        }
 
@@ -1370,7 +1369,7 @@ static void do_journal_write(struct closure *cl)
                        le64_to_cpu(w->data->seq);
        }
 
-       continue_at(cl, journal_write_done, system_highpri_wq);
+       continue_at(cl, journal_write_done, c->io_complete_wq);
        return;
 }
 
@@ -1402,7 +1401,8 @@ void bch2_journal_write(struct closure *cl)
            test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)) {
                w->noflush = true;
                SET_JSET_NO_FLUSH(jset, true);
-               jset->last_seq = w->last_seq = 0;
+               jset->last_seq  = 0;
+               w->last_seq     = 0;
 
                j->nr_noflush_writes++;
        } else {
@@ -1509,14 +1509,12 @@ retry_alloc:
                        journal_debug_buf);
                kfree(journal_debug_buf);
                bch2_fatal_error(c);
-               continue_at(cl, journal_write_done, system_highpri_wq);
+               continue_at(cl, journal_write_done, c->io_complete_wq);
                return;
        }
 
-       /*
-        * XXX: we really should just disable the entire journal in nochanges
-        * mode
-        */
+       w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
+
        if (c->opts.nochanges)
                goto no_io;
 
@@ -1542,14 +1540,14 @@ retry_alloc:
 
        bch2_bucket_seq_cleanup(c);
 
-       continue_at(cl, do_journal_write, system_highpri_wq);
+       continue_at(cl, do_journal_write, c->io_complete_wq);
        return;
 no_io:
        bch2_bucket_seq_cleanup(c);
 
-       continue_at(cl, journal_write_done, system_highpri_wq);
+       continue_at(cl, journal_write_done, c->io_complete_wq);
        return;
 err:
        bch2_inconsistent_error(c);
-       continue_at(cl, journal_write_done, system_highpri_wq);
+       continue_at(cl, journal_write_done, c->io_complete_wq);
 }
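
The devs_written change above stops editing the journal key in place after IO errors; the key is left alone once submitted, and a dedicated list on the journal buffer tracks completions instead. The lifecycle, as wired up in this file:

    /* submit:     w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
     * IO error:   bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
     * completion: !w->devs_written.nr ? -EIO
     *                                 : bch2_devlist_to_replicas(...)
     *
     * so replicas accounting reflects where the entry actually landed,
     * and in nochanges mode, where no bios are issued, completion still
     * sees the intended device list.
     */
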
index 427be2da1dfccaadfc3c15081af9e23af1375540..7a0ae5d3431c0aa29e7e075e6628b64365a4916f 100644 (file)
@@ -93,6 +93,10 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca,
         * until we write it out - thus, account for it here:
         */
        while ((unwritten = get_unwritten_sectors(j, &idx))) {
+               /* entry won't fit on this device, skip: */
+               if (unwritten > ca->mi.bucket_size)
+                       continue;
+
                if (unwritten >= sectors) {
                        if (!buckets) {
                                sectors = 0;
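
This hunk is the journal space calculation fix named in the commit message: an unwritten entry larger than the device's bucket size can never be written to that device at all, so charging it against the device's remaining space makes the device look fuller than it really is. A simplified standalone model of the idea, not the kernel's exact bookkeeping:

    /* Charge unwritten journal entries against one device's buckets,
     * skipping entries that could never fit in a single bucket there. */
    static unsigned buckets_needed(const unsigned *entries, unsigned nr,
                                   unsigned bucket_size)
    {
            unsigned buckets = 0, free = 0;

            for (unsigned i = 0; i < nr; i++) {
                    unsigned e = entries[i];

                    if (e > bucket_size)
                            continue;       /* the fix: can't land here anyway */

                    if (e > free) {
                            buckets++;      /* open a fresh bucket */
                            free = bucket_size;
                    }
                    free -= e;
            }
            return buckets;
    }
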
index e1b63f3879f44e50cc2fdd92ca3de8db03a3c7fa..f2060f903cbcf90489de1712511c2925b69d7198 100644 (file)
@@ -111,8 +111,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
        bl->start[nr].start     = cpu_to_le64(start);
        bl->start[nr].end       = cpu_to_le64(end);
 out_write_sb:
-       c->disk_sb.sb->features[0] |=
-               1ULL << BCH_FEATURE_journal_seq_blacklist_v3;
+       c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3);
 
        ret = bch2_write_super(c);
 out:
@@ -298,8 +297,7 @@ void bch2_blacklist_entries_gc(struct work_struct *work)
                BUG_ON(new_nr && !bl);
 
                if (!new_nr)
-                       c->disk_sb.sb->features[0] &=
-                               ~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3);
+                       c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3));
 
                bch2_write_super(c);
        }
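
The same one-line pattern recurs throughout this commit: superblock feature and compat words are __le64 on disk, so bit operations must wrap the constant in cpu_to_le64(), or a big-endian host flips the wrong on-disk bit. The bug in isolation, as a standalone sketch (the typedef and byteswap stand in for the kernel's):

    #include <stdint.h>

    typedef uint64_t le64;  /* stand-in for the kernel's __le64 */

    static inline le64 cpu_to_le64(uint64_t v)
    {
    #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
            return __builtin_bswap64(v);
    #else
            return v;
    #endif
    }

    static void set_feature(le64 *features, unsigned bit)
    {
            *features |= cpu_to_le64(1ULL << bit);  /* correct everywhere */
            /* *features |= 1ULL << bit;       wrong on big-endian hosts */
    }
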
index cacab22a35c160e3a4c9525b568ae4a50e0ea4b9..61674ae1ab5fee1e3adc5696b8e193968552a4d6 100644 (file)
@@ -21,6 +21,7 @@ struct journal_buf {
        struct jset             *data;
 
        __BKEY_PADDED(key, BCH_REPLICAS_MAX);
+       struct bch_devs_list    devs_written;
 
        struct closure_waitlist wait;
        u64                     last_seq;       /* copy of data->last_seq */
index 778ff72cf5b257200dc831331f7c6f2eaf8e0695..2fa763e35392027aa49a7221ccad620828234281 100644 (file)
@@ -523,6 +523,11 @@ static int lookup_inode(struct btree_trans *trans, struct bpos pos,
        if (ret)
                goto err;
 
+       if (!k.k || bkey_cmp(k.k->p, pos)) {
+               ret = -ENOENT;
+               goto err;
+       }
+
        ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO;
        if (ret)
                goto err;
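
The added check matters because a peek-style btree lookup returns the first key at or after the requested position; without comparing positions, a missing inode is either misreported as -EIO by the type check below or a neighboring key is accepted as the answer. A sketch of the full pattern, assuming a peek-style iterator (the lookup call itself is outside this hunk):

    k = bch2_btree_iter_peek(iter);         /* first key >= pos */
    ret = bkey_err(k);
    if (ret)
            goto err;                       /* restart or IO error */

    if (!k.k || bkey_cmp(k.k->p, pos)) {
            ret = -ENOENT;                  /* absent, not broken */
            goto err;
    }

    ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO;   /* present, wrong type */
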
@@ -921,8 +926,8 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
                              rewrite_old_nodes_pred, c, stats);
        if (!ret) {
                mutex_lock(&c->sb_lock);
-               c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done;
-               c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done;
+               c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
+               c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
                c->disk_sb.sb->version_min = c->disk_sb.sb->version;
                bch2_write_super(c);
                mutex_unlock(&c->sb_lock);
index 61c5901f09802443bd9ab5f3e93385e4f45c3c4f..2acca0ddb6fd64a140fffe7735a3239661425b6f 100644 (file)
@@ -317,6 +317,8 @@ static int bch2_copygc_thread(void *arg)
        set_freezable();
 
        while (!kthread_should_stop()) {
+               cond_resched();
+
                if (kthread_wait_freezable(c->copy_gc_enabled))
                        break;
 
@@ -324,6 +326,7 @@ static int bch2_copygc_thread(void *arg)
                wait = bch2_copygc_wait_amount(c);
 
                if (wait > clock->max_slop) {
+                       trace_copygc_wait(c, wait, last + wait);
                        c->copygc_wait = last + wait;
                        bch2_kthread_io_clock_wait(clock, last + wait,
                                        MAX_SCHEDULE_TIMEOUT);
index 001e865c555560b1ceff0543915f50304fac7574..1e2fc5de5ca41f81bee8ff654e43f4323dca0294 100644 (file)
@@ -165,8 +165,13 @@ enum opt_type {
        x(inodes_32bit,                 u8,                             \
          OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                             \
          OPT_BOOL(),                                                   \
-         BCH_SB_INODE_32BIT,           false,                          \
+         BCH_SB_INODE_32BIT,           true,                           \
          NULL,         "Constrain inode numbers to 32 bits")           \
+       x(shard_inode_numbers,          u8,                             \
+         OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                             \
+         OPT_BOOL(),                                                   \
+         BCH_SB_SHARD_INUMS,           false,                          \
+         NULL,         "Shard new inode numbers by CPU id")            \
        x(gc_reserve_percent,           u8,                             \
          OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                             \
          OPT_UINT(5, 21),                                              \
index cd538ecc1f3f871689520bdffb599955c4915adb..9bd6348842e0733d90b881d03f773115cbaa5b87 100644 (file)
@@ -716,7 +716,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
        case BCH_JSET_ENTRY_dev_usage: {
                struct jset_entry_dev_usage *u =
                        container_of(entry, struct jset_entry_dev_usage, entry);
-               struct bch_dev *ca = bch_dev_bkey_exists(c, u->dev);
+               struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
                unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
                unsigned nr_types = (bytes - sizeof(struct jset_entry_dev_usage)) /
                        sizeof(struct jset_entry_dev_usage_type);
@@ -755,7 +755,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
                struct jset_entry_clock *clock =
                        container_of(entry, struct jset_entry_clock, entry);
 
-               atomic64_set(&c->io_clock[clock->rw].now, clock->time);
+               atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time));
        }
        }
 
@@ -1217,13 +1217,13 @@ use_clean:
 
        mutex_lock(&c->sb_lock);
        if (c->opts.version_upgrade) {
-               c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current);
-               c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL;
+               c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
+               c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
                write_sb = true;
        }
 
        if (!test_bit(BCH_FS_ERROR, &c->flags)) {
-               c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info;
+               c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
                write_sb = true;
        }
 
@@ -1278,12 +1278,12 @@ int bch2_fs_initialize(struct bch_fs *c)
        bch_notice(c, "initializing new filesystem");
 
        mutex_lock(&c->sb_lock);
-       c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_extents_above_btree_updates_done;
-       c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_bformat_overflow_done;
+       c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
+       c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done);
 
        if (c->opts.version_upgrade) {
-               c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current);
-               c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL;
+               c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
+               c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL);
                bch2_write_super(c);
        }
 
index c624fabe1e1cd74380baeeb7aef4f0e41886c041..a420729288d4200a4ba002038e8348412ce69e55 100644 (file)
@@ -151,7 +151,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
 
        set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k));
 
-       refcount        = (void *) &r_v->v;
+       refcount        = bkey_refcount(r_v);
        *refcount       = 0;
        memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k));
 
@@ -181,18 +181,19 @@ err:
 
 static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end)
 {
-       struct bkey_s_c k = bch2_btree_iter_peek(iter);
+       struct bkey_s_c k;
        int ret;
 
        for_each_btree_key_continue(iter, 0, k, ret) {
                if (bkey_cmp(iter->pos, end) >= 0)
-                       return bkey_s_c_null;
+                       break;
 
                if (bkey_extent_is_data(k.k))
-                       break;
+                       return k;
        }
 
-       return k;
+       bch2_btree_iter_set_pos(iter, end);
+       return bkey_s_c_null;
 }
 
 s64 bch2_remap_range(struct bch_fs *c,
@@ -205,8 +206,8 @@ s64 bch2_remap_range(struct bch_fs *c,
        struct bkey_s_c src_k;
        struct bkey_buf new_dst, new_src;
        struct bpos dst_end = dst_start, src_end = src_start;
-       struct bpos dst_want, src_want;
-       u64 src_done, dst_done;
+       struct bpos src_want;
+       u64 dst_done;
        int ret = 0, ret2 = 0;
 
        if (!percpu_ref_tryget(&c->writes))
@@ -226,7 +227,8 @@ s64 bch2_remap_range(struct bch_fs *c,
        dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, dst_start,
                                       BTREE_ITER_INTENT);
 
-       while (ret == 0 || ret == -EINTR) {
+       while ((ret == 0 || ret == -EINTR) &&
+              bkey_cmp(dst_iter->pos, dst_end) < 0) {
                struct disk_reservation disk_res = { 0 };
 
                bch2_trans_begin(&trans);
@@ -236,32 +238,29 @@ s64 bch2_remap_range(struct bch_fs *c,
                        break;
                }
 
+               dst_done = dst_iter->pos.offset - dst_start.offset;
+               src_want = POS(src_start.inode, src_start.offset + dst_done);
+               bch2_btree_iter_set_pos(src_iter, src_want);
+
                src_k = get_next_src(src_iter, src_end);
                ret = bkey_err(src_k);
                if (ret)
                        continue;
 
-               src_done = bpos_min(src_iter->pos, src_end).offset -
-                       src_start.offset;
-               dst_want = POS(dst_start.inode, dst_start.offset + src_done);
-
-               if (bkey_cmp(dst_iter->pos, dst_want) < 0) {
-                       ret = bch2_fpunch_at(&trans, dst_iter, dst_want,
-                                            journal_seq, i_sectors_delta);
+               if (bkey_cmp(src_want, src_iter->pos) < 0) {
+                       ret = bch2_fpunch_at(&trans, dst_iter,
+                                       bpos_min(dst_end,
+                                                POS(dst_iter->pos.inode, dst_iter->pos.offset +
+                                                    src_iter->pos.offset - src_want.offset)),
+                                                journal_seq, i_sectors_delta);
                        continue;
                }
 
-               BUG_ON(bkey_cmp(dst_iter->pos, dst_want));
-
-               if (!bkey_cmp(dst_iter->pos, dst_end))
-                       break;
-
                if (src_k.k->type != KEY_TYPE_reflink_p) {
                        bch2_bkey_buf_reassemble(&new_src, c, src_k);
                        src_k = bkey_i_to_s_c(new_src.k);
 
-                       bch2_cut_front(src_iter->pos,   new_src.k);
-                       bch2_cut_back(src_end,          new_src.k);
+                       bch2_btree_iter_set_pos(src_iter, bkey_start_pos(src_k.k));
 
                        ret = bch2_make_extent_indirect(&trans, src_iter,
                                                new_src.k);
@@ -278,7 +277,7 @@ s64 bch2_remap_range(struct bch_fs *c,
                                bkey_reflink_p_init(new_dst.k);
 
                        u64 offset = le64_to_cpu(src_p.v->idx) +
-                               (src_iter->pos.offset -
+                               (src_want.offset -
                                 bkey_start_offset(src_k.k));
 
                        dst_p->v.idx = cpu_to_le64(offset);
@@ -288,20 +287,13 @@ s64 bch2_remap_range(struct bch_fs *c,
 
                new_dst.k->k.p = dst_iter->pos;
                bch2_key_resize(&new_dst.k->k,
-                               min(src_k.k->p.offset - src_iter->pos.offset,
+                               min(src_k.k->p.offset - src_want.offset,
                                    dst_end.offset - dst_iter->pos.offset));
-
                ret = bch2_extent_update(&trans, dst_iter, new_dst.k,
                                         &disk_res, journal_seq,
                                         new_i_size, i_sectors_delta,
                                         true);
                bch2_disk_reservation_put(c, &disk_res);
-               if (ret)
-                       continue;
-
-               dst_done = dst_iter->pos.offset - dst_start.offset;
-               src_want = POS(src_start.inode, src_start.offset + dst_done);
-               bch2_btree_iter_set_pos(src_iter, src_want);
        }
        bch2_trans_iter_put(&trans, dst_iter);
        bch2_trans_iter_put(&trans, src_iter);
index 9d5e7dc58f2bcf35ab99f5627719ed734b2f6906..bfc785619ee89d17270fa75342c85fe4f2712d54 100644 (file)
@@ -34,6 +34,30 @@ void bch2_indirect_inline_data_to_text(struct printbuf *,
        .val_to_text    = bch2_indirect_inline_data_to_text,    \
 }
 
+static inline const __le64 *bkey_refcount_c(struct bkey_s_c k)
+{
+       switch (k.k->type) {
+       case KEY_TYPE_reflink_v:
+               return &bkey_s_c_to_reflink_v(k).v->refcount;
+       case KEY_TYPE_indirect_inline_data:
+               return &bkey_s_c_to_indirect_inline_data(k).v->refcount;
+       default:
+               return NULL;
+       }
+}
+
+static inline __le64 *bkey_refcount(struct bkey_i *k)
+{
+       switch (k->k.type) {
+       case KEY_TYPE_reflink_v:
+               return &bkey_i_to_reflink_v(k)->v.refcount;
+       case KEY_TYPE_indirect_inline_data:
+               return &bkey_i_to_indirect_inline_data(k)->v.refcount;
+       default:
+               return NULL;
+       }
+}
+
 s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos,
                     u64, u64 *, u64, s64 *);
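
These accessors replace the open-coded cast removed in reflink.c above (refcount = (void *) &r_v->v): both reflink_v and indirect_inline_data values lead with a refcount, and the helpers return NULL for any other key type, forcing callers to handle it. A sketch of the intended caller shape; the helper name here is hypothetical, not a call site from this commit:

    static int refcount_add(struct bkey_i *k, u64 d)
    {
            __le64 *refcount = bkey_refcount(k);

            if (!refcount)
                    return -EINVAL;         /* not a refcounted key type */

            le64_add_cpu(refcount, d);      /* endian-safe in-place add */
            return 0;
    }
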
 
index 74a75ced031e47cb9c901a36b6a4e9874032f712..977885166d55f0eba89e52ed454ab081f4acb362 100644 (file)
@@ -982,7 +982,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
 
        mutex_lock(&c->sb_lock);
        SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
-       c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALWAYS;
+       c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS);
        ret = bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
 
@@ -999,7 +999,7 @@ static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
         * The u64s field counts from the start of data, ignoring the shared
         * fields.
         */
-       entry->u64s = u64s - 1;
+       entry->u64s = cpu_to_le16(u64s - 1);
 
        *end = vstruct_next(*end);
        return entry;
@@ -1092,7 +1092,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
 
                clock->entry.type = BCH_JSET_ENTRY_clock;
                clock->rw       = i;
-               clock->time     = atomic64_read(&c->io_clock[i].now);
+               clock->time     = cpu_to_le64(atomic64_read(&c->io_clock[i].now));
        }
 }
 
@@ -1109,10 +1109,10 @@ void bch2_fs_mark_clean(struct bch_fs *c)
 
        SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
 
-       c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_info;
-       c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_alloc_metadata;
-       c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates);
-       c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled);
+       c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
+       c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata);
+       c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates));
+       c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled));
 
        u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;
 
index 3b1e9203bfcc32a4a623387d4356c9fd56fd05d7..4c6793639fc4a542518d56329e63c091ac579ffa 100644 (file)
@@ -509,10 +509,14 @@ static void __bch2_fs_free(struct bch_fs *c)
        kfree(c->unused_inode_hints);
        free_heap(&c->copygc_heap);
 
+       if (c->io_complete_wq)
+               destroy_workqueue(c->io_complete_wq);
        if (c->copygc_wq)
                destroy_workqueue(c->copygc_wq);
-       if (c->wq)
-               destroy_workqueue(c->wq);
+       if (c->btree_error_wq)
+               destroy_workqueue(c->btree_error_wq);
+       if (c->btree_update_wq)
+               destroy_workqueue(c->btree_update_wq);
 
        bch2_free_super(&c->disk_sb);
        kvpfree(c, sizeof(*c));
@@ -760,10 +764,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 
        c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
 
-       if (!(c->wq = alloc_workqueue("bcachefs",
+       if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
+                               WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
+           !(c->btree_error_wq = alloc_workqueue("bcachefs_error",
                                WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
            !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
                                WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
+           !(c->io_complete_wq = alloc_workqueue("bcachefs_io",
+                               WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) ||
            percpu_ref_init(&c->writes, bch2_writes_disabled,
                            PERCPU_REF_INIT_DEAD, GFP_KERNEL) ||
            mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
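
The old multipurpose c->wq is split three ways here, and journal/IO completion work moves off the shared system workqueues elsewhere in the commit. A likely motivation: the system-wide queues are created without WQ_MEM_RECLAIM, so they have no rescuer thread, while completion work that memory reclaim may itself be waiting on (journal writes, btree updates) needs one to guarantee forward progress. The property in isolation (queue name illustrative):

    #include <linux/workqueue.h>

    static struct workqueue_struct *make_reclaim_safe_wq(void)
    {
            /* A WQ_MEM_RECLAIM queue gets a dedicated rescuer thread, so
             * work queued on it runs even when every kworker is blocked
             * in reclaim -- essential when reclaim may be waiting on
             * that very work, e.g. a journal write completion. */
            return alloc_workqueue("example_io",
                                   WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1);
    }
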
@@ -1437,7 +1445,7 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
 
 /* Device add/removal: */
 
-int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
+static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
 {
        struct btree_trans trans;
        size_t i;
index 21ef7719cf55019be71657f6e99520548cc5fa72..84a7acb04d01b1915f673164ef98dc93011de5de 100644 (file)
@@ -312,7 +312,7 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
        return 0;
 }
 
-void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
+static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c)
 {
        pr_buf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]);
        bch2_bpos_to_text(out, c->gc_gens_pos);