git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to 2cb70a82bc bcachefs: delete some debug code
author Kent Overstreet <kent.overstreet@gmail.com>
Wed, 27 Jun 2018 18:41:51 +0000 (14:41 -0400)
committer Kent Overstreet <kent.overstreet@gmail.com>
Wed, 27 Jun 2018 18:50:43 +0000 (14:50 -0400)
42 files changed:
.bcachefs_revision
include/linux/slab.h
libbcachefs.c
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/btree_cache.c
libbcachefs/btree_cache.h
libbcachefs/btree_gc.c
libbcachefs/btree_gc.h
libbcachefs/btree_io.c
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_locking.h
libbcachefs/btree_update.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_interior.h
libbcachefs/btree_update_leaf.c
libbcachefs/buckets.c
libbcachefs/checksum.h
libbcachefs/dirent.c
libbcachefs/fs-io.c
libbcachefs/fs.c
libbcachefs/journal.c
libbcachefs/journal_io.c
libbcachefs/journal_io.h
libbcachefs/journal_reclaim.c
libbcachefs/journal_reclaim.h
libbcachefs/journal_types.h
libbcachefs/migrate.c
libbcachefs/opts.h
libbcachefs/recovery.c [new file with mode: 0644]
libbcachefs/recovery.h [new file with mode: 0644]
libbcachefs/replicas.c
libbcachefs/str_hash.h
libbcachefs/super-io.c
libbcachefs/super-io.h
libbcachefs/super.c
libbcachefs/sysfs.c
libbcachefs/tests.c [new file with mode: 0644]
libbcachefs/tests.h [new file with mode: 0644]
libbcachefs/util.c
libbcachefs/xattr.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 51df9f0e172feb8a39906da735889156fd6eae84..a8916efbd2e823b87b069b898bff4326ce5aae95 100644 (file)
@@ -1 +1 @@
-9abf628c701ad92670d697624f674cc01d42705e
+2cb70a82bc0ca05d8c3cf666d221badd5724e339
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 9229e7503726f8ba854455da31b3930c0b5d434f..c19f190b1fb0d95cb73fee10ad3697419939df21 100644 (file)
@@ -112,4 +112,14 @@ static inline void *vmap(struct page **pages, unsigned int count,
 
 #define vmalloc_to_page(addr)          ((struct page *) (addr))
 
+static inline void *kmemdup(const void *src, size_t len, gfp_t gfp)
+{
+       void *p;
+
+       p = kmalloc(len, gfp);
+       if (p)
+               memcpy(p, src, len);
+       return p;
+}
+
 #endif /* __TOOLS_LINUX_SLAB_H */
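
This kmemdup() shim mirrors the kernel helper's contract: kmalloc() len bytes and copy src into them, returning NULL on allocation failure. A minimal usage sketch follows; the caller is hypothetical, not taken from this commit:

	/* Hypothetical: duplicate an options string so the caller owns a
	 * mutable copy; NULL means the allocation failed. */
	static char *copy_opts(const char *opts, size_t len, gfp_t gfp)
	{
		return kmemdup(opts, len, gfp);
	}
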
diff --git a/libbcachefs.c b/libbcachefs.c
index 3278645b8a1f5d41f46b3ac39acd0ccc5ff25c43..49790d8979223a43d118dc2831252ed3193a94fb 100644 (file)
@@ -519,6 +519,11 @@ static void bch2_sb_print_disk_groups(struct bch_sb *sb, struct bch_sb_field *f,
 {
 }
 
+static void bch2_sb_print_clean(struct bch_sb *sb, struct bch_sb_field *f,
+                               enum units units)
+{
+}
+
 typedef void (*sb_field_print_fn)(struct bch_sb *, struct bch_sb_field *, enum units);
 
 struct bch_sb_field_toolops {
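
bch2_sb_print_clean() is an empty stub so the new superblock field type gets an entry in the tool's per-field print table. A hedged sketch of the wiring; the table name and .print member are assumptions, since the hunk cuts off before the table itself:

	/* Assumed shape of the ops table entry for the new field type: */
	static const struct bch_sb_field_toolops bch2_sb_field_ops[] = {
		[BCH_SB_FIELD_clean]	= { .print = bch2_sb_print_clean },
	};
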
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 4702b016945e418edc89132b5b0f129ed96c0f77..1482b80a8672655edf2f37e0a42ccbaa176f8fcf 100644 (file)
@@ -259,6 +259,10 @@ do {                                                                       \
                "Reread btree nodes at various points to verify the "   \
                "mergesort in the read path against modifications "     \
                "done in memory")                                       \
+       BCH_DEBUG_PARAM(journal_seq_verify,                             \
+               "Store the journal sequence number in the version "     \
+               "number of every btree key, and verify that btree "     \
+               "update ordering is preserved during recovery")
 
 #define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG()
 
@@ -314,7 +318,13 @@ enum bch_time_stats {
 struct btree;
 
 enum gc_phase {
-       GC_PHASE_SB             = BTREE_ID_NR + 1,
+       GC_PHASE_START,
+       GC_PHASE_SB,
+
+#define DEF_BTREE_ID(kwd, val, name) GC_PHASE_BTREE_##kwd,
+       DEFINE_BCH_BTREE_IDS()
+#undef DEF_BTREE_ID
+
        GC_PHASE_PENDING_DELETE,
        GC_PHASE_ALLOC,
        GC_PHASE_DONE
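
The DEF_BTREE_ID x-macro gives every btree its own GC phase between GC_PHASE_SB and GC_PHASE_PENDING_DELETE, which is what lets gc_pos_btree() (btree_gc.h, below) compute a phase as GC_PHASE_BTREE_EXTENTS + id. Assuming the btree ID list of this era (EXTENTS, INODES, DIRENTS, XATTRS, ALLOC), the enum expands roughly to:

	enum gc_phase {
		GC_PHASE_START,
		GC_PHASE_SB,
		GC_PHASE_BTREE_EXTENTS,	/* DEF_BTREE_ID(EXTENTS, 0, ...) */
		GC_PHASE_BTREE_INODES,
		GC_PHASE_BTREE_DIRENTS,
		GC_PHASE_BTREE_XATTRS,
		GC_PHASE_BTREE_ALLOC,
		GC_PHASE_PENDING_DELETE,
		GC_PHASE_ALLOC,
		GC_PHASE_DONE,
	};
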
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index ab8b944634e87387648884cd679b37ff374e2fbf..b6e7b983bc5bf76297ff9de1d209d204cec5c149 100644 (file)
@@ -426,6 +426,16 @@ enum bch_csum_type {
        BCH_CSUM_NR                     = 7,
 };
 
+static const unsigned bch_crc_bytes[] = {
+       [BCH_CSUM_NONE]                         = 0,
+       [BCH_CSUM_CRC32C_NONZERO]               = 4,
+       [BCH_CSUM_CRC32C]                       = 4,
+       [BCH_CSUM_CRC64_NONZERO]                = 8,
+       [BCH_CSUM_CRC64]                        = 8,
+       [BCH_CSUM_CHACHA20_POLY1305_80]         = 10,
+       [BCH_CSUM_CHACHA20_POLY1305_128]        = 16,
+};
+
 static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type)
 {
        switch (type) {
@@ -783,6 +793,11 @@ struct bch_dirent {
 } __attribute__((packed, aligned(8)));
 BKEY_VAL_TYPE(dirent,          BCH_DIRENT);
 
+#define BCH_NAME_MAX   (U8_MAX * sizeof(u64) -                         \
+                        sizeof(struct bkey) -                          \
+                        offsetof(struct bch_dirent, d_name))
+
+
 /* Xattrs */
 
 enum {
@@ -868,7 +883,8 @@ struct bch_sb_field {
        x(crypt,        2)      \
        x(replicas,     3)      \
        x(quota,        4)      \
-       x(disk_groups,  5)
+       x(disk_groups,  5)      \
+       x(clean,        6)
 
 enum bch_sb_field_type {
 #define x(f, nr)       BCH_SB_FIELD_##f = nr,
@@ -1038,6 +1054,37 @@ struct bch_sb_field_disk_groups {
        struct bch_disk_group   entries[0];
 };
 
+/*
+ * On clean shutdown, store btree roots and current journal sequence number in
+ * the superblock:
+ */
+struct jset_entry {
+       __le16                  u64s;
+       __u8                    btree_id;
+       __u8                    level;
+       __u8                    type; /* designates what this jset holds */
+       __u8                    pad[3];
+
+       union {
+               struct bkey_i   start[0];
+               __u64           _data[0];
+       };
+};
+
+struct bch_sb_field_clean {
+       struct bch_sb_field     field;
+
+       __le32                  flags;
+       __le16                  read_clock;
+       __le16                  write_clock;
+       __le64                  journal_seq;
+
+       union {
+               struct jset_entry start[0];
+               __u64           _data[0];
+       };
+};
+
 /* Superblock: */
 
 /*
@@ -1255,19 +1302,6 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
 #define BCACHE_JSET_VERSION_JKEYS      2
 #define BCACHE_JSET_VERSION            2
 
-struct jset_entry {
-       __le16                  u64s;
-       __u8                    btree_id;
-       __u8                    level;
-       __u8                    type; /* designates what this jset holds */
-       __u8                    pad[3];
-
-       union {
-               struct bkey_i   start[0];
-               __u64           _data[0];
-       };
-};
-
 #define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64))
 
 #define BCH_JSET_ENTRY_TYPES()                 \
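
Moving struct jset_entry ahead of the superblock definitions lets bch_sb_field_clean embed the same variable-length entries the journal uses, so the btree roots saved on clean shutdown can be walked with journal-style code. A hedged sketch of such a walk, assuming the vstruct_next()/vstruct_end() helpers from vstruct.h:

	/* Walk each jset_entry stored in a clean-shutdown superblock
	 * field; vstruct_next() steps by the entry's u64s length field. */
	static void walk_clean_entries(struct bch_sb_field_clean *clean)
	{
		struct jset_entry *entry;

		for (entry = clean->start;
		     entry != vstruct_end(&clean->field);
		     entry = vstruct_next(entry)) {
			/* e.g. btree-root entries carry a bkey_i in
			 * entry->start[0] for btree entry->btree_id */
		}
	}
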
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index c950f2564f25d9e907a564fdf07843b1f3b880a6..b0dc4c8a85cb0edf3360a02934857e403422d5e2 100644 (file)
@@ -649,7 +649,14 @@ struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter,
        struct btree *b;
        struct bset_tree *t;
 
-       /* btree_node_fill() requires parent to be locked: */
+       /*
+        * XXX: locking optimization
+        *
+        * we can make the locking looser here - caller can drop lock on parent
+        * node before locking child node (and potentially blocking): we just
+        * have to have bch2_btree_node_fill() call relock on the parent and
+        * return -EINTR if that fails
+        */
        EBUG_ON(!btree_node_locked(iter, level + 1));
        EBUG_ON(level >= BTREE_MAX_DEPTH);
 retry:
@@ -749,23 +756,22 @@ retry:
 struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
                                          struct btree_iter *iter,
                                          struct btree *b,
+                                         bool may_drop_locks,
                                          enum btree_node_sibling sib)
 {
        struct btree *parent;
        struct btree_node_iter node_iter;
        struct bkey_packed *k;
        BKEY_PADDED(k) tmp;
-       struct btree *ret;
+       struct btree *ret = NULL;
        unsigned level = b->level;
 
        parent = btree_iter_node(iter, level + 1);
        if (!parent)
                return NULL;
 
-       if (!bch2_btree_node_relock(iter, level + 1)) {
-               bch2_btree_iter_set_locks_want(iter, level + 2);
-               return ERR_PTR(-EINTR);
-       }
+       if (!bch2_btree_node_relock(iter, level + 1))
+               goto out_upgrade;
 
        node_iter = iter->l[parent->level].iter;
 
@@ -778,34 +784,66 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
                        : (bch2_btree_node_iter_advance(&node_iter, parent),
                           bch2_btree_node_iter_peek_all(&node_iter, parent));
                if (!k)
-                       return NULL;
+                       goto out;
        } while (bkey_deleted(k));
 
        bch2_bkey_unpack(parent, &tmp.k, k);
 
        ret = bch2_btree_node_get(c, iter, &tmp.k, level, SIX_LOCK_intent);
 
-       if (IS_ERR(ret) && PTR_ERR(ret) == -EINTR) {
-               btree_node_unlock(iter, level);
+       if (PTR_ERR_OR_ZERO(ret) == -EINTR && may_drop_locks) {
+               struct btree_iter *linked;
 
-               if (!bch2_btree_node_relock(iter, level + 1)) {
-                       bch2_btree_iter_set_locks_want(iter, level + 2);
-                       return ERR_PTR(-EINTR);
-               }
+               if (!bch2_btree_node_relock(iter, level + 1))
+                       goto out_upgrade;
 
-               ret = bch2_btree_node_get(c, iter, &tmp.k, level, SIX_LOCK_intent);
-       }
+               /*
+                * We might have got -EINTR because trylock failed, and we're
+                * holding other locks that would cause us to deadlock:
+                */
+               for_each_linked_btree_iter(iter, linked)
+                       if (btree_iter_cmp(iter, linked) < 0)
+                               __bch2_btree_iter_unlock(linked);
+
+               if (sib == btree_prev_sib)
+                       btree_node_unlock(iter, level);
 
-       if (!bch2_btree_node_relock(iter, level)) {
-               btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
+               ret = bch2_btree_node_get(c, iter, &tmp.k, level,
+                                         SIX_LOCK_intent);
 
-               if (!IS_ERR(ret)) {
-                       six_unlock_intent(&ret->lock);
-                       ret = ERR_PTR(-EINTR);
+               /*
+                * before btree_iter_relock() calls btree_iter_verify_locks():
+                */
+               if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
+                       btree_node_unlock(iter, level + 1);
+
+               if (!bch2_btree_node_relock(iter, level)) {
+                       btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
+
+                       if (!IS_ERR(ret)) {
+                               six_unlock_intent(&ret->lock);
+                               ret = ERR_PTR(-EINTR);
+                       }
                }
+
+               bch2_btree_iter_relock(iter);
        }
+out:
+       if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED)
+               btree_node_unlock(iter, level + 1);
+
+       bch2_btree_iter_verify_locks(iter);
+
+       BUG_ON((!may_drop_locks || !IS_ERR(ret)) &&
+              (iter->uptodate >= BTREE_ITER_NEED_RELOCK ||
+               !btree_node_locked(iter, level)));
 
        return ret;
+out_upgrade:
+       if (may_drop_locks)
+               bch2_btree_iter_upgrade(iter, level + 2);
+       ret = ERR_PTR(-EINTR);
+       goto out;
 }
 
 void bch2_btree_node_prefetch(struct bch_fs *c, const struct bkey_i *k,
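
The new may_drop_locks argument lets bch2_btree_node_get_sibling() drop and retake locks on other linked iterators instead of failing outright; callers that can tolerate that pass true and treat -EINTR as "re-traverse and retry". A hypothetical caller sketch, not taken from this commit:

	/* Hypothetical merge-path caller: fetch b's previous sibling,
	 * allowing locks to be dropped.  On success the sibling comes
	 * back intent-locked; -EINTR means re-traverse the iterator. */
	static int get_prev_for_merge(struct bch_fs *c, struct btree_iter *iter,
				      struct btree *b, struct btree **prev)
	{
		struct btree *m = bch2_btree_node_get_sibling(c, iter, b, true,
							      btree_prev_sib);

		if (IS_ERR(m))
			return PTR_ERR(m);
		*prev = m;	/* may be NULL: b has no previous sibling */
		return 0;
	}
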
diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h
index e021d6e9422ad10034a1626d5c251f3cfccd5c39..43109d086479d3775e5fd91947747b434bdda788 100644 (file)
@@ -26,7 +26,7 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
                                  enum six_lock_type);
 
 struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
-                                         struct btree *,
+                                         struct btree *, bool,
                                          enum btree_node_sibling);
 
 void bch2_btree_node_prefetch(struct bch_fs *, const struct bkey_i *,
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 02b14e38ffda9daad7bad2efc98165c6024cb07b..969c1f19414e3c52d5001af30c37e7ab90b8c6b7 100644 (file)
@@ -148,6 +148,9 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type,
                ? BCH_DATA_BTREE : BCH_DATA_USER;
        int ret = 0;
 
+       BUG_ON(journal_seq_verify(c) &&
+              k.k->version.lo > journal_cur_seq(&c->journal));
+
        if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
            fsck_err_on(!bch2_bkey_replicas_marked(c, data_type, k), c,
                        "superblock not marked as containing replicas (type %u)",
@@ -243,6 +246,11 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id)
        unsigned max_stale;
        int ret = 0;
 
+       gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
+
+       if (!c->btree_roots[btree_id].b)
+               return 0;
+
        /*
         * if expensive_debug_checks is on, run range_checks on all leaf nodes:
         */
@@ -454,7 +462,7 @@ static void bch2_gc_start(struct bch_fs *c)
         * Indicates to buckets code that gc is now in progress - done under
         * usage_lock to avoid racing with bch2_mark_key():
         */
-       __gc_pos_set(c, GC_POS_MIN);
+       __gc_pos_set(c, gc_phase(GC_PHASE_START));
 
        /* Save a copy of the existing bucket stats while we recompute them: */
        for_each_member_device(ca, c, i) {
@@ -535,22 +543,18 @@ void bch2_gc(struct bch_fs *c)
 
        bch2_gc_start(c);
 
-       /* Walk btree: */
-       while (c->gc_pos.phase < (int) BTREE_ID_NR) {
-               int ret = c->btree_roots[c->gc_pos.phase].b
-                       ? bch2_gc_btree(c, (int) c->gc_pos.phase)
-                       : 0;
+       bch2_mark_superblocks(c);
 
+       /* Walk btree: */
+       for (i = 0; i < BTREE_ID_NR; i++) {
+               int ret = bch2_gc_btree(c, i);
                if (ret) {
                        bch_err(c, "btree gc failed: %d", ret);
                        set_bit(BCH_FS_GC_FAILURE, &c->flags);
                        goto out;
                }
-
-               gc_pos_set(c, gc_phase(c->gc_pos.phase + 1));
        }
 
-       bch2_mark_superblocks(c);
        bch2_mark_pending_btree_node_frees(c);
        bch2_mark_allocator_buckets(c);
 
@@ -780,13 +784,13 @@ next:
                bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key);
 
        /* Insert the newly coalesced nodes */
-       bch2_btree_insert_node(as, parent, iter, &keylist);
+       bch2_btree_insert_node(as, parent, iter, &keylist, 0);
 
        BUG_ON(!bch2_keylist_empty(&keylist));
 
        BUG_ON(iter->l[old_nodes[0]->level].b != old_nodes[0]);
 
-       BUG_ON(!bch2_btree_iter_node_replace(iter, new_nodes[0]));
+       bch2_btree_iter_node_replace(iter, new_nodes[0]);
 
        for (i = 0; i < nr_new_nodes; i++)
                bch2_btree_open_bucket_put(c, new_nodes[i]);
@@ -1003,6 +1007,8 @@ static int bch2_initial_gc_btree(struct bch_fs *c, enum btree_id id)
 
        btree_node_range_checks_init(&r, 0);
 
+       gc_pos_set(c, gc_pos_btree(id, POS_MIN, 0));
+
        if (!c->btree_roots[id].b)
                return 0;
 
@@ -1041,36 +1047,33 @@ err:
        return bch2_btree_iter_unlock(&iter) ?: ret;
 }
 
-static int __bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
+int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
 {
        unsigned iter = 0;
        enum btree_id id;
-       int ret;
+       int ret = 0;
 
-       mutex_lock(&c->sb_lock);
-       if (!bch2_sb_get_replicas(c->disk_sb.sb)) {
-               if (BCH_SB_INITIALIZED(c->disk_sb.sb))
-                       bch_info(c, "building replicas info");
-               set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
-       }
-       mutex_unlock(&c->sb_lock);
+       down_write(&c->gc_lock);
 again:
        bch2_gc_start(c);
 
+       bch2_mark_superblocks(c);
+
        for (id = 0; id < BTREE_ID_NR; id++) {
                ret = bch2_initial_gc_btree(c, id);
                if (ret)
-                       return ret;
+                       goto err;
        }
 
        ret = bch2_journal_mark(c, journal);
        if (ret)
-               return ret;
+               goto err;
 
        if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) {
                if (iter++ > 2) {
                        bch_info(c, "Unable to fix bucket gens, looping");
-                       return -EINVAL;
+                       ret = -EINVAL;
+                       goto err;
                }
 
                bch_info(c, "Fixed gens, restarting initial mark and sweep:");
@@ -1085,21 +1088,9 @@ again:
        if (c->sb.encryption_type)
                atomic64_add(1 << 16, &c->key_version);
 
-       bch2_mark_superblocks(c);
-
        gc_pos_set(c, gc_phase(GC_PHASE_DONE));
        set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags);
-
-       return 0;
-}
-
-int bch2_initial_gc(struct bch_fs *c, struct list_head *journal)
-{
-       int ret;
-
-       down_write(&c->gc_lock);
-       ret = __bch2_initial_gc(c, journal);
+err:
        up_write(&c->gc_lock);
-
        return ret;
 }
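
Setting the GC position before the empty-root check keeps c->gc_pos advancing monotonically through every phase even for empty btrees, which matters because the marking path compares key positions against it. A hedged sketch of that comparison; the helper name is borrowed from buckets.c and assumed here:

	/* A key only needs double-accounting into the in-progress GC
	 * stats if GC has not yet walked past its position: */
	static bool gc_will_visit(struct bch_fs *c, struct gc_pos pos)
	{
		return gc_pos_cmp(c->gc_pos, pos) < 0;
	}
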
diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h
index 4d1ab9dbe9c85d3e405c7857780c84bf05fdf2f7..214a3fe3aabed2c0ca6d311f5c438046b01b6fbf 100644 (file)
@@ -46,8 +46,6 @@ static inline struct gc_pos gc_phase(enum gc_phase phase)
        };
 }
 
-#define GC_POS_MIN     gc_phase(0)
-
 static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
 {
        if (l.phase != r.phase)
@@ -59,17 +57,23 @@ static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
        return 0;
 }
 
+static inline struct gc_pos gc_pos_btree(enum btree_id id,
+                                        struct bpos pos, unsigned level)
+{
+       return (struct gc_pos) {
+               .phase  = GC_PHASE_BTREE_EXTENTS + id,
+               .pos    = pos,
+               .level  = level,
+       };
+}
+
 /*
  * GC position of the pointers within a btree node: note, _not_ for &b->key
  * itself, that lives in the parent node:
  */
 static inline struct gc_pos gc_pos_btree_node(struct btree *b)
 {
-       return (struct gc_pos) {
-               .phase  = b->btree_id,
-               .pos    = b->key.k.p,
-               .level  = b->level,
-       };
+       return gc_pos_btree(b->btree_id, b->key.k.p, b->level);
 }
 
 /*
@@ -81,11 +85,7 @@ static inline struct gc_pos gc_pos_btree_node(struct btree *b)
  */
 static inline struct gc_pos gc_pos_btree_root(enum btree_id id)
 {
-       return (struct gc_pos) {
-               .phase  = (int) id,
-               .pos    = POS_MAX,
-               .level  = U8_MAX,
-       };
+       return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH);
 }
 
 static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob)
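
With gc_pos_btree() as the single constructor, node and root positions within one btree now share a phase and order correctly: the root uses POS_MAX and BTREE_MAX_DEPTH, so it sorts after every node. A small hypothetical assertion of that invariant, assuming gc_pos_cmp() falls back to pos and then level after the phase check visible above:

	/* A node's GC position never sorts after its btree's root: */
	static inline void check_gc_pos_order(struct btree *b)
	{
		BUG_ON(gc_pos_cmp(gc_pos_btree_node(b),
				  gc_pos_btree_root(b->btree_id)) > 0);
	}
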
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 74ffad4c38f3b6b8db912e6bffb059d082de5002..0c825bcbc45ca1331b41fb487b92561bd727f5ea 100644 (file)
@@ -920,7 +920,7 @@ static int btree_err_msg(struct bch_fs *c, struct btree *b, struct bset *i,
        char *out = buf, *end = buf + len;
 
        out += scnprintf(out, end - out,
-                        "error validating btree node %s "
+                        "error validating btree node %s"
                         "at btree %u level %u/%u\n"
                         "pos %llu:%llu node offset %u",
                         write ? "before write " : "",
@@ -1120,7 +1120,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
 
                        bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
                        btree_err(BTREE_ERR_FIXABLE, c, b, i,
-                                 "invalid bkey:\n%s\n%s", buf, invalid);
+                                 "invalid bkey:\n%s\n%s", invalid, buf);
 
                        i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
                        memmove_u64s_down(k, bkey_next(k),
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 95ee9f615e7da875fd6627c91c1326fc3202af65..682a91434775a5150d4a0d1562b5bb42020e67ec 100644 (file)
@@ -34,11 +34,9 @@ void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
        EBUG_ON(iter->l[b->level].b != b);
        EBUG_ON(iter->lock_seq[b->level] + 1 != b->lock.state.seq);
 
-       for_each_linked_btree_node(iter, b, linked)
+       for_each_btree_iter_with_node(iter, b, linked)
                linked->lock_seq[b->level] += 2;
 
-       iter->lock_seq[b->level] += 2;
-
        six_unlock_write(&b->lock);
 }
 
@@ -48,6 +46,8 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
        struct btree_iter *linked;
        unsigned readers = 0;
 
+       EBUG_ON(btree_node_read_locked(iter, b->level));
+
        for_each_linked_btree_iter(iter, linked)
                if (linked->l[b->level].b == b &&
                    btree_node_read_locked(linked, b->level))
@@ -66,15 +66,51 @@ void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
                     &b->lock.state.counter);
 }
 
-bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
+/*
+ * Lock a btree node if we already have it locked on one of our linked
+ * iterators:
+ */
+static inline bool btree_node_lock_increment(struct btree_iter *iter,
+                                            struct btree *b, unsigned level,
+                                            enum btree_node_locked_type want)
 {
        struct btree_iter *linked;
+
+       for_each_linked_btree_iter(iter, linked)
+               if (linked->l[level].b == b &&
+                   btree_node_locked_type(linked, level) >= want) {
+                       six_lock_increment(&b->lock, want);
+                       return true;
+               }
+
+       return false;
+}
+
+bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
+{
        struct btree *b = iter->l[level].b;
-       int want = btree_lock_want(iter, level);
-       int have = btree_node_locked_type(iter, level);
+       int want = __btree_lock_want(iter, level);
 
-       if (want == have)
-               return true;
+       if (!is_btree_node(iter, level))
+               return false;
+
+       if (race_fault())
+               return false;
+
+       if (!six_relock_type(&b->lock, want, iter->lock_seq[level]) &&
+           !(iter->lock_seq[level] >> 1 == b->lock.state.seq >> 1 &&
+             btree_node_lock_increment(iter, b, level, want)))
+               return false;
+
+       mark_btree_node_locked(iter, level, want);
+       return true;
+}
+
+static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
+{
+       struct btree *b = iter->l[level].b;
+
+       EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);
 
        if (!is_btree_node(iter, level))
                return false;
@@ -82,42 +118,62 @@ bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
        if (race_fault())
                return false;
 
-       if (have != BTREE_NODE_UNLOCKED
-           ? six_trylock_convert(&b->lock, have, want)
-           : six_relock_type(&b->lock, want, iter->lock_seq[level]))
+       if (btree_node_intent_locked(iter, level))
+               return true;
+
+       if (btree_node_locked(iter, level)
+           ? six_lock_tryupgrade(&b->lock)
+           : six_relock_type(&b->lock, SIX_LOCK_intent, iter->lock_seq[level]))
                goto success;
 
-       for_each_linked_btree_iter(iter, linked)
-               if (linked->l[level].b == b &&
-                   btree_node_locked_type(linked, level) == want &&
-                   iter->lock_seq[level] == b->lock.state.seq) {
-                       btree_node_unlock(iter, level);
-                       six_lock_increment(&b->lock, want);
-                       goto success;
-               }
+       if (iter->lock_seq[level] >> 1 == b->lock.state.seq >> 1 &&
+           btree_node_lock_increment(iter, b, level, BTREE_NODE_INTENT_LOCKED)) {
+               btree_node_unlock(iter, level);
+               goto success;
+       }
 
        return false;
 success:
-       mark_btree_node_unlocked(iter, level);
-       mark_btree_node_locked(iter, level, want);
+       mark_btree_node_intent_locked(iter, level);
        return true;
 }
 
-bool bch2_btree_iter_relock(struct btree_iter *iter)
+static inline bool btree_iter_get_locks(struct btree_iter *iter,
+                                       bool upgrade)
 {
-       unsigned l;
+       unsigned l = iter->level;
+       int fail_idx = -1;
 
-       for (l = iter->level;
-            l < max_t(unsigned, iter->locks_want, 1) && iter->l[l].b;
-            l++)
-               if (!bch2_btree_node_relock(iter, l)) {
+       do {
+               if (!btree_iter_node(iter, l))
+                       break;
+
+               if (!(upgrade
+                     ? bch2_btree_node_upgrade(iter, l)
+                     : bch2_btree_node_relock(iter, l))) {
+                       fail_idx = l;
                        btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
-                       return false;
                }
 
+               l++;
+       } while (l < iter->locks_want);
+
+       /*
+        * When we fail to get a lock, we have to ensure that any child nodes
+        * can't be relocked so bch2_btree_iter_traverse has to walk back up to
+        * the node that we failed to relock:
+        */
+       while (fail_idx >= 0) {
+               btree_node_unlock(iter, fail_idx);
+               iter->l[fail_idx].b = BTREE_ITER_NOT_END;
+               --fail_idx;
+       }
+
        if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
                iter->uptodate = BTREE_ITER_NEED_PEEK;
-       return true;
+
+       bch2_btree_iter_verify_locks(iter);
+       return iter->uptodate < BTREE_ITER_NEED_RELOCK;
 }
 
 /* Slowpath: */
@@ -128,6 +184,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
 {
        struct bch_fs *c = iter->c;
        struct btree_iter *linked;
+       bool ret = true;
 
        /* Can't have children locked before ancestors: */
        EBUG_ON(iter->nodes_locked && level > __ffs(iter->nodes_locked));
@@ -140,15 +197,11 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
        EBUG_ON(type == SIX_LOCK_intent &&
                iter->nodes_locked != iter->nodes_intent_locked);
 
-       for_each_linked_btree_iter(iter, linked)
-               if (linked->l[level].b == b &&
-                   btree_node_locked_type(linked, level) == type) {
-                       six_lock_increment(&b->lock, type);
-                       return true;
-               }
+       if (btree_node_lock_increment(iter, b, level, type))
+               return true;
 
        /*
-        * Must lock btree nodes in key order - this case hapens when locking
+        * Must lock btree nodes in key order - this case happens when locking
         * the prev sibling in btree node merging:
         */
        if (iter->nodes_locked &&
@@ -160,6 +213,10 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                if (!linked->nodes_locked)
                        continue;
 
+               /* We have to lock btree nodes in key order: */
+               if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0)
+                       ret = false;
+
                /*
                 * Can't block taking an intent lock if we have _any_ nodes read
                 * locked:
@@ -175,15 +232,12 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                if (type == SIX_LOCK_intent &&
                    linked->nodes_locked != linked->nodes_intent_locked) {
                        linked->locks_want = max_t(unsigned,
-                                                  linked->locks_want,
-                                                  iter->locks_want);
-                       return false;
+                                       linked->locks_want,
+                                       __fls(linked->nodes_locked) + 1);
+                       btree_iter_get_locks(linked, true);
+                       ret = false;
                }
 
-               /* We have to lock btree nodes in key order: */
-               if (__btree_iter_cmp(iter->btree_id, pos, linked) < 0)
-                       return false;
-
                /*
                 * Interior nodes must be locked before their descendants: if
                 * another iterator has possible descendants locked of the node
@@ -194,82 +248,133 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
                        linked->locks_want = max_t(unsigned,
                                                   linked->locks_want,
                                                   iter->locks_want);
-                       return false;
+                       btree_iter_get_locks(linked, true);
+                       ret = false;
                }
        }
 
-       __btree_node_lock_type(c, b, type);
-       return true;
+       if (ret)
+               __btree_node_lock_type(c, b, type);
+       return ret;
 }
 
 /* Btree iterator locking: */
 
-static void btree_iter_drop_extra_locks(struct btree_iter *iter)
+#ifdef CONFIG_BCACHEFS_DEBUG
+void bch2_btree_iter_verify_locks(struct btree_iter *iter)
 {
        unsigned l;
 
-       while (iter->nodes_locked &&
-              (l = __fls(iter->nodes_locked)) > iter->locks_want) {
-               if (l > iter->level) {
-                       btree_node_unlock(iter, l);
-               } else {
-                       if (btree_node_intent_locked(iter, l)) {
-                               six_lock_downgrade(&iter->l[l].b->lock);
-                               iter->nodes_intent_locked ^= 1 << l;
-                       }
-                       break;
-               }
+       if (iter->uptodate == BTREE_ITER_END) {
+               BUG_ON(iter->nodes_locked);
+               return;
+       }
+
+       for (l = 0; btree_iter_node(iter, l); l++) {
+               if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
+                   !btree_node_locked(iter, l))
+                       continue;
+
+               BUG_ON(btree_lock_want(iter, l) !=
+                      btree_node_locked_type(iter, l));
        }
 }
+#endif
+
+__flatten
+static bool __bch2_btree_iter_relock(struct btree_iter *iter)
+{
+       if (iter->uptodate < BTREE_ITER_NEED_RELOCK)
+               return true;
+
+       if (iter->uptodate > BTREE_ITER_NEED_TRAVERSE)
+               return false;
+
+       return btree_iter_get_locks(iter, false);
+}
 
-bool __bch2_btree_iter_set_locks_want(struct btree_iter *iter,
-                                    unsigned new_locks_want)
+bool bch2_btree_iter_relock(struct btree_iter *iter)
 {
        struct btree_iter *linked;
+       bool ret = true;
 
-       /* Drop locks we don't want anymore: */
-       if (new_locks_want < iter->locks_want)
-               for_each_linked_btree_iter(iter, linked)
-                       if (linked->locks_want > new_locks_want) {
-                               linked->locks_want = max_t(unsigned, 1,
-                                                          new_locks_want);
-                               btree_iter_drop_extra_locks(linked);
-                       }
+       for_each_btree_iter(iter, linked)
+               ret &= __bch2_btree_iter_relock(linked);
+
+       return ret;
+}
+
+bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
+                              unsigned new_locks_want)
+{
+       struct btree_iter *linked;
+
+       EBUG_ON(iter->locks_want >= new_locks_want);
 
        iter->locks_want = new_locks_want;
-       btree_iter_drop_extra_locks(iter);
 
-       if (bch2_btree_iter_relock(iter))
+       if (btree_iter_get_locks(iter, true))
                return true;
 
        /*
-        * Just an optimization: ancestor nodes must be locked before child
-        * nodes, so set locks_want on iterators that might lock ancestors
-        * before us to avoid getting -EINTR later:
+        * Ancestor nodes must be locked before child nodes, so set locks_want
+        * on iterators that might lock ancestors before us to avoid getting
+        * -EINTR later:
         */
        for_each_linked_btree_iter(iter, linked)
                if (linked->btree_id == iter->btree_id &&
-                   btree_iter_cmp(linked, iter) <= 0)
-                       linked->locks_want = max_t(unsigned, linked->locks_want,
-                                                  new_locks_want);
+                   btree_iter_cmp(linked, iter) <= 0 &&
+                   linked->locks_want < new_locks_want) {
+                       linked->locks_want = new_locks_want;
+                       btree_iter_get_locks(linked, true);
+               }
+
        return false;
 }
 
-static void __bch2_btree_iter_unlock(struct btree_iter *iter)
+void __bch2_btree_iter_downgrade(struct btree_iter *iter,
+                                unsigned downgrade_to)
 {
-       btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
+       struct btree_iter *linked;
+       unsigned l;
+
+       /*
+        * We downgrade linked iterators as well because btree_iter_upgrade
+        * might have had to modify locks_want on linked iterators due to lock
+        * ordering:
+        */
+       for_each_btree_iter(iter, linked) {
+               unsigned new_locks_want = downgrade_to ?:
+                       (linked->flags & BTREE_ITER_INTENT ? 1 : 0);
+
+               if (linked->locks_want <= new_locks_want)
+                       continue;
 
-       while (iter->nodes_locked)
-               btree_node_unlock(iter, __ffs(iter->nodes_locked));
+               linked->locks_want = new_locks_want;
+
+               while (linked->nodes_locked &&
+                      (l = __fls(linked->nodes_locked)) >= linked->locks_want) {
+                       if (l > linked->level) {
+                               btree_node_unlock(linked, l);
+                       } else {
+                               if (btree_node_intent_locked(linked, l)) {
+                                       six_lock_downgrade(&linked->l[l].b->lock);
+                                       linked->nodes_intent_locked ^= 1 << l;
+                               }
+                               break;
+                       }
+               }
+
+               bch2_btree_iter_verify_locks(linked);
+       }
 }
 
 int bch2_btree_iter_unlock(struct btree_iter *iter)
 {
        struct btree_iter *linked;
 
-       for_each_linked_btree_iter(iter, linked)
+       for_each_btree_iter(iter, linked)
                __bch2_btree_iter_unlock(linked);
-       __bch2_btree_iter_unlock(iter);
 
        return iter->flags & BTREE_ITER_ERROR ? -EIO : 0;
 }
@@ -320,11 +425,8 @@ void bch2_btree_iter_verify(struct btree_iter *iter, struct btree *b)
 {
        struct btree_iter *linked;
 
-       if (iter->l[b->level].b == b)
-               __bch2_btree_iter_verify(iter, b);
-
-       for_each_linked_btree_node(iter, b, linked)
-               __bch2_btree_iter_verify(iter, b);
+       for_each_btree_iter_with_node(iter, b, linked)
+               __bch2_btree_iter_verify(linked, b);
 }
 
 #endif
@@ -456,12 +558,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *iter,
                __bch2_btree_node_iter_fix(iter, b, node_iter, t,
                                          where, clobber_u64s, new_u64s);
 
-       if (iter->l[b->level].b == b)
-               __bch2_btree_node_iter_fix(iter, b,
-                                         &iter->l[b->level].iter, t,
-                                         where, clobber_u64s, new_u64s);
-
-       for_each_linked_btree_node(iter, b, linked)
+       for_each_btree_iter_with_node(iter, b, linked)
                __bch2_btree_node_iter_fix(linked, b,
                                          &linked->l[b->level].iter, t,
                                          where, clobber_u64s, new_u64s);
@@ -613,11 +710,12 @@ static inline void btree_iter_node_set(struct btree_iter *iter,
  * A btree node is being replaced - update the iterator to point to the new
  * node:
  */
-bool bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
+void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
 {
+       enum btree_node_locked_type t;
        struct btree_iter *linked;
 
-       for_each_linked_btree_iter(iter, linked)
+       for_each_btree_iter(iter, linked)
                if (btree_iter_pos_in_node(linked, b)) {
                        /*
                         * bch2_btree_iter_node_drop() has already been called -
@@ -626,52 +724,28 @@ bool bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
                         */
                        BUG_ON(btree_node_locked(linked, b->level));
 
-                       /*
-                        * If @linked wants this node read locked, we don't want
-                        * to actually take the read lock now because it's not
-                        * legal to hold read locks on other nodes while we take
-                        * write locks, so the journal can make forward
-                        * progress...
-                        *
-                        * Instead, btree_iter_node_set() sets things up so
-                        * bch2_btree_node_relock() will succeed:
-                        */
-
-                       if (btree_want_intent(linked, b->level)) {
-                               six_lock_increment(&b->lock, SIX_LOCK_intent);
-                               mark_btree_node_intent_locked(linked, b->level);
+                       t = btree_lock_want(linked, b->level);
+                       if (t != BTREE_NODE_UNLOCKED) {
+                               six_lock_increment(&b->lock, t);
+                               mark_btree_node_locked(linked, b->level, t);
                        }
 
                        btree_iter_node_set(linked, b);
                }
 
-       if (!btree_iter_pos_in_node(iter, b)) {
-               six_unlock_intent(&b->lock);
-               return false;
-       }
-
-       mark_btree_node_intent_locked(iter, b->level);
-       btree_iter_node_set(iter, b);
-       return true;
-}
-
-void bch2_btree_iter_node_drop_linked(struct btree_iter *iter, struct btree *b)
-{
-       struct btree_iter *linked;
-
-       for_each_linked_btree_iter(iter, linked)
-               bch2_btree_iter_node_drop(linked, b);
+       six_unlock_intent(&b->lock);
 }
 
 void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
 {
+       struct btree_iter *linked;
        unsigned level = b->level;
 
-       if (iter->l[level].b == b) {
-               btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
-               btree_node_unlock(iter, level);
-               iter->l[level].b = BTREE_ITER_NOT_END;
-       }
+       for_each_btree_iter(iter, linked)
+               if (linked->l[level].b == b) {
+                       btree_node_unlock(linked, level);
+                       linked->l[level].b = BTREE_ITER_NOT_END;
+               }
 }
 
 /*
@@ -682,9 +756,8 @@ void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
 {
        struct btree_iter *linked;
 
-       for_each_linked_btree_node(iter, b, linked)
+       for_each_btree_iter_with_node(iter, b, linked)
                __btree_iter_init(linked, b);
-       __btree_iter_init(iter, b);
 }
 
 static inline int btree_iter_lock_root(struct btree_iter *iter,
@@ -713,7 +786,7 @@ static inline int btree_iter_lock_root(struct btree_iter *iter,
                        return 0;
                }
 
-               lock_type = btree_lock_want(iter, iter->level);
+               lock_type = __btree_lock_want(iter, iter->level);
                if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
                                              iter, lock_type)))
                        return -EINTR;
@@ -771,7 +844,7 @@ static inline int btree_iter_down(struct btree_iter *iter)
        struct btree_iter_level *l = &iter->l[iter->level];
        struct btree *b;
        unsigned level = iter->level - 1;
-       enum six_lock_type lock_type = btree_lock_want(iter, level);
+       enum six_lock_type lock_type = __btree_lock_want(iter, level);
        BKEY_PADDED(k) tmp;
 
        BUG_ON(!btree_node_locked(iter, iter->level));
@@ -799,6 +872,12 @@ static void btree_iter_up(struct btree_iter *iter)
        btree_node_unlock(iter, iter->level++);
 }
 
+static void btree_iter_set_end(struct btree_iter *iter)
+{
+       iter->uptodate = BTREE_ITER_END;
+       __bch2_btree_iter_unlock(iter);
+}
+
 int __must_check __bch2_btree_iter_traverse(struct btree_iter *);
 
 static int btree_iter_traverse_error(struct btree_iter *iter, int ret)
@@ -871,7 +950,7 @@ io_error:
        BUG_ON(ret != -EIO);
 
        iter->flags |= BTREE_ITER_ERROR;
-       iter->l[iter->level].b = NULL;
+       iter->l[iter->level].b = BTREE_ITER_NOT_END;
        goto out;
 }
 
@@ -888,9 +967,12 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
 {
        unsigned depth_want = iter->level;
 
-       if (unlikely(!iter->l[iter->level].b))
+       if (unlikely(iter->uptodate == BTREE_ITER_END))
                return 0;
 
+       BUG_ON(iter->level >= BTREE_MAX_DEPTH);
+       BUG_ON(!iter->l[iter->level].b);
+
        iter->flags &= ~BTREE_ITER_AT_END_OF_LEAF;
 
        /* make sure we have all the intent locks we need - ugh */
@@ -959,6 +1041,7 @@ int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
        }
 
        iter->uptodate = BTREE_ITER_NEED_PEEK;
+       bch2_btree_iter_verify_locks(iter);
        return 0;
 }
 
@@ -966,13 +1049,15 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *iter)
 {
        int ret;
 
-       if (iter->uptodate < BTREE_ITER_NEED_RELOCK)
+       if (__bch2_btree_iter_relock(iter))
                return 0;
 
        ret = __bch2_btree_iter_traverse(iter);
        if (unlikely(ret))
                ret = btree_iter_traverse_error(iter, ret);
 
+       BUG_ON(ret == -EINTR && !btree_iter_linked(iter));
+
        return ret;
 }
 
@@ -984,18 +1069,29 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
        int ret;
 
        EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
+       bch2_btree_iter_verify_locks(iter);
+
+       if (iter->uptodate == BTREE_ITER_UPTODATE)
+               return iter->l[iter->level].b;
+
+       if (unlikely(iter->uptodate == BTREE_ITER_END))
+               return NULL;
 
        ret = bch2_btree_iter_traverse(iter);
        if (ret)
                return ERR_PTR(ret);
 
        b = iter->l[iter->level].b;
-
-       if (b) {
-               EBUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
-               iter->pos = b->key.k.p;
+       if (!b) {
+               btree_iter_set_end(iter);
+               return NULL;
        }
 
+       BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);
+
+       iter->pos = b->key.k.p;
+       iter->uptodate = BTREE_ITER_UPTODATE;
+
        return b;
 }
 
@@ -1005,24 +1101,39 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter, unsigned depth)
        int ret;
 
        EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS);
+       bch2_btree_iter_verify_locks(iter);
 
        btree_iter_up(iter);
 
-       if (!btree_iter_node(iter, iter->level))
+       if (!btree_iter_node(iter, iter->level)) {
+               btree_iter_set_end(iter);
                return NULL;
+       }
 
-       /* parent node usually won't be locked: redo traversal if necessary */
-       btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
-       ret = bch2_btree_iter_traverse(iter);
-       if (ret)
-               return NULL;
+       if (!bch2_btree_node_relock(iter, iter->level)) {
+               btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
+               ret = bch2_btree_iter_traverse(iter);
+               if (ret)
+                       return NULL;
+       }
 
        b = iter->l[iter->level].b;
-       if (!b)
-               return b;
+       BUG_ON(!b);
 
        if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
-               /* Haven't gotten to the end of the parent node: */
+               /*
+                * Haven't gotten to the end of the parent node: go back down to
+                * the next child node
+                */
+
+               /*
+                * We don't really want to be unlocking here except we can't
+                * directly tell btree_iter_traverse() "traverse to this level"
+                * except by setting iter->level, so we have to unlock so we
+                * don't screw up our lock invariants:
+                */
+               if (btree_node_read_locked(iter, iter->level))
+                       btree_node_unlock(iter, iter->level);
 
                /* ick: */
                iter->pos       = iter->btree_id == BTREE_ID_INODES
@@ -1086,8 +1197,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
        EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
                (iter->btree_id == BTREE_ID_EXTENTS));
        EBUG_ON(iter->flags & BTREE_ITER_SLOTS);
-       EBUG_ON(iter->uptodate == BTREE_ITER_UPTODATE &&
-               !btree_node_locked(iter, 0));
+       bch2_btree_iter_verify_locks(iter);
 
        if (iter->uptodate == BTREE_ITER_UPTODATE) {
                struct bkey_packed *k =
@@ -1117,7 +1227,7 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
                /* got to the end of the leaf, iterator needs to be traversed: */
                iter->pos = l->b->key.k.p;
                if (!bkey_cmp(iter->pos, POS_MAX)) {
-                       iter->uptodate = BTREE_ITER_END;
+                       btree_iter_set_end(iter);
                        return bkey_s_c_null;
                }
 
@@ -1144,7 +1254,7 @@ struct bkey_s_c bch2_btree_iter_peek_next_leaf(struct btree_iter *iter)
 
        iter->pos = l->b->key.k.p;
        if (!bkey_cmp(iter->pos, POS_MAX)) {
-               iter->uptodate = BTREE_ITER_END;
+               btree_iter_set_end(iter);
                return bkey_s_c_null;
        }
 
@@ -1163,6 +1273,7 @@ struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
        EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
                (iter->btree_id == BTREE_ID_EXTENTS));
        EBUG_ON(iter->flags & BTREE_ITER_SLOTS);
+       bch2_btree_iter_verify_locks(iter);
 
        if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
                k = bch2_btree_iter_peek(iter);
@@ -1225,7 +1336,7 @@ recheck:
        if (iter->flags & BTREE_ITER_IS_EXTENTS) {
                if (n.p.offset == KEY_OFFSET_MAX) {
                        if (n.p.inode == KEY_INODE_MAX) {
-                               iter->uptodate = BTREE_ITER_END;
+                               btree_iter_set_end(iter);
                                return bkey_s_c_null;
                        }
 
@@ -1259,8 +1370,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
        EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
                (iter->btree_id == BTREE_ID_EXTENTS));
        EBUG_ON(!(iter->flags & BTREE_ITER_SLOTS));
-       EBUG_ON(iter->uptodate == BTREE_ITER_UPTODATE &&
-               !btree_node_locked(iter, 0));
+       bch2_btree_iter_verify_locks(iter);
 
        if (iter->uptodate == BTREE_ITER_UPTODATE) {
                struct bkey_s_c ret = { .k = &iter->k };
@@ -1286,6 +1396,11 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
 
 struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
 {
+       EBUG_ON(!!(iter->flags & BTREE_ITER_IS_EXTENTS) !=
+               (iter->btree_id == BTREE_ID_EXTENTS));
+       EBUG_ON(!(iter->flags & BTREE_ITER_SLOTS));
+       bch2_btree_iter_verify_locks(iter);
+
        iter->pos = btree_type_successor(iter->btree_id, iter->k.p);
 
        if (unlikely(iter->uptodate != BTREE_ITER_UPTODATE)) {
@@ -1347,13 +1462,11 @@ void bch2_btree_iter_unlink(struct btree_iter *iter)
        if (!btree_iter_linked(iter))
                return;
 
-       for_each_linked_btree_iter(iter, linked) {
-
+       for_each_linked_btree_iter(iter, linked)
                if (linked->next == iter) {
                        linked->next = iter->next;
                        return;
                }
-       }
 
        BUG();
 }
@@ -1366,9 +1479,9 @@ void bch2_btree_iter_link(struct btree_iter *iter, struct btree_iter *new)
        iter->next = new;
 
        if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
-               unsigned nr_iters = 1;
+               unsigned nr_iters = 0;
 
-               for_each_linked_btree_iter(iter, new)
+               for_each_btree_iter(iter, new)
                        nr_iters++;
 
                BUG_ON(nr_iters > SIX_LOCK_MAX_RECURSE);
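
The relock/upgrade/downgrade split replaces bch2_btree_iter_set_locks_want(). A hedged sketch of the calling pattern an update path would use; the wrapper function is hypothetical, not from this commit:

	/* Hypothetical update wrapper: take intent locks up to the depth
	 * the update needs, then drop back to the minimum when done. */
	static int update_at_depth(struct btree_iter *iter, unsigned depth)
	{
		int ret;

		if (!bch2_btree_iter_upgrade(iter, depth))
			return -EINTR;	/* lost locks: caller re-traverses */

		ret = 0;		/* ... perform the btree update ... */

		bch2_btree_iter_downgrade(iter);
		return ret;
	}
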
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 0097a2a20a18f119bdd092b03ca555977c89370d..99e51b27675dc62f422fb76dc4acfc49971eb7b9 100644 (file)
@@ -28,40 +28,47 @@ static inline bool btree_iter_linked(const struct btree_iter *iter)
        return iter->next != iter;
 }
 
-/**
- * for_each_linked_btree_iter - iterate over all iterators linked with @_iter
- */
-#define for_each_linked_btree_iter(_iter, _linked)                     \
-       for ((_linked) = (_iter)->next;                                 \
-            (_linked) != (_iter);                                      \
-            (_linked) = (_linked)->next)
+static inline bool __iter_has_node(const struct btree_iter *iter,
+                                  const struct btree *b)
+{
+       /*
+        * We don't compare the low bits of the lock sequence numbers because
+        * @iter might have taken a write lock on @b, and we don't want to skip
+        * the linked iterator if the sequence numbers were equal before taking
+        * that write lock. The lock sequence number is incremented by taking
+        * and releasing write locks and is even when unlocked:
+        */
+
+       return iter->l[b->level].b == b &&
+               iter->lock_seq[b->level] >> 1 == b->lock.state.seq >> 1;
+}
 
 static inline struct btree_iter *
-__next_linked_btree_node(struct btree_iter *iter, struct btree *b,
-                        struct btree_iter *linked)
-{
-       do {
-               linked = linked->next;
-
-               if (linked == iter)
-                       return NULL;
-
-               /*
-                * We don't compare the low bits of the lock sequence numbers
-                * because @iter might have taken a write lock on @b, and we
-                * don't want to skip the linked iterator if the sequence
-                * numbers were equal before taking that write lock. The lock
-                * sequence number is incremented by taking and releasing write
-                * locks and is even when unlocked:
-                */
-       } while (linked->l[b->level].b != b ||
-                linked->lock_seq[b->level] >> 1 != b->lock.state.seq >> 1);
+__next_linked_iter(struct btree_iter *iter, struct btree_iter *linked)
+{
+       return linked->next != iter ? linked->next : NULL;
+}
+
+static inline struct btree_iter *
+__next_iter_with_node(struct btree_iter *iter, struct btree *b,
+                     struct btree_iter *linked)
+{
+       while (linked && !__iter_has_node(linked, b))
+               linked = __next_linked_iter(iter, linked);
 
        return linked;
 }
 
 /**
- * for_each_linked_btree_node - iterate over all iterators linked with @_iter
+ * for_each_btree_iter - iterate over all iterators linked with @_iter,
+ * including @_iter
+ */
+#define for_each_btree_iter(_iter, _linked)                            \
+       for ((_linked) = (_iter); (_linked);                            \
+            (_linked) = __next_linked_iter(_iter, _linked))
+
+/**
+ * for_each_btree_iter_with_node - iterate over all iterators linked with @_iter
  * that also point to @_b
  *
  * @_b is assumed to be locked by @_iter
@@ -69,15 +76,27 @@ __next_linked_btree_node(struct btree_iter *iter, struct btree *b,
  * Filters out iterators that don't have a valid btree_node iterator for @_b -
  * i.e. iterators for which bch2_btree_node_relock() would not succeed.
  */
-#define for_each_linked_btree_node(_iter, _b, _linked)                 \
+#define for_each_btree_iter_with_node(_iter, _b, _linked)              \
        for ((_linked) = (_iter);                                       \
-            ((_linked) = __next_linked_btree_node(_iter, _b, _linked));)
+            ((_linked) = __next_iter_with_node(_iter, _b, _linked));   \
+            (_linked) = __next_linked_iter(_iter, _linked))
+
+/**
+ * for_each_linked_btree_iter - iterate over all iterators linked with @_iter,
+ * _not_ including @_iter
+ */
+#define for_each_linked_btree_iter(_iter, _linked)                     \
+       for ((_linked) = (_iter)->next;                                 \
+            (_linked) != (_iter);                                      \
+            (_linked) = (_linked)->next)
 
 #ifdef CONFIG_BCACHEFS_DEBUG
 void bch2_btree_iter_verify(struct btree_iter *, struct btree *);
+void bch2_btree_iter_verify_locks(struct btree_iter *);
 #else
 static inline void bch2_btree_iter_verify(struct btree_iter *iter,
-                                        struct btree *b) {}
+                                         struct btree *b) {}
+static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {}
 #endif
 
 void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
@@ -85,22 +104,28 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
                             struct bkey_packed *, unsigned, unsigned);
 
 int bch2_btree_iter_unlock(struct btree_iter *);
-bool __bch2_btree_iter_set_locks_want(struct btree_iter *, unsigned);
 
-static inline bool bch2_btree_iter_set_locks_want(struct btree_iter *iter,
-                                                unsigned new_locks_want)
+bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned);
+
+static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter,
+                                          unsigned new_locks_want)
 {
        new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH);
 
-       if (iter->locks_want == new_locks_want &&
-           iter->nodes_intent_locked == (1 << new_locks_want) - 1)
-               return true;
+       return iter->locks_want < new_locks_want
+               ?  __bch2_btree_iter_upgrade(iter, new_locks_want)
+               : iter->uptodate <= BTREE_ITER_NEED_PEEK;
+}
+
+void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned);
 
-       return __bch2_btree_iter_set_locks_want(iter, new_locks_want);
+static inline void bch2_btree_iter_downgrade(struct btree_iter *iter)
+{
+       if (iter->locks_want > (iter->flags & BTREE_ITER_INTENT) ? 1 : 0)
+               __bch2_btree_iter_downgrade(iter, 0);
 }
 
-bool bch2_btree_iter_node_replace(struct btree_iter *, struct btree *);
-void bch2_btree_iter_node_drop_linked(struct btree_iter *, struct btree *);
+void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *);
 void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *);
 
 void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *);
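
The renamed iterators make the inclusive/exclusive distinction explicit: for_each_btree_iter() includes @_iter itself, for_each_linked_btree_iter() excludes it, and for_each_btree_iter_with_node() filters to iterators that could relock @_b. A tiny illustration of the first two:

	/* Counting linked iterators, including @iter itself (cf. the
	 * SIX_LOCK_MAX_RECURSE check in bch2_btree_iter_link()): */
	static unsigned nr_iters(struct btree_iter *iter)
	{
		struct btree_iter *linked;
		unsigned nr = 0;

		for_each_btree_iter(iter, linked)
			nr++;
		return nr;	/* >= 1: @iter counts too */
	}
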
diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h
index f48084bc26aec8d9d82f04c18909fb54693af378..1d975207a16353b5db507cec7070decf3e1fd9cf 100644 (file)
@@ -75,16 +75,23 @@ static inline void mark_btree_node_intent_locked(struct btree_iter *iter,
        mark_btree_node_locked(iter, level, SIX_LOCK_intent);
 }
 
-static inline enum six_lock_type btree_lock_want(struct btree_iter *iter, int level)
+static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level)
 {
        return level < iter->locks_want
                ? SIX_LOCK_intent
                : SIX_LOCK_read;
 }
 
-static inline bool btree_want_intent(struct btree_iter *iter, int level)
+static inline enum btree_node_locked_type
+btree_lock_want(struct btree_iter *iter, int level)
 {
-       return btree_lock_want(iter, level) == SIX_LOCK_intent;
+       if (level < iter->level)
+               return BTREE_NODE_UNLOCKED;
+       if (level < iter->locks_want)
+               return BTREE_NODE_INTENT_LOCKED;
+       if (level == iter->level)
+               return BTREE_NODE_READ_LOCKED;
+       return BTREE_NODE_UNLOCKED;
 }
 
 static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
@@ -98,6 +105,14 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level)
        mark_btree_node_unlocked(iter, level);
 }
 
+static inline void __bch2_btree_iter_unlock(struct btree_iter *iter)
+{
+       btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);
+
+       while (iter->nodes_locked)
+               btree_node_unlock(iter, __ffs(iter->nodes_locked));
+}
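
The loop above releases every held level by repeatedly clearing the lowest set bit of nodes_locked. A standalone sketch of the same walk in userspace C, with a hypothetical bitmask (POSIX ffs() standing in for the kernel's 0-based __ffs()):

	#include <stdio.h>
	#include <strings.h>

	int main(void)
	{
		unsigned nodes_locked = 0x0b;	/* hypothetical: levels 0, 1, 3 locked */

		while (nodes_locked) {
			unsigned level = ffs(nodes_locked) - 1;	/* ffs() is 1-based */

			printf("unlocking level %u\n", level);	/* prints 0, 1, 3 */
			nodes_locked &= ~(1U << level);
		}
		return 0;
	}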
+
 static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type)
 {
        switch (type) {
@@ -150,8 +165,11 @@ bool __bch2_btree_node_relock(struct btree_iter *, unsigned);
 static inline bool bch2_btree_node_relock(struct btree_iter *iter,
                                          unsigned level)
 {
-       return likely(btree_lock_want(iter, level) ==
-                     btree_node_locked_type(iter, level)) ||
+       EBUG_ON(btree_node_locked(iter, level) &&
+               btree_node_locked_type(iter, level) !=
+               __btree_lock_want(iter, level));
+
+       return likely(btree_node_locked(iter, level)) ||
                __bch2_btree_node_relock(iter, level);
 }
 
index f357095d5b4f1f536729df2730096708cb39a942..aac97958cc3b2b90f7a0a866e08648acc74bfd30 100644 (file)
@@ -85,31 +85,49 @@ int __bch2_btree_insert_at(struct btree_insert *);
                        __VA_ARGS__                                     \
                }})
 
+enum {
+       __BTREE_INSERT_ATOMIC,
+       __BTREE_INSERT_NOUNLOCK,
+       __BTREE_INSERT_NOFAIL,
+       __BTREE_INSERT_USE_RESERVE,
+       __BTREE_INSERT_USE_ALLOC_RESERVE,
+       __BTREE_INSERT_JOURNAL_REPLAY,
+       __BTREE_INSERT_NOWAIT,
+       __BTREE_INSERT_GC_LOCK_HELD,
+       __BCH_HASH_SET_MUST_CREATE,
+       __BCH_HASH_SET_MUST_REPLACE,
+};
+
+/*
+ * Don't drop/retake locks before doing btree update, instead return -EINTR if
+ * we had to drop locks for any reason
+ */
+#define BTREE_INSERT_ATOMIC            (1 << __BTREE_INSERT_ATOMIC)
+
 /*
- * Don't drop/retake locks: instead return -EINTR if need to upgrade to intent
- * locks, -EAGAIN if need to wait on btree reserve
+ * Don't drop locks _after_ successfully updating btree:
  */
-#define BTREE_INSERT_ATOMIC            (1 << 0)
+#define BTREE_INSERT_NOUNLOCK          (1 << __BTREE_INSERT_NOUNLOCK)
 
 /* Don't check for -ENOSPC: */
-#define BTREE_INSERT_NOFAIL            (1 << 1)
+#define BTREE_INSERT_NOFAIL            (1 << __BTREE_INSERT_NOFAIL)
 
 /* for copygc, or when merging btree nodes */
-#define BTREE_INSERT_USE_RESERVE       (1 << 2)
-#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << 3)
+#define BTREE_INSERT_USE_RESERVE       (1 << __BTREE_INSERT_USE_RESERVE)
+#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE)
 
 /*
  * Insert is for journal replay: don't get journal reservations, or mark extents
  * (bch_mark_key)
  */
-#define BTREE_INSERT_JOURNAL_REPLAY    (1 << 4)
+#define BTREE_INSERT_JOURNAL_REPLAY    (1 << __BTREE_INSERT_JOURNAL_REPLAY)
 
 /* Don't block on allocation failure (for new btree nodes): */
-#define BTREE_INSERT_NOWAIT            (1 << 5)
-#define BTREE_INSERT_GC_LOCK_HELD      (1 << 6)
+#define BTREE_INSERT_NOWAIT            (1 << __BTREE_INSERT_NOWAIT)
+#define BTREE_INSERT_GC_LOCK_HELD      (1 << __BTREE_INSERT_GC_LOCK_HELD)
 
-#define BCH_HASH_SET_MUST_CREATE       (1 << 7)
-#define BCH_HASH_SET_MUST_REPLACE      (1 << 8)
+#define BCH_HASH_SET_MUST_CREATE       (1 << __BCH_HASH_SET_MUST_CREATE)
+#define BCH_HASH_SET_MUST_REPLACE      (1 << __BCH_HASH_SET_MUST_REPLACE)
 
 int bch2_btree_delete_at(struct btree_iter *, unsigned);
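
These flags compose bitwise; a sketch of a typical combination, mirroring the __bch2_write_inode() call later in this patch (c, journal_seq, iter and k are illustrative here):

	ret = bch2_btree_insert_at(c, NULL, NULL, &journal_seq,
				   BTREE_INSERT_ATOMIC|
				   BTREE_INSERT_NOUNLOCK|
				   BTREE_INSERT_NOFAIL,
				   BTREE_INSERT_ENTRY(&iter, &k));
	if (ret == -EINTR)
		goto retry;	/* locking changed, redo the update */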
 
index 92e19c4eae7ec572312c83eea9bf4543679090fc..3e13f78476a29eadc16f32a785488083d9fdcaaf 100644 (file)
@@ -223,8 +223,7 @@ found:
        mutex_unlock(&c->btree_interior_update_lock);
 }
 
-static void __btree_node_free(struct bch_fs *c, struct btree *b,
-                             struct btree_iter *iter)
+static void __btree_node_free(struct bch_fs *c, struct btree *b)
 {
        trace_btree_node_free(c, b);
 
@@ -237,21 +236,11 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b,
 
        clear_btree_node_noevict(b);
 
-       btree_node_lock_type(c, b, SIX_LOCK_write);
-
        bch2_btree_node_hash_remove(&c->btree_cache, b);
 
        mutex_lock(&c->btree_cache.lock);
        list_move(&b->list, &c->btree_cache.freeable);
        mutex_unlock(&c->btree_cache.lock);
-
-       /*
-        * By using six_unlock_write() directly instead of
-        * bch2_btree_node_unlock_write(), we don't update the iterator's
-        * sequence numbers and cause future bch2_btree_node_relock() calls to
-        * fail:
-        */
-       six_unlock_write(&b->lock);
 }
 
 void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
@@ -264,7 +253,9 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b)
 
        clear_btree_node_dirty(b);
 
-       __btree_node_free(c, b, NULL);
+       btree_node_lock_type(c, b, SIX_LOCK_write);
+       __btree_node_free(c, b);
+       six_unlock_write(&b->lock);
 
        bch2_open_bucket_put_refs(c, &ob.nr, ob.refs);
 }
@@ -283,9 +274,9 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
         */
        btree_update_drop_new_node(c, b);
 
-       bch2_btree_iter_node_drop_linked(iter, b);
-
-       __btree_node_free(c, b, iter);
+       __bch2_btree_node_lock_write(b, iter);
+       __btree_node_free(c, b);
+       six_unlock_write(&b->lock);
 
        bch2_btree_iter_node_drop(iter, b);
 }
@@ -499,7 +490,9 @@ static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reser
                        bch2_btree_open_bucket_put(c, b);
                }
 
-               __btree_node_free(c, b, NULL);
+               btree_node_lock_type(c, b, SIX_LOCK_write);
+               __btree_node_free(c, b);
+               six_unlock_write(&b->lock);
 
                six_unlock_intent(&b->lock);
        }
@@ -1362,7 +1355,8 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
 }
 
 static void btree_split(struct btree_update *as, struct btree *b,
-                       struct btree_iter *iter, struct keylist *keys)
+                       struct btree_iter *iter, struct keylist *keys,
+                       unsigned flags)
 {
        struct bch_fs *c = as->c;
        struct btree *parent = btree_node_parent(iter, b);
@@ -1425,7 +1419,7 @@ static void btree_split(struct btree_update *as, struct btree *b,
 
        if (parent) {
                /* Split a non root node */
-               bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
+               bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
        } else if (n3) {
                bch2_btree_set_root(as, n3, iter);
        } else {
@@ -1491,9 +1485,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
 
        btree_update_updated_node(as, b);
 
-       for_each_linked_btree_node(iter, b, linked)
+       for_each_btree_iter_with_node(iter, b, linked)
                bch2_btree_node_iter_peek(&linked->l[b->level].iter, b);
-       bch2_btree_node_iter_peek(&iter->l[b->level].iter, b);
 
        bch2_btree_iter_verify(iter, b);
 }
@@ -1511,7 +1504,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
  * for leaf nodes -- inserts into interior nodes have to be atomic.
  */
 void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
-                           struct btree_iter *iter, struct keylist *keys)
+                           struct btree_iter *iter, struct keylist *keys,
+                           unsigned flags)
 {
        struct bch_fs *c = as->c;
        int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
@@ -1551,14 +1545,14 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
 
        btree_node_interior_verify(b);
 
-       bch2_foreground_maybe_merge(c, iter, b->level);
+       bch2_foreground_maybe_merge(c, iter, b->level, flags);
        return;
 split:
-       btree_split(as, b, iter, keys);
+       btree_split(as, b, iter, keys, flags);
 }
 
 int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
-                         unsigned btree_reserve_flags)
+                         unsigned flags)
 {
        struct btree *b = iter->l[0].b;
        struct btree_update *as;
@@ -1570,16 +1564,17 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
         * We already have a disk reservation and open buckets pinned; this
         * allocation must not block:
         */
-       for_each_linked_btree_iter(iter, linked)
+       for_each_btree_iter(iter, linked)
                if (linked->btree_id == BTREE_ID_EXTENTS)
-                       btree_reserve_flags |= BTREE_INSERT_USE_RESERVE;
-       if (iter->btree_id == BTREE_ID_EXTENTS)
-               btree_reserve_flags |= BTREE_INSERT_USE_RESERVE;
+                       flags |= BTREE_INSERT_USE_RESERVE;
 
        closure_init_stack(&cl);
 
        /* Hack, because gc and splitting nodes doesn't mix yet: */
        if (!down_read_trylock(&c->gc_lock)) {
+               if (flags & BTREE_INSERT_NOUNLOCK)
+                       return -EINTR;
+
                bch2_btree_iter_unlock(iter);
                down_read(&c->gc_lock);
 
@@ -1591,39 +1586,43 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
         * XXX: figure out how far we might need to split,
         * instead of locking/reserving all the way to the root:
         */
-       if (!bch2_btree_iter_set_locks_want(iter, U8_MAX)) {
+       if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
                ret = -EINTR;
                goto out;
        }
 
        as = bch2_btree_update_start(c, iter->btree_id,
-                                    btree_update_reserve_required(c, b),
-                                    btree_reserve_flags, &cl);
+               btree_update_reserve_required(c, b), flags,
+               !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
        if (IS_ERR(as)) {
                ret = PTR_ERR(as);
                if (ret == -EAGAIN) {
+                       BUG_ON(flags & BTREE_INSERT_NOUNLOCK);
                        bch2_btree_iter_unlock(iter);
-                       up_read(&c->gc_lock);
-                       closure_sync(&cl);
-                       return -EINTR;
+                       ret = -EINTR;
                }
                goto out;
        }
 
-       btree_split(as, b, iter, NULL);
+       btree_split(as, b, iter, NULL, flags);
        bch2_btree_update_done(as);
 
-       bch2_btree_iter_set_locks_want(iter, 1);
+       /*
+        * We haven't successfully inserted yet, so don't downgrade all the way
+        * back to read locks:
+        */
+       __bch2_btree_iter_downgrade(iter, 1);
 out:
        up_read(&c->gc_lock);
        closure_sync(&cl);
        return ret;
 }
 
-int __bch2_foreground_maybe_merge(struct bch_fs *c,
-                                 struct btree_iter *iter,
-                                 unsigned level,
-                                 enum btree_node_sibling sib)
+void __bch2_foreground_maybe_merge(struct bch_fs *c,
+                                  struct btree_iter *iter,
+                                  unsigned level,
+                                  unsigned flags,
+                                  enum btree_node_sibling sib)
 {
        struct btree_update *as;
        struct bkey_format_state new_s;
@@ -1636,29 +1635,29 @@ int __bch2_foreground_maybe_merge(struct bch_fs *c,
 
        closure_init_stack(&cl);
 retry:
-       if (!bch2_btree_node_relock(iter, level))
-               return 0;
+       BUG_ON(!btree_node_locked(iter, level));
 
        b = iter->l[level].b;
 
        parent = btree_node_parent(iter, b);
        if (!parent)
-               return 0;
+               goto out;
 
        if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c))
-               return 0;
+               goto out;
 
        /* XXX: can't be holding read locks */
-       m = bch2_btree_node_get_sibling(c, iter, b, sib);
+       m = bch2_btree_node_get_sibling(c, iter, b,
+                       !(flags & BTREE_INSERT_NOUNLOCK), sib);
        if (IS_ERR(m)) {
                ret = PTR_ERR(m);
-               goto out;
+               goto err;
        }
 
        /* NULL means no sibling: */
        if (!m) {
                b->sib_u64s[sib] = U16_MAX;
-               return 0;
+               goto out;
        }
 
        if (sib == btree_prev_sib) {
@@ -1688,33 +1687,26 @@ retry:
 
        if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) {
                six_unlock_intent(&m->lock);
-               return 0;
+               goto out;
        }
 
        /* We're changing btree topology, doesn't mix with gc: */
-       if (!down_read_trylock(&c->gc_lock)) {
-               six_unlock_intent(&m->lock);
-               bch2_btree_iter_unlock(iter);
+       if (!down_read_trylock(&c->gc_lock))
+               goto err_cycle_gc_lock;
 
-               down_read(&c->gc_lock);
-               up_read(&c->gc_lock);
+       if (!bch2_btree_iter_upgrade(iter, U8_MAX)) {
                ret = -EINTR;
-               goto out;
-       }
-
-       if (!bch2_btree_iter_set_locks_want(iter, U8_MAX)) {
-               ret = -EINTR;
-               goto out_unlock;
+               goto err_unlock;
        }
 
        as = bch2_btree_update_start(c, iter->btree_id,
                         btree_update_reserve_required(c, parent) + 1,
                         BTREE_INSERT_NOFAIL|
                         BTREE_INSERT_USE_RESERVE,
-                        &cl);
+                        !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL);
        if (IS_ERR(as)) {
                ret = PTR_ERR(as);
-               goto out_unlock;
+               goto err_unlock;
        }
 
        trace_btree_merge(c, b);
@@ -1744,7 +1736,7 @@ retry:
 
        bch2_btree_node_write(c, n, SIX_LOCK_intent);
 
-       bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
+       bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
 
        bch2_btree_open_bucket_put(c, n);
        bch2_btree_node_free_inmem(c, b, iter);
@@ -1754,26 +1746,53 @@ retry:
        bch2_btree_iter_verify(iter, n);
 
        bch2_btree_update_done(as);
-out_unlock:
-       if (ret != -EINTR && ret != -EAGAIN)
-               bch2_btree_iter_set_locks_want(iter, 1);
+
        six_unlock_intent(&m->lock);
        up_read(&c->gc_lock);
 out:
-       if (ret == -EAGAIN || ret == -EINTR) {
-               bch2_btree_iter_unlock(iter);
-               ret = -EINTR;
-       }
-
+       /*
+        * Don't downgrade locks here: we're called after a successful insert,
+        * and the caller will downgrade locks after a successful insert
+        * anyway (in case e.g. a split was required first)
+        *
+        * And we're also called when inserting into interior nodes in the
+        * split path, and downgrading to read locks in there is potentially
+        * confusing:
+        */
        closure_sync(&cl);
+       return;
+
+err_cycle_gc_lock:
+       six_unlock_intent(&m->lock);
+
+       if (flags & BTREE_INSERT_NOUNLOCK)
+               goto out;
+
+       bch2_btree_iter_unlock(iter);
+
+       down_read(&c->gc_lock);
+       up_read(&c->gc_lock);
+       ret = -EINTR;
+       goto err;
+
+err_unlock:
+       six_unlock_intent(&m->lock);
+       up_read(&c->gc_lock);
+err:
+       BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK));
 
-       if (ret == -EINTR) {
+       if ((ret == -EAGAIN || ret == -EINTR) &&
+           !(flags & BTREE_INSERT_NOUNLOCK)) {
+               bch2_btree_iter_unlock(iter);
+               closure_sync(&cl);
                ret = bch2_btree_iter_traverse(iter);
-               if (!ret)
-                       goto retry;
+               if (ret)
+                       goto out;
+
+               goto retry;
        }
 
-       return ret;
+       goto out;
 }
 
 static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
@@ -1806,7 +1825,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
 
        if (parent) {
                bch2_keylist_add(&as->parent_keys, &n->key);
-               bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
+               bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags);
        } else {
                bch2_btree_set_root(as, n, iter);
        }
@@ -1815,7 +1834,7 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
 
        bch2_btree_node_free_inmem(c, b, iter);
 
-       BUG_ON(!bch2_btree_iter_node_replace(iter, n));
+       bch2_btree_iter_node_replace(iter, n);
 
        bch2_btree_update_done(as);
        return 0;
@@ -1830,7 +1849,6 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
 int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
                            __le64 seq, unsigned flags)
 {
-       unsigned locks_want = iter->locks_want;
        struct closure cl;
        struct btree *b;
        int ret;
@@ -1839,7 +1857,7 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
 
        closure_init_stack(&cl);
 
-       bch2_btree_iter_set_locks_want(iter, U8_MAX);
+       bch2_btree_iter_upgrade(iter, U8_MAX);
 
        if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) {
                if (!down_read_trylock(&c->gc_lock)) {
@@ -1866,7 +1884,7 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter,
                closure_sync(&cl);
        }
 
-       bch2_btree_iter_set_locks_want(iter, locks_want);
+       bch2_btree_iter_downgrade(iter);
 
        if (!(flags & BTREE_INSERT_GC_LOCK_HELD))
                up_read(&c->gc_lock);
@@ -1920,7 +1938,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
                }
 
                bch2_keylist_add(&as->parent_keys, &new_key->k_i);
-               bch2_btree_insert_node(as, parent, iter, &as->parent_keys);
+               bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0);
 
                if (new_hash) {
                        mutex_lock(&c->btree_cache.lock);
@@ -1982,6 +2000,9 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
 
        closure_init_stack(&cl);
 
+       if (!bch2_btree_iter_upgrade(iter, U8_MAX))
+               return -EINTR;
+
        if (!down_read_trylock(&c->gc_lock)) {
                bch2_btree_iter_unlock(iter);
                down_read(&c->gc_lock);
@@ -2041,6 +2062,8 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
                goto err_free_update;
 
        __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key);
+
+       bch2_btree_iter_downgrade(iter);
 err:
        if (new_hash) {
                mutex_lock(&c->btree_cache.lock);
index abf14e4c41dcc76bdd56f51b191e9ef57d6ae41c..3a17de5ca43e49433e263c08f54048795d48a6d0 100644 (file)
@@ -146,35 +146,51 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *,
                                               struct btree *);
 
 void bch2_btree_insert_node(struct btree_update *, struct btree *,
-                           struct btree_iter *, struct keylist *);
+                           struct btree_iter *, struct keylist *,
+                           unsigned);
 int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned);
 
-int __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
-                                 unsigned, enum btree_node_sibling);
+void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *,
+                                  unsigned, unsigned, enum btree_node_sibling);
 
-static inline int bch2_foreground_maybe_merge_sibling(struct bch_fs *c,
+static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c,
                                        struct btree_iter *iter,
-                                       unsigned level,
+                                       unsigned level, unsigned flags,
                                        enum btree_node_sibling sib)
 {
        struct btree *b;
 
+       /*
+        * iterators are inconsistent when they hit end of leaf, until
+        * traversed again
+        *
+        * XXX inconsistent how?
+        */
+       if (iter->flags & BTREE_ITER_AT_END_OF_LEAF)
+               return;
+
+       if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE)
+               return;
+
        if (!bch2_btree_node_relock(iter, level))
-               return 0;
+               return;
 
        b = iter->l[level].b;
        if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
-               return 0;
+               return;
 
-       return __bch2_foreground_maybe_merge(c, iter, level, sib);
+       __bch2_foreground_maybe_merge(c, iter, level, flags, sib);
 }
 
 static inline void bch2_foreground_maybe_merge(struct bch_fs *c,
                                               struct btree_iter *iter,
-                                              unsigned level)
+                                              unsigned level,
+                                              unsigned flags)
 {
-       bch2_foreground_maybe_merge_sibling(c, iter, level, btree_prev_sib);
-       bch2_foreground_maybe_merge_sibling(c, iter, level, btree_next_sib);
+       bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
+                                           btree_prev_sib);
+       bch2_foreground_maybe_merge_sibling(c, iter, level, flags,
+                                           btree_next_sib);
 }
 
 void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
index cc41140fbe3a59214ab0841e2d512c232c7b28d8..a62d8307036790c8ab930a54ac7335d52e969a23 100644 (file)
@@ -227,19 +227,36 @@ btree_insert_key_leaf(struct btree_insert *trans,
        return ret;
 }
 
+#define trans_for_each_entry(trans, i)                                 \
+       for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++)
+
+/*
+ * We sort transaction entries so that if multiple iterators point to the same
+ * leaf node they'll be adjacent:
+ */
 static bool same_leaf_as_prev(struct btree_insert *trans,
                              struct btree_insert_entry *i)
 {
-       /*
-        * Because we sorted the transaction entries, if multiple iterators
-        * point to the same leaf node they'll always be adjacent now:
-        */
        return i != trans->entries &&
                i[0].iter->l[0].b == i[-1].iter->l[0].b;
 }
 
-#define trans_for_each_entry(trans, i)                                 \
-       for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++)
+static inline struct btree_insert_entry *trans_next_leaf(struct btree_insert *trans,
+                                                        struct btree_insert_entry *i)
+{
+       struct btree *b = i->iter->l[0].b;
+
+       do {
+               i++;
+       } while (i < trans->entries + trans->nr && b == i->iter->l[0].b);
+
+       return i;
+}
+
+#define trans_for_each_leaf(trans, i)                                  \
+       for ((i) = (trans)->entries;                                    \
+            (i) < (trans)->entries + (trans)->nr;                      \
+            (i) = trans_next_leaf(trans, i))
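
Worked example of the invariant: after the bubble_sort() below, entries whose iterators share a leaf are adjacent, so for three entries on leaves A, A, B, trans_for_each_leaf() visits entries[0] (leaf A, with trans_next_leaf() skipping entries[1]) and then entries[2] (leaf B); multi_lock_write()/multi_unlock_write() therefore take and release each leaf's write lock exactly once.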
 
 inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
                                            struct btree_iter *iter)
@@ -262,19 +279,16 @@ static void multi_lock_write(struct bch_fs *c, struct btree_insert *trans)
 {
        struct btree_insert_entry *i;
 
-       trans_for_each_entry(trans, i)
-               if (!same_leaf_as_prev(trans, i))
-                       bch2_btree_node_lock_for_insert(c, i->iter->l[0].b,
-                                                       i->iter);
+       trans_for_each_leaf(trans, i)
+               bch2_btree_node_lock_for_insert(c, i->iter->l[0].b, i->iter);
 }
 
 static void multi_unlock_write(struct btree_insert *trans)
 {
        struct btree_insert_entry *i;
 
-       trans_for_each_entry(trans, i)
-               if (!same_leaf_as_prev(trans, i))
-                       bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter);
+       trans_for_each_leaf(trans, i)
+               bch2_btree_node_unlock_write(i->iter->l[0].b, i->iter);
 }
 
 static inline int btree_trans_cmp(struct btree_insert_entry l,
@@ -285,56 +299,24 @@ static inline int btree_trans_cmp(struct btree_insert_entry l,
 
 /* Normal update interface: */
 
-/**
- * __bch_btree_insert_at - insert keys at given iterator positions
- *
- * This is main entry point for btree updates.
- *
- * Return values:
- * -EINTR: locking changed, this function should be called again. Only returned
- *  if passed BTREE_INSERT_ATOMIC.
- * -EROFS: filesystem read only
- * -EIO: journal or btree node IO error
+/*
+ * Get journal reservation, take write locks, and attempt to do btree update(s):
  */
-int __bch2_btree_insert_at(struct btree_insert *trans)
+static inline int do_btree_insert_at(struct btree_insert *trans,
+                                    struct btree_iter **split,
+                                    bool *cycle_gc_lock)
 {
        struct bch_fs *c = trans->c;
        struct btree_insert_entry *i;
-       struct btree_iter *split = NULL;
-       bool cycle_gc_lock = false;
        unsigned u64s;
        int ret;
 
-       trans_for_each_entry(trans, i) {
-               BUG_ON(i->iter->level);
-               BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
-               BUG_ON(debug_check_bkeys(c) &&
-                      bch2_bkey_invalid(c, i->iter->btree_id,
-                                        bkey_i_to_s_c(i->k)));
-       }
-
-       bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
-
-       if (unlikely(!percpu_ref_tryget(&c->writes)))
-               return -EROFS;
-retry_locks:
-       ret = -EINTR;
-       trans_for_each_entry(trans, i) {
-               if (!bch2_btree_iter_set_locks_want(i->iter, 1))
-                       goto err;
+       trans_for_each_entry(trans, i)
+               BUG_ON(i->done);
 
-               if (i->iter->uptodate == BTREE_ITER_NEED_TRAVERSE) {
-                       ret = bch2_btree_iter_traverse(i->iter);
-                       if (ret)
-                               goto err;
-               }
-       }
-retry:
-       trans->did_work = false;
        u64s = 0;
        trans_for_each_entry(trans, i)
-               if (!i->done)
-                       u64s += jset_u64s(i->k->k.u64s + i->extra_res);
+               u64s += jset_u64s(i->k->k.u64s + i->extra_res);
 
        memset(&trans->journal_res, 0, sizeof(trans->journal_res));
 
@@ -344,13 +326,13 @@ retry:
                                      u64s, u64s)
                : 0;
        if (ret)
-               goto err;
+               return ret;
 
        multi_lock_write(c, trans);
 
        if (race_fault()) {
                ret = -EINTR;
-               goto unlock;
+               goto out;
        }
 
        u64s = 0;
@@ -365,129 +347,210 @@ retry:
                 * bch2_btree_node_write(), converting an unwritten bset to a
                 * written one
                 */
-               if (!i->done) {
-                       u64s += i->k->k.u64s + i->extra_res;
-                       if (!bch2_btree_node_insert_fits(c,
-                                       i->iter->l[0].b, u64s)) {
-                               split = i->iter;
-                               goto unlock;
-                       }
+               u64s += i->k->k.u64s + i->extra_res;
+               if (!bch2_btree_node_insert_fits(c,
+                               i->iter->l[0].b, u64s)) {
+                       ret = -EINTR;
+                       *split = i->iter;
+                       goto out;
                }
        }
 
-       ret = 0;
-       split = NULL;
-       cycle_gc_lock = false;
+       if (journal_seq_verify(c) &&
+           !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+               trans_for_each_entry(trans, i)
+                       i->k->k.version.lo = trans->journal_res.seq;
 
        trans_for_each_entry(trans, i) {
-               if (i->done)
-                       continue;
-
                switch (btree_insert_key_leaf(trans, i)) {
                case BTREE_INSERT_OK:
                        i->done = true;
                        break;
                case BTREE_INSERT_JOURNAL_RES_FULL:
                case BTREE_INSERT_NEED_TRAVERSE:
-                       ret = -EINTR;
-                       break;
                case BTREE_INSERT_NEED_RESCHED:
-                       ret = -EAGAIN;
+                       ret = -EINTR;
                        break;
                case BTREE_INSERT_BTREE_NODE_FULL:
-                       split = i->iter;
+                       ret = -EINTR;
+                       *split = i->iter;
                        break;
                case BTREE_INSERT_ENOSPC:
                        ret = -ENOSPC;
                        break;
                case BTREE_INSERT_NEED_GC_LOCK:
-                       cycle_gc_lock = true;
                        ret = -EINTR;
+                       *cycle_gc_lock = true;
                        break;
                default:
                        BUG();
                }
 
-               if (!trans->did_work && (ret || split))
+               /*
+                * If we did some work (i.e. inserted part of an extent),
+                * we have to do all the other updates as well:
+                */
+               if (!trans->did_work && (ret || *split))
                        break;
        }
-unlock:
+out:
        multi_unlock_write(trans);
        bch2_journal_res_put(&c->journal, &trans->journal_res);
 
-       if (split)
-               goto split;
-       if (ret)
-               goto err;
+       return ret;
+}
 
-       trans_for_each_entry(trans, i)
-               if (i->iter->flags & BTREE_ITER_AT_END_OF_LEAF)
-                       goto out;
+/**
+ * __bch2_btree_insert_at - insert keys at given iterator positions
+ *
+ * This is the main entry point for btree updates.
+ *
+ * Return values:
+ * -EINTR: locking changed, this function should be called again. Only returned
+ *  if passed BTREE_INSERT_ATOMIC.
+ * -EROFS: filesystem read only
+ * -EIO: journal or btree node IO error
+ */
+int __bch2_btree_insert_at(struct btree_insert *trans)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_insert_entry *i;
+       struct btree_iter *linked, *split = NULL;
+       bool cycle_gc_lock = false;
+       unsigned flags;
+       int ret;
+
+       for_each_btree_iter(trans->entries[0].iter, linked)
+               bch2_btree_iter_verify_locks(linked);
+
+       /* for the sake of sanity: */
+       BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC));
 
        trans_for_each_entry(trans, i) {
-               /*
-                * iterators are inconsistent when they hit end of leaf, until
-                * traversed again
-                */
-               if (i->iter->uptodate < BTREE_ITER_NEED_TRAVERSE &&
-                   !same_leaf_as_prev(trans, i))
-                       bch2_foreground_maybe_merge(c, i->iter, 0);
+               BUG_ON(i->iter->level);
+               BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos));
+               BUG_ON(debug_check_bkeys(c) &&
+                      bch2_bkey_invalid(c, i->iter->btree_id,
+                                        bkey_i_to_s_c(i->k)));
+               BUG_ON(i->iter->uptodate == BTREE_ITER_END);
        }
-out:
-       /* make sure we didn't lose an error: */
-       if (!ret && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
-               trans_for_each_entry(trans, i)
-                       BUG_ON(!i->done);
 
+       bubble_sort(trans->entries, trans->nr, btree_trans_cmp);
+
+       if (unlikely(!percpu_ref_tryget(&c->writes)))
+               return -EROFS;
+retry:
+       split = NULL;
+       cycle_gc_lock = false;
+
+       trans_for_each_entry(trans, i) {
+               if (!bch2_btree_iter_upgrade(i->iter, 1)) {
+                       ret = -EINTR;
+                       goto err;
+               }
+
+               if (i->iter->flags & BTREE_ITER_ERROR) {
+                       ret = -EIO;
+                       goto err;
+               }
+       }
+
+       ret = do_btree_insert_at(trans, &split, &cycle_gc_lock);
+       if (unlikely(ret))
+               goto err;
+
+       trans_for_each_leaf(trans, i)
+               bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags);
+
+       trans_for_each_entry(trans, i)
+               bch2_btree_iter_downgrade(i->iter);
+out:
        percpu_ref_put(&c->writes);
+
+       if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
+               /* make sure we didn't drop or screw up locks: */
+               for_each_btree_iter(trans->entries[0].iter, linked) {
+                       bch2_btree_iter_verify_locks(linked);
+                       BUG_ON((trans->flags & BTREE_INSERT_NOUNLOCK) &&
+                              trans->did_work &&
+                              linked->uptodate >= BTREE_ITER_NEED_RELOCK);
+               }
+
+               /* make sure we didn't lose an error: */
+               if (!ret)
+                       trans_for_each_entry(trans, i)
+                               BUG_ON(!i->done);
+       }
+
+       BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR);
+
        return ret;
-split:
-       /*
-        * have to drop journal res before splitting, because splitting means
-        * allocating new btree nodes, and holding a journal reservation
-        * potentially blocks the allocator:
-        */
-       ret = bch2_btree_split_leaf(c, split, trans->flags);
+err:
+       flags = trans->flags;
 
        /*
-        * This can happen when we insert part of an extent - with an update
-        * with multiple keys, we don't want to redo the entire update - that's
-        * just too confusing:
+        * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree
+        * update; if we haven't done anything yet it doesn't apply
         */
-       if (!ret &&
-           (trans->flags & BTREE_INSERT_ATOMIC) &&
-           trans->did_work)
-               ret = -EINTR;
+       if (!trans->did_work)
+               flags &= ~BTREE_INSERT_NOUNLOCK;
 
-       if (ret)
-               goto err;
+       if (split) {
+               ret = bch2_btree_split_leaf(c, split, flags);
+
+               /*
+                * if the split succeeded without dropping locks the insert will
+                * still be atomic (in the BTREE_INSERT_ATOMIC sense, what the
+                * caller peeked() and is overwriting won't have changed)
+                */
+#if 0
+               /*
+                * XXX:
+                * split -> btree node merging (of parent node) might still drop
+                * locks when we're not passing it BTREE_INSERT_NOUNLOCK
+                */
+               if (!ret && !trans->did_work)
+                       goto retry;
+#endif
+
+               /*
+                * don't care if we got ENOSPC because we told split it
+                * couldn't block:
+                */
+               if (!ret || (flags & BTREE_INSERT_NOUNLOCK))
+                       ret = -EINTR;
+       }
 
-       /*
-        * if the split didn't have to drop locks the insert will still be
-        * atomic (in the BTREE_INSERT_ATOMIC sense, what the caller peeked()
-        * and is overwriting won't have changed)
-        */
-       goto retry_locks;
-err:
        if (cycle_gc_lock) {
-               down_read(&c->gc_lock);
+               if (!down_read_trylock(&c->gc_lock)) {
+                       if (flags & BTREE_INSERT_NOUNLOCK)
+                               goto out;
+
+                       bch2_btree_iter_unlock(trans->entries[0].iter);
+                       down_read(&c->gc_lock);
+               }
                up_read(&c->gc_lock);
        }
 
        if (ret == -EINTR) {
+               if (flags & BTREE_INSERT_NOUNLOCK)
+                       goto out;
+
                trans_for_each_entry(trans, i) {
                        int ret2 = bch2_btree_iter_traverse(i->iter);
                        if (ret2) {
                                ret = ret2;
                                goto out;
                        }
+
+                       BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK);
                }
 
                /*
                 * BTREE_INSERT_ATOMIC means we have to return -EINTR if we
                 * dropped locks:
                 */
-               if (!(trans->flags & BTREE_INSERT_ATOMIC))
+               if (!(flags & BTREE_INSERT_ATOMIC))
                        goto retry;
        }
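
A sketch of the caller-side retry contract for BTREE_INSERT_ATOMIC, matching the do/while retry the __bch2_write_inode() hunk later in this patch adopts (the traverse call and the key k are illustrative, not taken from this patch):

	do {
		ret = bch2_btree_iter_traverse(&iter);
		if (ret)
			break;

		/*
		 * -EINTR means locks were dropped, so whatever was peeked
		 * may have changed; recompute the update first:
		 */
		ret = bch2_btree_insert_at(c, NULL, NULL, &journal_seq,
					   BTREE_INSERT_ATOMIC,
					   BTREE_INSERT_ENTRY(&iter, &k));
	} while (ret == -EINTR);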
 
@@ -549,7 +612,7 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
        bch2_btree_iter_init(&iter, c, id, bkey_start_pos(&k->k),
                             BTREE_ITER_INTENT);
        ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, flags,
-                                 BTREE_INSERT_ENTRY(&iter, k));
+                                  BTREE_INSERT_ENTRY(&iter, k));
        bch2_btree_iter_unlock(&iter);
 
        return ret;
@@ -584,6 +647,11 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
                if (bkey_cmp(iter.pos, end) >= 0)
                        break;
 
+               if (k.k->type == KEY_TYPE_DISCARD) {
+                       bch2_btree_iter_next(&iter);
+                       continue;
+               }
+
                bkey_init(&delete.k);
 
                /*
@@ -615,8 +683,8 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
                }
 
                ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq,
-                                         BTREE_INSERT_NOFAIL,
-                                         BTREE_INSERT_ENTRY(&iter, &delete));
+                                          BTREE_INSERT_NOFAIL,
+                                          BTREE_INSERT_ENTRY(&iter, &delete));
                if (ret)
                        break;
 
index b17189ee2e4faf2673ec44cf9bdc1b2becbf875a..43112445040501828e49274b2d1fde18b09f98ad 100644 (file)
@@ -358,8 +358,9 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
            old.data_type != new.data_type) {
                BUG_ON(!c);
                bch2_fs_inconsistent(c,
-                       "different types of data in same bucket: %u, %u",
-                       old.data_type, new.data_type);
+                       "different types of data in same bucket: %s, %s",
+                       bch2_data_types[old.data_type],
+                       bch2_data_types[new.data_type]);
        }
 
        dev_usage = this_cpu_ptr(ca->usage_percpu);
index 2690cc4baeead200b0fb1fb9bdadcd9ce8d2a703..031b36f3f36ec94fd483c47b3b2e892e380e49bb 100644 (file)
@@ -109,14 +109,6 @@ static inline bool bch2_checksum_type_valid(const struct bch_fs *c,
        return true;
 }
 
-static const unsigned bch_crc_bytes[] = {
-       [BCH_CSUM_NONE]                         = 0,
-       [BCH_CSUM_CRC32C]                       = 4,
-       [BCH_CSUM_CRC64]                        = 8,
-       [BCH_CSUM_CHACHA20_POLY1305_80]         = 10,
-       [BCH_CSUM_CHACHA20_POLY1305_128]        = 16,
-};
-
 /* returns true if not equal */
 static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r)
 {
index df9913f8967b8641cf3b1a4e0afee0541430f4f5..36dca6b22a9144fac247ccd1a22bdac7734c5c1a 100644 (file)
@@ -12,7 +12,8 @@
 
 unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
 {
-       unsigned len = bkey_val_bytes(d.k) - sizeof(struct bch_dirent);
+       unsigned len = bkey_val_bytes(d.k) -
+               offsetof(struct bch_dirent, d_name);
 
        while (len && !d.v->d_name[len - 1])
                --len;
@@ -22,7 +23,8 @@ unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
 
 static unsigned dirent_val_u64s(unsigned len)
 {
-       return DIV_ROUND_UP(sizeof(struct bch_dirent) + len, sizeof(u64));
+       return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len,
+                           sizeof(u64));
 }
 
 static u64 bch2_dirent_hash(const struct bch_hash_info *info,
@@ -98,7 +100,7 @@ const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k)
                if (bkey_val_u64s(k.k) > dirent_val_u64s(len))
                        return "value too big";
 
-               if (len > NAME_MAX)
+               if (len > BCH_NAME_MAX)
                        return "dirent name too big";
 
                if (memchr(d.v->d_name, '/', len))
@@ -141,9 +143,14 @@ static struct bkey_i_dirent *dirent_create_key(u8 type,
        struct bkey_i_dirent *dirent;
        unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len);
 
+       if (name->len > BCH_NAME_MAX)
+               return ERR_PTR(-ENAMETOOLONG);
+
+       BUG_ON(u64s > U8_MAX);
+
        dirent = kmalloc(u64s * sizeof(u64), GFP_NOFS);
        if (!dirent)
-               return NULL;
+               return ERR_PTR(-ENOMEM);
 
        bkey_dirent_init(&dirent->k_i);
        dirent->k.u64s = u64s;
@@ -153,7 +160,8 @@ static struct bkey_i_dirent *dirent_create_key(u8 type,
        memcpy(dirent->v.d_name, name->name, name->len);
        memset(dirent->v.d_name + name->len, 0,
               bkey_val_bytes(&dirent->k) -
-              (sizeof(struct bch_dirent) + name->len));
+              offsetof(struct bch_dirent, d_name) -
+              name->len);
 
        EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len);
 
@@ -169,8 +177,8 @@ int bch2_dirent_create(struct bch_fs *c, u64 dir_inum,
        int ret;
 
        dirent = dirent_create_key(type, name, dst_inum);
-       if (!dirent)
-               return -ENOMEM;
+       if (IS_ERR(dirent))
+               return PTR_ERR(dirent);
 
        ret = bch2_hash_set(bch2_dirent_hash_desc, hash_info, c, dir_inum,
                           journal_seq, &dirent->k_i, flags);
@@ -204,7 +212,7 @@ int bch2_dirent_rename(struct bch_fs *c,
        struct bpos src_pos = bch2_dirent_pos(src_dir, src_name);
        struct bpos dst_pos = bch2_dirent_pos(dst_dir, dst_name);
        bool need_whiteout;
-       int ret = -ENOMEM;
+       int ret;
 
        bch2_btree_iter_init(&src_iter, c, BTREE_ID_DIRENTS, src_pos,
                             BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
@@ -218,15 +226,19 @@ int bch2_dirent_rename(struct bch_fs *c,
 
        if (mode == BCH_RENAME_EXCHANGE) {
                new_src = dirent_create_key(0, src_name, 0);
-               if (!new_src)
+               if (IS_ERR(new_src)) {
+                       ret = PTR_ERR(new_src);
                        goto err;
+               }
        } else {
                new_src = (void *) &delete;
        }
 
        new_dst = dirent_create_key(0, dst_name, 0);
-       if (!new_dst)
+       if (IS_ERR(new_dst)) {
+               ret = PTR_ERR(new_dst);
                goto err;
+       }
 retry:
        /*
         * Note that on -EINTR/dropped locks we're not restarting the lookup
index d7b17195ee84307aac832aa7d5b2fcc06d54d8d3..737b9be33e0af3b8ffcc5f89baa10789064404c2 100644 (file)
@@ -257,12 +257,12 @@ static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h)
        int ret;
 
        mutex_lock(&h->inode->ei_update_lock);
-       if (h->new_i_size != U64_MAX)
-               i_size_write(&h->inode->v, h->new_i_size);
-
        i_sectors_acct(c, h->inode, &h->quota_res, h->sectors);
 
        ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h);
+
+       if (!ret && h->new_i_size != U64_MAX)
+               i_size_write(&h->inode->v, h->new_i_size);
        mutex_unlock(&h->inode->ei_update_lock);
 
        bch2_quota_reservation_put(c, h->inode, &h->quota_res);
@@ -348,17 +348,25 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook,
                        return BTREE_INSERT_NEED_TRAVERSE;
                }
 
-               BUG_ON(h->inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY);
+               /* truncate in progress? */
+               if (h->inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)
+                       goto no_i_size_update;
 
                h->inode_u.bi_size = offset;
                do_pack = true;
 
                inode->ei_inode.bi_size = offset;
 
-               if (h->op->is_dio)
-                       i_size_write(&inode->v, offset);
+               spin_lock(&inode->v.i_lock);
+               if (offset > inode->v.i_size) {
+                       if (h->op->is_dio)
+                               i_size_write(&inode->v, offset);
+                       else
+                               BUG();
+               }
+               spin_unlock(&inode->v.i_lock);
        }
-
+no_i_size_update:
        if (sectors) {
                if (!h->need_inode_update) {
                        h->need_inode_update = true;
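
Taking inode->v.i_lock around the size check and i_size_write() here (and in the write paths below) follows the usual VFS rule: i_size_write() must be serialized against concurrent updaters (on 32-bit SMP it is a seqcount write), and holding the lock across the comparison makes the check-then-update atomic rather than racy.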
@@ -1457,8 +1465,10 @@ int bch2_write_end(struct file *file, struct address_space *mapping,
                copied = 0;
        }
 
+       spin_lock(&inode->v.i_lock);
        if (pos + copied > inode->v.i_size)
                i_size_write(&inode->v, pos + copied);
+       spin_unlock(&inode->v.i_lock);
 
        if (copied) {
                if (!PageUptodate(page))
@@ -1563,8 +1573,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
        nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE);
        inode->ei_last_dirtied = (unsigned long) current;
 
+       spin_lock(&inode->v.i_lock);
        if (pos + copied > inode->v.i_size)
                i_size_write(&inode->v, pos + copied);
+       spin_unlock(&inode->v.i_lock);
 
        if (copied < len &&
            ((offset + copied) & (PAGE_SIZE - 1))) {
@@ -2047,10 +2059,17 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        int ret;
 
-       ret = filemap_write_and_wait_range(inode->v.i_mapping, start, end);
+       ret = file_write_and_wait_range(file, start, end);
        if (ret)
                return ret;
 
+       if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC))
+               goto out;
+
+       ret = sync_inode_metadata(&inode->v, 1);
+       if (ret)
+               return ret;
+out:
        if (c->opts.journal_flush_disabled)
                return 0;
 
@@ -2149,25 +2168,61 @@ static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from)
                                    from, from + PAGE_SIZE);
 }
 
+static int bch2_extend(struct bch_inode_info *inode, struct iattr *iattr)
+{
+       struct bch_fs *c = inode->v.i_sb->s_fs_info;
+       struct address_space *mapping = inode->v.i_mapping;
+       int ret;
+
+       ret = filemap_write_and_wait_range(mapping,
+                       inode->ei_inode.bi_size, S64_MAX);
+       if (ret)
+               return ret;
+
+       truncate_setsize(&inode->v, iattr->ia_size);
+       setattr_copy(&inode->v, iattr);
+
+       mutex_lock(&inode->ei_update_lock);
+       inode->v.i_mtime = inode->v.i_ctime = current_time(&inode->v);
+       ret = bch2_write_inode_size(c, inode, inode->v.i_size);
+       mutex_unlock(&inode->ei_update_lock);
+
+       return ret;
+}
+
 int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct address_space *mapping = inode->v.i_mapping;
-       bool shrink = iattr->ia_size <= inode->v.i_size;
        struct i_sectors_hook i_sectors_hook =
                i_sectors_hook_init(inode, BCH_INODE_I_SIZE_DIRTY);
+       bool shrink;
        int ret = 0;
 
        inode_dio_wait(&inode->v);
        pagecache_block_get(&mapping->add_lock);
 
-       truncate_setsize(&inode->v, iattr->ia_size);
+       BUG_ON(inode->v.i_size < inode->ei_inode.bi_size);
+
+       shrink = iattr->ia_size <= inode->v.i_size;
+
+       if (!shrink) {
+               ret = bch2_extend(inode, iattr);
+               goto err_put_pagecache;
+       }
+
+       ret = bch2_truncate_page(inode, iattr->ia_size);
+       if (unlikely(ret))
+               goto err_put_pagecache;
 
-       /* sync appends.. */
-       /* XXX what protects inode->i_size? */
        if (iattr->ia_size > inode->ei_inode.bi_size)
                ret = filemap_write_and_wait_range(mapping,
-                                                  inode->ei_inode.bi_size, S64_MAX);
+                               inode->ei_inode.bi_size,
+                               iattr->ia_size - 1);
+       else if (iattr->ia_size & (PAGE_SIZE - 1))
+               ret = filemap_write_and_wait_range(mapping,
+                               round_down(iattr->ia_size, PAGE_SIZE),
+                               iattr->ia_size - 1);
        if (ret)
                goto err_put_pagecache;
 
@@ -2175,41 +2230,31 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr)
 
        ret = i_sectors_dirty_start(c, &i_sectors_hook);
        if (unlikely(ret))
-               goto err;
+               goto err_put_pagecache;
 
-       /*
-        * There might be persistent reservations (from fallocate())
-        * above i_size, which bch2_inode_truncate() will discard - we're
-        * only supposed to discard them if we're doing a real truncate
-        * here (new i_size < current i_size):
-        */
-       if (shrink) {
-               ret = bch2_truncate_page(inode, iattr->ia_size);
-               if (unlikely(ret))
-                       goto err;
+       truncate_setsize(&inode->v, iattr->ia_size);
 
-               ret = bch2_inode_truncate(c, inode->v.i_ino,
-                                         round_up(iattr->ia_size, PAGE_SIZE) >> 9,
-                                         &i_sectors_hook.hook,
-                                         &inode->ei_journal_seq);
-               if (unlikely(ret))
-                       goto err;
-       }
+       ret = bch2_inode_truncate(c, inode->v.i_ino,
+                                 round_up(iattr->ia_size, PAGE_SIZE) >> 9,
+                                 &i_sectors_hook.hook,
+                                 &inode->ei_journal_seq);
+       if (unlikely(ret))
+               goto err_put_sectors_dirty;
 
        setattr_copy(&inode->v, iattr);
        inode->v.i_mtime = inode->v.i_ctime = current_time(&inode->v);
-err:
-       /*
-        * On error - in particular, bch2_truncate_page() error - don't clear
-        * I_SIZE_DIRTY, as we've left data above i_size!:
-        */
-       if (ret)
-               i_sectors_hook.flags &= ~BCH_INODE_I_SIZE_DIRTY;
-
+out:
        ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
 err_put_pagecache:
        pagecache_block_put(&mapping->add_lock);
        return ret;
+err_put_sectors_dirty:
+       /*
+        * On error - in particular, bch2_truncate_page() error - don't clear
+        * I_SIZE_DIRTY, as we've left data above i_size!:
+        */
+       i_sectors_hook.flags &= ~BCH_INODE_I_SIZE_DIRTY;
+       goto out;
 }
 
 /* fallocate: */
@@ -2389,7 +2434,6 @@ btree_iter_err:
        if (ret)
                goto err_put_sectors_dirty;
 
-       i_size_write(&inode->v, new_size);
        i_sectors_hook.new_i_size = new_size;
 err_put_sectors_dirty:
        ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret;
index dc6c651df2f307f499bf5353d60b4c62d50d2ddd..3b7f78e731b8c1381e62e91fd3bdb77ed0252dd4 100644 (file)
@@ -106,6 +106,8 @@ int __must_check __bch2_write_inode(struct bch_fs *c,
                        break;
                }
 
+               BUG_ON(inode_u.bi_size != inode->ei_inode.bi_size);
+
                if (set) {
                        ret = set(inode, &inode_u, p);
                        if (ret)
@@ -114,6 +116,10 @@ int __must_check __bch2_write_inode(struct bch_fs *c,
 
                BUG_ON(i_nlink < nlink_bias(inode->v.i_mode));
 
+               BUG_ON(inode_u.bi_size != inode->ei_inode.bi_size &&
+                      !(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) &&
+                      inode_u.bi_size > i_size_read(&inode->v));
+
                inode_u.bi_mode = inode->v.i_mode;
                inode_u.bi_uid  = i_uid_read(&inode->v);
                inode_u.bi_gid  = i_gid_read(&inode->v);
@@ -129,11 +135,17 @@ int __must_check __bch2_write_inode(struct bch_fs *c,
                ret = bch2_btree_insert_at(c, NULL, NULL,
                                &inode->ei_journal_seq,
                                BTREE_INSERT_ATOMIC|
+                               BTREE_INSERT_NOUNLOCK|
                                BTREE_INSERT_NOFAIL,
                                BTREE_INSERT_ENTRY(&iter, &inode_p.inode.k_i));
        } while (ret == -EINTR);
 
        if (!ret) {
+               /*
+                * the btree node lock protects inode->ei_inode, not
+                * ei_update_lock; this is important for inode updates via
+                * bchfs_write_index_update
+                */
                inode->ei_inode = inode_u;
                inode->ei_qid   = bch_qid(&inode_u);
        }
@@ -1107,7 +1119,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
               le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
        buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
        buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
-       buf->f_namelen  = NAME_MAX;
+       buf->f_namelen  = BCH_NAME_MAX;
 
        return 0;
 }
index addd51f08c9abf9f2110e84a8917b88834244e5b..b4fe27f8f5ca3fb780d9ffea3c095c67a810841e 100644 (file)
@@ -75,6 +75,19 @@ static inline size_t journal_entry_u64s_reserve(struct journal_buf *buf)
        return BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX);
 }
 
+static inline bool journal_entry_empty(struct jset *j)
+{
+       struct jset_entry *i;
+
+       if (j->seq != j->last_seq)
+               return false;
+
+       vstruct_for_each(j, i)
+               if (i->type || i->u64s)
+                       return false;
+       return true;
+}
+
 static enum {
        JOURNAL_ENTRY_ERROR,
        JOURNAL_ENTRY_INUSE,
@@ -129,6 +142,11 @@ static enum {
        /* XXX: why set this here, and not in bch2_journal_write()? */
        buf->data->last_seq     = cpu_to_le64(journal_last_seq(j));
 
+       if (journal_entry_empty(buf->data))
+               clear_bit(JOURNAL_NOT_EMPTY, &j->flags);
+       else
+               set_bit(JOURNAL_NOT_EMPTY, &j->flags);
+
        journal_pin_new_entry(j, 1);
 
        bch2_journal_buf_init(j);
@@ -884,8 +902,18 @@ void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca)
 
 void bch2_fs_journal_stop(struct journal *j)
 {
+       struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
        wait_event(j->wait, journal_flush_write(j));
 
+       /* do we need to write another journal entry? */
+       if (test_bit(JOURNAL_NOT_EMPTY, &j->flags) ||
+           c->btree_roots_dirty)
+               bch2_journal_meta(j);
+
+       BUG_ON(!bch2_journal_error(j) &&
+              test_bit(JOURNAL_NOT_EMPTY, &j->flags));
+
        cancel_delayed_work_sync(&j->write_work);
        cancel_delayed_work_sync(&j->reclaim_work);
 }
index 36ba6a4daf84097953cf5f84983bd45dab97bfb0..8a4e7b2a92ce7cdcea1d8184036639f9bd998dc5 100644 (file)
 
 #include <trace/events/bcachefs.h>
 
-static struct jset_entry *bch2_journal_find_entry(struct jset *j, unsigned type,
-                                                enum btree_id id)
-{
-       struct jset_entry *entry;
-
-       for_each_jset_entry_type(entry, j, type)
-               if (entry->btree_id == id)
-                       return entry;
-
-       return NULL;
-}
-
-struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *c, struct jset *j,
-                                          enum btree_id id, unsigned *level)
-{
-       struct bkey_i *k;
-       struct jset_entry *entry =
-               bch2_journal_find_entry(j, BCH_JSET_ENTRY_btree_root, id);
-
-       if (!entry)
-               return NULL;
-
-       if (!entry->u64s)
-               return ERR_PTR(-EINVAL);
-
-       k = entry->start;
-       *level = entry->level;
-       *level = entry->level;
-       return k;
-}
-
 struct journal_list {
        struct closure          cl;
        struct mutex            lock;
@@ -717,6 +686,37 @@ void bch2_journal_entries_free(struct list_head *list)
        }
 }
 
+int bch2_journal_set_seq(struct bch_fs *c, u64 last_seq, u64 end_seq)
+{
+       struct journal *j = &c->journal;
+       struct journal_entry_pin_list *p;
+       u64 seq, nr = end_seq - last_seq + 1;
+
+       if (nr > j->pin.size) {
+               free_fifo(&j->pin);
+               init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
+               if (!j->pin.data) {
+                       bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
+                       return -ENOMEM;
+               }
+       }
+
+       atomic64_set(&j->seq, end_seq);
+       j->last_seq_ondisk = last_seq;
+
+       j->pin.front    = last_seq;
+       j->pin.back     = end_seq + 1;
+
+       fifo_for_each_entry_ptr(p, &j->pin, seq) {
+               INIT_LIST_HEAD(&p->list);
+               INIT_LIST_HEAD(&p->flushed);
+               atomic_set(&p->count, 0);
+               p->devs.nr = 0;
+       }
+
+       return 0;
+}
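
bch2_journal_set_seq() sizes the pin FIFO to a power of two covering [last_seq, end_seq] (via roundup_pow_of_two) and sets front/back to the sequence numbers themselves, so a pin list is located by masking the seq rather than by maintaining a separate index. A sketch of that indexing scheme; the fifo type here is a simplified stand-in:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    struct pin_list { int count; };

    struct pin_fifo {
            uint64_t front, back;   /* front = oldest seq, back = newest seq + 1 */
            size_t   size;          /* power of two */
            struct pin_list *data;
    };

    static struct pin_list *pin_for_seq(struct pin_fifo *f, uint64_t seq)
    {
            assert(seq >= f->front && seq < f->back);
            return &f->data[seq & (f->size - 1)];   /* mask, no modulo needed */
    }

    int main(void)
    {
            static struct pin_list lists[8];
            /* covers seqs 100..105, as set_seq would after reading the journal */
            struct pin_fifo f = { .front = 100, .back = 106,
                                  .size = 8, .data = lists };

            pin_for_seq(&f, 103)->count = 1;
            printf("%d\n", pin_for_seq(&f, 103)->count);    /* 1 */
            return 0;
    }
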
+
 int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 {
        struct journal *j = &c->journal;
@@ -724,10 +724,9 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
        struct journal_replay *i;
        struct journal_entry_pin_list *p;
        struct bch_dev *ca;
-       u64 cur_seq, end_seq, seq;
+       u64 cur_seq, end_seq;
        unsigned iter;
-       size_t entries = 0;
-       u64 nr, keys = 0;
+       size_t keys = 0, entries = 0;
        bool degraded = false;
        int ret = 0;
 
@@ -783,43 +782,13 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
                }
        }
 
-       list_for_each_entry(i, list, list) {
-               struct jset_entry *entry;
-               struct bkey_i *k, *_n;
-
-               for_each_jset_key(k, _n, entry, &i->j)
-                       keys++;
-       }
-
        i = list_last_entry(list, struct journal_replay, list);
 
-       nr = le64_to_cpu(i->j.seq) - le64_to_cpu(i->j.last_seq) + 1;
-
-       fsck_err_on(c->sb.clean && (keys || nr > 1), c,
-                   "filesystem marked clean but journal not empty (%llu keys in %llu entries)",
-                   keys, nr);
-
-       if (nr > j->pin.size) {
-               free_fifo(&j->pin);
-               init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
-               if (!j->pin.data) {
-                       bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
-                       return -ENOMEM;
-               }
-       }
-
-       atomic64_set(&j->seq, le64_to_cpu(i->j.seq));
-       j->last_seq_ondisk = le64_to_cpu(i->j.last_seq);
-
-       j->pin.front    = le64_to_cpu(i->j.last_seq);
-       j->pin.back     = le64_to_cpu(i->j.seq) + 1;
-
-       fifo_for_each_entry_ptr(p, &j->pin, seq) {
-               INIT_LIST_HEAD(&p->list);
-               INIT_LIST_HEAD(&p->flushed);
-               atomic_set(&p->count, 0);
-               p->devs.nr = 0;
-       }
+       ret = bch2_journal_set_seq(c,
+                                  le64_to_cpu(i->j.last_seq),
+                                  le64_to_cpu(i->j.seq));
+       if (ret)
+               return ret;
 
        mutex_lock(&j->blacklist_lock);
 
@@ -842,6 +811,8 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
                                struct journal_replay, list)->j.seq);
 
        list_for_each_entry(i, list, list) {
+               struct jset_entry *entry;
+               struct bkey_i *k, *_n;
                bool blacklisted;
 
                mutex_lock(&j->blacklist_lock);
@@ -863,10 +834,13 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
                        journal_last_seq(j), end_seq);
 
                cur_seq = le64_to_cpu(i->j.seq) + 1;
+
+               for_each_jset_key(k, _n, entry, &i->j)
+                       keys++;
                entries++;
        }
 
-       bch_info(c, "journal read done, %llu keys in %zu entries, seq %llu",
+       bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
                 keys, entries, journal_cur_seq(j));
 fsck_err:
        return ret;
@@ -950,7 +924,8 @@ int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
        j->replay_journal_seq = 0;
 
        bch2_journal_set_replay_done(j);
-       ret = bch2_journal_flush_all_pins(j);
+       bch2_journal_flush_all_pins(j);
+       ret = bch2_journal_error(j);
 err:
        bch2_journal_entries_free(list);
        return ret;
index 4236b7fc37ff1f37787be02302d0917044ebb3a7..e303df9241dec5a79bb22f509d72f31d6103c8d2 100644 (file)
@@ -1,9 +1,6 @@
 #ifndef _BCACHEFS_JOURNAL_IO_H
 #define _BCACHEFS_JOURNAL_IO_H
 
-struct bkey_i *bch2_journal_find_btree_root(struct bch_fs *, struct jset *,
-                                           enum btree_id, unsigned *);
-
 /*
  * Only used for holding the journal entries we read in btree_journal_read()
  * during cache_registration
@@ -37,6 +34,7 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
        for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)        \
                vstruct_for_each_safe(entry, k, _n)
 
+int bch2_journal_set_seq(struct bch_fs *c, u64, u64);
 int bch2_journal_read(struct bch_fs *, struct list_head *);
 
 int bch2_journal_entry_sectors(struct journal *);
index 0e3e5b6abb39f42868342390dded1b9478200317..394b72bb55187a00ac852e2958dfa083b90cbf18 100644 (file)
@@ -337,34 +337,22 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
        return ret;
 }
 
-int bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
+void bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
 {
-       struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct journal_entry_pin *pin;
        u64 pin_seq;
-       bool flush;
 
        if (!test_bit(JOURNAL_STARTED, &j->flags))
-               return 0;
-again:
-       wait_event(j->wait, journal_flush_done(j, seq_to_flush, &pin, &pin_seq));
-       if (pin) {
-               /* flushing a journal pin might cause a new one to be added: */
-               pin->flush(j, pin, pin_seq);
-               goto again;
-       }
-
-       spin_lock(&j->lock);
-       flush = journal_last_seq(j) != j->last_seq_ondisk ||
-               (seq_to_flush == U64_MAX && c->btree_roots_dirty);
-       spin_unlock(&j->lock);
+               return;
 
-       return flush ? bch2_journal_meta(j) : 0;
-}
+       while (1) {
+               wait_event(j->wait, journal_flush_done(j, seq_to_flush,
+                                                      &pin, &pin_seq));
+               if (!pin)
+                       break;
 
-int bch2_journal_flush_all_pins(struct journal *j)
-{
-       return bch2_journal_flush_pins(j, U64_MAX);
+               pin->flush(j, pin, pin_seq);
+       }
 }
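
The rework makes the flush loop void: it drains pins until journal_flush_done() reports none left (flushing a pin may itself queue new ones, hence the loop), and callers that care about failure ask bch2_journal_error() afterwards. The calling convention, sketched generically in userspace:

    #include <stdbool.h>
    #include <stdio.h>

    struct worklist { int pending; int error; };

    static void flush_all(struct worklist *w)
    {
            bool requeued = false;

            /* flushing an item can queue another, so loop until truly empty */
            while (w->pending) {
                    w->pending--;
                    if (!requeued && !w->pending) {
                            w->pending++;   /* simulate a flush adding new work */
                            requeued = true;
                    }
            }
    }

    /* the new convention: the flush itself is void, errors are read afterwards */
    static int flush_and_check(struct worklist *w)
    {
            flush_all(w);
            return w->error;
    }

    int main(void)
    {
            struct worklist w = { .pending = 5 };

            printf("%d\n", flush_and_check(&w));    /* 0 */
            return 0;
    }
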
 
 int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
@@ -383,7 +371,9 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
                        seq = iter;
        spin_unlock(&j->lock);
 
-       ret = bch2_journal_flush_pins(j, seq);
+       bch2_journal_flush_pins(j, seq);
+
+       ret = bch2_journal_error(j);
        if (ret)
                return ret;
 
@@ -404,7 +394,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
        }
        spin_unlock(&j->lock);
 
-       bch2_replicas_gc_end(c, ret);
+       ret = bch2_replicas_gc_end(c, ret);
        mutex_unlock(&c->replicas_gc_lock);
 
        return ret;
index 7d460c35cfaea6a27f4e7e5cf63f10a386676326..eb22790251decd34df5e186ac0b88c4d4165efb4 100644 (file)
@@ -29,8 +29,13 @@ void bch2_journal_pin_add_if_older(struct journal *,
 void bch2_journal_reclaim_fast(struct journal *);
 void bch2_journal_reclaim_work(struct work_struct *);
 
-int bch2_journal_flush_pins(struct journal *, u64);
-int bch2_journal_flush_all_pins(struct journal *);
+void bch2_journal_flush_pins(struct journal *, u64);
+
+static inline void bch2_journal_flush_all_pins(struct journal *j)
+{
+       bch2_journal_flush_pins(j, U64_MAX);
+}
+
 int bch2_journal_flush_device_pins(struct journal *, int);
 
 #endif /* _BCACHEFS_JOURNAL_RECLAIM_H */
index a27e0548c098ad4ed07d7795e63643d95fb1b00f..effbeece1ed98c8822d7bcc593071891db9e9e13 100644 (file)
@@ -117,6 +117,7 @@ enum {
        JOURNAL_REPLAY_DONE,
        JOURNAL_STARTED,
        JOURNAL_NEED_WRITE,
+       JOURNAL_NOT_EMPTY,
 };
 
 /* Embedded in struct bch_fs */
index ea519102a22825f03a2923e2dbd5792852b3aa38..215c5aa5be0ecc59e0ab50340b35e5eefa5777e7 100644 (file)
@@ -126,7 +126,13 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 retry:
                        if (!bch2_extent_has_device(bkey_i_to_s_c_extent(&b->key),
                                                    dev_idx)) {
-                               bch2_btree_iter_set_locks_want(&iter, 0);
+                               /*
+                                * we might have found a btree node key we
+                                * needed to update, and then tried to update it
+                                * but got -EINTR after upgrading the iter, but
+                                * then raced and the node is now gone:
+                                */
+                               bch2_btree_iter_downgrade(&iter);
 
                                ret = bch2_mark_bkey_replicas(c, BCH_DATA_BTREE,
                                                              bkey_i_to_s_c(&b->key));
@@ -141,11 +147,6 @@ retry:
                                if (ret)
                                        goto err;
 
-                               if (!bch2_btree_iter_set_locks_want(&iter, U8_MAX)) {
-                                       b = bch2_btree_iter_peek_node(&iter);
-                                       goto retry;
-                               }
-
                                ret = bch2_btree_node_update_key(c, &iter, b, new_key);
                                if (ret == -EINTR) {
                                        b = bch2_btree_iter_peek_node(&iter);
@@ -160,7 +161,7 @@ retry:
 
        ret = 0;
 out:
-       bch2_replicas_gc_end(c, ret);
+       ret = bch2_replicas_gc_end(c, ret);
        mutex_unlock(&c->replicas_gc_lock);
 
        return ret;
index e7ab8870d3ac2a35b60fc42ad0bd0b0278045d43..f476033e707f3b2cf74345fd53f8ccfb593c5a58 100644 (file)
@@ -137,6 +137,9 @@ enum opt_type {
        BCH_OPT(degraded,               u8,     OPT_MOUNT,              \
                OPT_BOOL(),                                             \
                NO_SB_OPT,                      false)                  \
+       BCH_OPT(discard,                u8,     OPT_MOUNT,              \
+               OPT_BOOL(),                                             \
+               NO_SB_OPT,                      false)                  \
        BCH_OPT(verbose_recovery,       u8,     OPT_MOUNT,              \
                OPT_BOOL(),                                             \
                NO_SB_OPT,                      false)                  \
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
new file mode 100644 (file)
index 0000000..58aee7a
--- /dev/null
@@ -0,0 +1,346 @@
+
+#include "bcachefs.h"
+#include "alloc.h"
+#include "btree_gc.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "btree_io.h"
+#include "error.h"
+#include "fsck.h"
+#include "journal_io.h"
+#include "quota.h"
+#include "recovery.h"
+#include "super-io.h"
+
+#include <linux/stat.h>
+
+struct bkey_i *btree_root_find(struct bch_fs *c,
+                              struct bch_sb_field_clean *clean,
+                              struct jset *j,
+                              enum btree_id id, unsigned *level)
+{
+       struct bkey_i *k;
+       struct jset_entry *entry, *start, *end;
+
+       if (clean) {
+               start = clean->start;
+               end = vstruct_end(&clean->field);
+       } else {
+               start = j->start;
+               end = vstruct_last(j);
+       }
+
+       for (entry = start; entry < end; entry = vstruct_next(entry))
+               if (entry->type == BCH_JSET_ENTRY_btree_root &&
+                   entry->btree_id == id)
+                       goto found;
+
+       return NULL;
+found:
+       if (!entry->u64s)
+               return ERR_PTR(-EINVAL);
+
+       k = entry->start;
+       *level = entry->level;
+       return k;
+}
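
btree_root_find() replaces the deleted bch2_journal_find_btree_root(): the same linear scan, but over whichever range the caller has - the superblock's clean section or the last journal entry. A userspace sketch of that select-range-then-scan shape, with a simplified entry type:

    #include <stddef.h>
    #include <stdio.h>

    struct entry { int type, btree_id; };

    #define ENTRY_BTREE_ROOT 1

    /* scan whichever range the caller has: "clean" takes precedence */
    static const struct entry *root_find(const struct entry *clean, size_t clean_nr,
                                         const struct entry *jset,  size_t jset_nr,
                                         int id)
    {
            const struct entry *e   = clean ? clean            : jset;
            const struct entry *end = clean ? clean + clean_nr : jset + jset_nr;

            for (; e < end; e++)
                    if (e->type == ENTRY_BTREE_ROOT && e->btree_id == id)
                            return e;
            return NULL;
    }

    int main(void)
    {
            const struct entry sb_clean[] = { { ENTRY_BTREE_ROOT, 0 },
                                              { ENTRY_BTREE_ROOT, 2 } };

            printf("%d\n", root_find(sb_clean, 2, NULL, 0, 2) != NULL);  /* 1 */
            printf("%d\n", root_find(NULL, 0, sb_clean, 2, 7) != NULL);  /* 0 */
            return 0;
    }
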
+
+static int verify_superblock_clean(struct bch_fs *c,
+                                  struct bch_sb_field_clean *clean,
+                                  struct jset *j)
+{
+       unsigned i;
+       int ret = 0;
+
+       if (!clean || !j)
+               return 0;
+
+       if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c,
+                       "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown",
+                       le64_to_cpu(clean->journal_seq),
+                       le64_to_cpu(j->seq)))
+               bch2_fs_mark_clean(c, false);
+
+       mustfix_fsck_err_on(j->read_clock != clean->read_clock, c,
+                       "superblock read clock doesn't match journal after clean shutdown");
+       mustfix_fsck_err_on(j->write_clock != clean->write_clock, c,
+                       "superblock read clock doesn't match journal after clean shutdown");
+
+       for (i = 0; i < BTREE_ID_NR; i++) {
+               struct bkey_i *k1, *k2;
+               unsigned l1 = 0, l2 = 0;
+
+               k1 = btree_root_find(c, clean, NULL, i, &l1);
+               k2 = btree_root_find(c, NULL, j, i, &l2);
+
+               if (!k1 && !k2)
+                       continue;
+
+               mustfix_fsck_err_on(!k1 || !k2 ||
+                                   IS_ERR(k1) ||
+                                   IS_ERR(k2) ||
+                                   k1->k.u64s != k2->k.u64s ||
+                                   memcmp(k1, k2, bkey_bytes(k1)) ||
+                                   l1 != l2, c,
+                       "superblock btree root doesn't match journal after clean shutdown");
+       }
+fsck_err:
+       return ret;
+}
+
+static bool journal_empty(struct list_head *journal)
+{
+       struct journal_replay *i;
+       struct jset_entry *entry;
+
+       if (list_empty(journal))
+               return true;
+
+       i = list_last_entry(journal, struct journal_replay, list);
+
+       if (i->j.last_seq != i->j.seq)
+               return false;
+
+       list_for_each_entry(i, journal, list) {
+               vstruct_for_each(&i->j, entry) {
+                       if (entry->type == BCH_JSET_ENTRY_btree_root)
+                               continue;
+
+                       if (entry->type == BCH_JSET_ENTRY_btree_keys &&
+                           !entry->u64s)
+                               continue;
+                       return false;
+               }
+       }
+
+       return true;
+}
+
+int bch2_fs_recovery(struct bch_fs *c)
+{
+       const char *err = "cannot allocate memory";
+       struct bch_sb_field_clean *clean = NULL, *sb_clean = NULL;
+       LIST_HEAD(journal);
+       struct jset *j = NULL;
+       unsigned i;
+       int ret;
+
+       mutex_lock(&c->sb_lock);
+       if (!bch2_sb_get_replicas(c->disk_sb.sb)) {
+               bch_info(c, "building replicas info");
+               set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags);
+       }
+
+       if (c->sb.clean)
+               sb_clean = bch2_sb_get_clean(c->disk_sb.sb);
+       if (sb_clean) {
+               clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field),
+                               GFP_KERNEL);
+               if (!clean) {
+                       ret = -ENOMEM;
+                       mutex_unlock(&c->sb_lock);
+                       goto err;
+               }
+       }
+       mutex_unlock(&c->sb_lock);
+
+       if (clean)
+               bch_info(c, "recovering from clean shutdown, journal seq %llu",
+                        le64_to_cpu(clean->journal_seq));
+
+       if (!clean || !c->opts.nofsck) {
+               ret = bch2_journal_read(c, &journal);
+               if (ret)
+                       goto err;
+
+               j = &list_entry(journal.prev, struct journal_replay, list)->j;
+       } else {
+               ret = bch2_journal_set_seq(c,
+                                          le64_to_cpu(clean->journal_seq),
+                                          le64_to_cpu(clean->journal_seq));
+               BUG_ON(ret);
+       }
+
+       ret = verify_superblock_clean(c, clean, j);
+       if (ret)
+               goto err;
+
+       fsck_err_on(clean && !journal_empty(&journal), c,
+                   "filesystem marked clean but journal not empty");
+
+       if (clean) {
+               c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock);
+               c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock);
+       } else {
+               c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock);
+               c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock);
+       }
+
+       for (i = 0; i < BTREE_ID_NR; i++) {
+               unsigned level;
+               struct bkey_i *k;
+
+               k = btree_root_find(c, clean, j, i, &level);
+               if (!k)
+                       continue;
+
+               err = "invalid btree root pointer";
+               if (IS_ERR(k))
+                       goto err;
+
+               err = "error reading btree root";
+               if (bch2_btree_root_read(c, i, k, level)) {
+                       if (i != BTREE_ID_ALLOC)
+                               goto err;
+
+                       mustfix_fsck_err(c, "error reading btree root");
+               }
+       }
+
+       for (i = 0; i < BTREE_ID_NR; i++)
+               if (!c->btree_roots[i].b)
+                       bch2_btree_root_alloc(c, i);
+
+       err = "error reading allocation information";
+       ret = bch2_alloc_read(c, &journal);
+       if (ret)
+               goto err;
+
+       set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
+
+       bch_verbose(c, "starting mark and sweep:");
+       err = "error in recovery";
+       ret = bch2_initial_gc(c, &journal);
+       if (ret)
+               goto err;
+       bch_verbose(c, "mark and sweep done");
+
+       if (c->opts.noreplay)
+               goto out;
+
+       /*
+        * Mark dirty before journal replay, fsck:
+        * XXX: after a clean shutdown, this could be done lazily only when fsck
+        * finds an error
+        */
+       bch2_fs_mark_clean(c, false);
+
+       /*
+        * bch2_fs_journal_start() can't happen sooner, or btree_gc_finish()
+        * will give spurious errors about oldest_gen > bucket_gen -
+        * this is a hack but oh well.
+        */
+       bch2_fs_journal_start(&c->journal);
+
+       err = "error starting allocator";
+       if (bch2_fs_allocator_start(c))
+               goto err;
+
+       bch_verbose(c, "starting journal replay:");
+       err = "journal replay failed";
+       ret = bch2_journal_replay(c, &journal);
+       if (ret)
+               goto err;
+       bch_verbose(c, "journal replay done");
+
+       if (c->opts.norecovery)
+               goto out;
+
+       bch_verbose(c, "starting fsck:");
+       err = "error in fsck";
+       ret = bch2_fsck(c, !c->opts.nofsck);
+       if (ret)
+               goto err;
+       bch_verbose(c, "fsck done");
+
+       if (enabled_qtypes(c)) {
+               bch_verbose(c, "reading quotas:");
+               ret = bch2_fs_quota_read(c);
+               if (ret)
+                       goto err;
+               bch_verbose(c, "quotas done");
+       }
+
+out:
+       bch2_journal_entries_free(&journal);
+       kfree(clean);
+       return ret;
+err:
+fsck_err:
+       BUG_ON(!ret);
+       goto out;
+}
+
+int bch2_fs_initialize(struct bch_fs *c)
+{
+       struct bch_inode_unpacked inode;
+       struct bkey_inode_buf packed_inode;
+       const char *err = "cannot allocate memory";
+       struct bch_dev *ca;
+       LIST_HEAD(journal);
+       unsigned i;
+       int ret;
+
+       bch_notice(c, "initializing new filesystem");
+
+       set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
+
+       ret = bch2_initial_gc(c, &journal);
+       if (ret)
+               goto err;
+
+       err = "unable to allocate journal buckets";
+       for_each_online_member(ca, c, i)
+               if (bch2_dev_journal_alloc(ca)) {
+                       percpu_ref_put(&ca->io_ref);
+                       goto err;
+               }
+
+       for (i = 0; i < BTREE_ID_NR; i++)
+               bch2_btree_root_alloc(c, i);
+
+       /*
+        * journal_res_get() will crash if called before this has
+        * set up the journal.pin FIFO and journal.cur pointer:
+        */
+       bch2_fs_journal_start(&c->journal);
+       bch2_journal_set_replay_done(&c->journal);
+
+       err = "error starting allocator";
+       if (bch2_fs_allocator_start(c))
+               goto err;
+
+       bch2_inode_init(c, &inode, 0, 0,
+                       S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
+       inode.bi_inum = BCACHEFS_ROOT_INO;
+
+       bch2_inode_pack(&packed_inode, &inode);
+
+       err = "error creating root directory";
+       if (bch2_btree_insert(c, BTREE_ID_INODES,
+                             &packed_inode.inode.k_i,
+                             NULL, NULL, NULL, 0))
+               goto err;
+
+       if (enabled_qtypes(c)) {
+               ret = bch2_fs_quota_read(c);
+               if (ret)
+                       goto err;
+       }
+
+       err = "error writing first journal entry";
+       if (bch2_journal_meta(&c->journal))
+               goto err;
+
+       mutex_lock(&c->sb_lock);
+       SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
+       SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
+
+       bch2_write_super(c);
+       mutex_unlock(&c->sb_lock);
+
+       return 0;
+err:
+       BUG_ON(!ret);
+       return ret;
+}
diff --git a/libbcachefs/recovery.h b/libbcachefs/recovery.h
new file mode 100644 (file)
index 0000000..685507e
--- /dev/null
@@ -0,0 +1,7 @@
+#ifndef _BCACHEFS_RECOVERY_H
+#define _BCACHEFS_RECOVERY_H
+
+int bch2_fs_recovery(struct bch_fs *);
+int bch2_fs_initialize(struct bch_fs *);
+
+#endif /* _BCACHEFS_RECOVERY_H */
index 6c52d1d456c50636627b97fc84cf4990f4893b81..1e94d35fde96b76f5273745d8f11521f257ef6ba 100644 (file)
@@ -215,10 +215,8 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
        return 0;
 err:
        mutex_unlock(&c->sb_lock);
-       if (new_gc)
-               kfree(new_gc);
-       if (new_r)
-               kfree(new_r);
+       kfree(new_gc);
+       kfree(new_r);
        return ret;
 }
 
@@ -265,10 +263,9 @@ int bch2_mark_bkey_replicas(struct bch_fs *c,
        return bch2_mark_replicas(c, data_type, bch2_bkey_dirty_devs(k));
 }
 
-int bch2_replicas_gc_end(struct bch_fs *c, int err)
+int bch2_replicas_gc_end(struct bch_fs *c, int ret)
 {
        struct bch_replicas_cpu *new_r, *old_r;
-       int ret = 0;
 
        lockdep_assert_held(&c->replicas_gc_lock);
 
@@ -276,29 +273,31 @@ int bch2_replicas_gc_end(struct bch_fs *c, int err)
 
        new_r = rcu_dereference_protected(c->replicas_gc,
                                          lockdep_is_held(&c->sb_lock));
+       rcu_assign_pointer(c->replicas_gc, NULL);
 
-       if (err) {
-               rcu_assign_pointer(c->replicas_gc, NULL);
-               kfree_rcu(new_r, rcu);
+       if (ret)
                goto err;
-       }
 
        if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) {
                ret = -ENOSPC;
                goto err;
        }
 
+       bch2_write_super(c);
+
+       /* don't update in memory replicas until changes are persistent */
+
        old_r = rcu_dereference_protected(c->replicas,
                                          lockdep_is_held(&c->sb_lock));
 
        rcu_assign_pointer(c->replicas, new_r);
-       rcu_assign_pointer(c->replicas_gc, NULL);
        kfree_rcu(old_r, rcu);
-
-       bch2_write_super(c);
-err:
+out:
        mutex_unlock(&c->sb_lock);
        return ret;
+err:
+       kfree_rcu(new_r, rcu);
+       goto out;
 }
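
The reordering here is the point: the new replicas table is written to the superblock first, and only swapped into c->replicas (with the old copy freed via RCU) once the write succeeded, so readers never observe entries that aren't persistent. A rough single-threaded analogy using C11 atomics in place of RCU - the disk write is simulated, and freeing immediately stands in for kfree_rcu()'s grace period:

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct table { int nr; };

    static _Atomic(struct table *) live;

    static int write_to_disk(struct table *t) { (void)t; return 0; /* pretend */ }

    static int publish(struct table *new)
    {
            int ret = write_to_disk(new);   /* persist first... */
            if (ret) {
                    free(new);              /* on failure the old table stays live */
                    return ret;
            }

            /* ...then make it visible; release pairs with readers' acquire */
            struct table *old = atomic_exchange_explicit(&live, new,
                                                         memory_order_release);
            /* kfree_rcu() in the real code waits a grace period; no readers here */
            free(old);
            return 0;
    }

    int main(void)
    {
            struct table *t = malloc(sizeof(*t));

            t->nr = 42;
            if (!publish(t))
                    printf("%d\n", atomic_load(&live)->nr);  /* 42 */
            return 0;
    }
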
 
 int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
index f7dd0144f6089d9927a123be41fb2421c5c92d80..c80510952b71c0224c56d53e1ef3f438da10a2d5 100644 (file)
@@ -237,6 +237,7 @@ static inline int bch2_hash_needs_whiteout(const struct bch_hash_desc desc,
 {
        struct bkey_s_c k;
 
+       bch2_btree_iter_copy(iter, start);
        bch2_btree_iter_next_slot(iter);
 
        for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k) {
index 9772d59730781e07ba2d8ec9bb776b3d363191e1..54de9fac6e2283e63487a691f18523ba89522f24 100644 (file)
@@ -4,6 +4,7 @@
 #include "disk_groups.h"
 #include "error.h"
 #include "io.h"
+#include "journal.h"
 #include "replicas.h"
 #include "quota.h"
 #include "super-io.h"
@@ -89,6 +90,9 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
        struct bch_sb *new_sb;
        struct bio *bio;
 
+       if (sb->sb && sb->page_order >= order)
+               return 0;
+
        if (sb->have_layout) {
                u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
 
@@ -849,6 +853,84 @@ static const struct bch_sb_field_ops bch_sb_field_ops_crypt = {
        .validate       = bch2_sb_validate_crypt,
 };
 
+/* BCH_SB_FIELD_clean: */
+
+void bch2_fs_mark_clean(struct bch_fs *c, bool clean)
+{
+       struct bch_sb_field_clean *sb_clean;
+       unsigned u64s = sizeof(*sb_clean) / sizeof(u64);
+       struct jset_entry *entry;
+       struct btree_root *r;
+
+       mutex_lock(&c->sb_lock);
+       if (clean == BCH_SB_CLEAN(c->disk_sb.sb))
+               goto out;
+
+       SET_BCH_SB_CLEAN(c->disk_sb.sb, clean);
+
+       if (!clean)
+               goto write_super;
+
+       mutex_lock(&c->btree_root_lock);
+
+       for (r = c->btree_roots;
+            r < c->btree_roots + BTREE_ID_NR;
+            r++)
+               if (r->alive)
+                       u64s += jset_u64s(r->key.u64s);
+
+       sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s);
+       if (!sb_clean) {
+               bch_err(c, "error resizing superblock while setting filesystem clean");
+               goto out;
+       }
+
+       sb_clean->flags         = 0;
+       sb_clean->read_clock    = cpu_to_le16(c->bucket_clock[READ].hand);
+       sb_clean->write_clock   = cpu_to_le16(c->bucket_clock[WRITE].hand);
+       sb_clean->journal_seq   = cpu_to_le64(journal_cur_seq(&c->journal) - 1);
+
+       entry = sb_clean->start;
+       memset(entry, 0,
+              vstruct_end(&sb_clean->field) - (void *) entry);
+
+       for (r = c->btree_roots;
+            r < c->btree_roots + BTREE_ID_NR;
+            r++)
+               if (r->alive) {
+                       entry->u64s     = r->key.u64s;
+                       entry->btree_id = r - c->btree_roots;
+                       entry->level    = r->level;
+                       entry->type     = BCH_JSET_ENTRY_btree_root;
+                       bkey_copy(&entry->start[0], &r->key);
+                       entry = vstruct_next(entry);
+                       BUG_ON((void *) entry > vstruct_end(&sb_clean->field));
+               }
+
+       BUG_ON(entry != vstruct_end(&sb_clean->field));
+
+       mutex_unlock(&c->btree_root_lock);
+write_super:
+       bch2_write_super(c);
+out:
+       mutex_unlock(&c->sb_lock);
+}
+
+static const char *bch2_sb_validate_clean(struct bch_sb *sb,
+                                         struct bch_sb_field *f)
+{
+       struct bch_sb_field_clean *clean = field_to_type(f, clean);
+
+       if (vstruct_bytes(&clean->field) < sizeof(*clean))
+               return "invalid field crypt: wrong size";
+
+       return NULL;
+}
+
+static const struct bch_sb_field_ops bch_sb_field_ops_clean = {
+       .validate       = bch2_sb_validate_clean,
+};
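
bch2_fs_mark_clean() reserves sizeof(*sb_clean)/sizeof(u64) u64s for the header plus jset_u64s(key u64s) per live btree root, then packs each root as a BCH_JSET_ENTRY_btree_root entry; the BUG_ON checks the walk lands exactly on the computed end. The size arithmetic, sketched with jset_u64s() assumed to be header u64s plus payload u64s:

    #include <stdint.h>
    #include <stdio.h>

    struct jset_entry_hdr { uint16_t u64s; uint8_t btree_id, level, type, pad[3]; };

    /* assumed definition: header size in u64s plus payload u64s */
    #define jset_u64s(u64s) ((u64s) + sizeof(struct jset_entry_hdr) / sizeof(uint64_t))

    int main(void)
    {
            unsigned hdr_u64s = 4;                  /* stand-in for sizeof(*sb_clean)/8 */
            unsigned root_key_u64s[] = { 5, 5, 7 }; /* three live btree roots */
            unsigned total = hdr_u64s;

            for (unsigned i = 0; i < 3; i++)
                    total += jset_u64s(root_key_u64s[i]);

            /* 4 + 6 + 6 + 8 = 24 u64s -> 192 bytes to reserve */
            printf("%u u64s, %zu bytes\n", total, total * sizeof(uint64_t));
            return 0;
    }
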
+
 static const struct bch_sb_field_ops *bch2_sb_field_ops[] = {
 #define x(f, nr)                                       \
        [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f,
index 995b1c907318c823b009b3bbf48e45059a8c10af..7d09d8e45816b7f2ce19aed6bcfa7504d1f1909a 100644 (file)
@@ -131,6 +131,10 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
        };
 }
 
+/* BCH_SB_FIELD_clean: */
+
+void bch2_fs_mark_clean(struct bch_fs *, bool);
+
 size_t bch2_sb_field_to_text(char *, size_t, struct bch_sb *,
                             struct bch_sb_field *);
 
index 1eab7c77f89ebf73ecec999c5252aab80e7c9984..a2a32b924434b15182d616b9eded5b012dca9725 100644 (file)
@@ -10,7 +10,6 @@
 #include "alloc.h"
 #include "btree_cache.h"
 #include "btree_gc.h"
-#include "btree_update.h"
 #include "btree_update_interior.h"
 #include "btree_io.h"
 #include "chardev.h"
 #include "inode.h"
 #include "io.h"
 #include "journal.h"
-#include "journal_io.h"
 #include "journal_reclaim.h"
-#include "keylist.h"
 #include "move.h"
 #include "migrate.h"
 #include "movinggc.h"
 #include "quota.h"
 #include "rebalance.h"
+#include "recovery.h"
 #include "replicas.h"
 #include "super.h"
 #include "super-io.h"
@@ -201,18 +199,6 @@ int bch2_congested(void *data, int bdi_bits)
  * - allocator depends on the journal (when it rewrites prios and gens)
  */
 
-static void bch_fs_mark_clean(struct bch_fs *c)
-{
-       if (!bch2_journal_error(&c->journal) &&
-           !test_bit(BCH_FS_ERROR, &c->flags) &&
-           !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) {
-               mutex_lock(&c->sb_lock);
-               SET_BCH_SB_CLEAN(c->disk_sb.sb, true);
-               bch2_write_super(c);
-               mutex_unlock(&c->sb_lock);
-       }
-}
-
 static void __bch2_fs_read_only(struct bch_fs *c)
 {
        struct bch_dev *ca;
@@ -229,7 +215,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
         * Flush journal before stopping allocators, because flushing journal
         * blacklist entries involves allocating new btree nodes:
         */
-       bch2_journal_flush_pins(&c->journal, U64_MAX - 1);
+       bch2_journal_flush_all_pins(&c->journal);
 
        for_each_member_device(ca, c, i)
                bch2_dev_allocator_stop(ca);
@@ -246,9 +232,6 @@ static void __bch2_fs_read_only(struct bch_fs *c)
        closure_wait_event(&c->btree_interior_update_wait,
                           !bch2_btree_interior_updates_nr_pending(c));
 
-       if (!test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
-               bch2_btree_verify_flushed(c);
-
        bch2_fs_journal_stop(&c->journal);
 
        /*
@@ -257,6 +240,8 @@ static void __bch2_fs_read_only(struct bch_fs *c)
         */
        if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
                bch2_btree_flush_all_writes(c);
+       else
+               bch2_btree_verify_flushed(c);
 
        /*
         * After stopping journal:
@@ -275,12 +260,10 @@ static void bch2_writes_disabled(struct percpu_ref *writes)
 
 void bch2_fs_read_only(struct bch_fs *c)
 {
-       if (c->state != BCH_FS_STARTING &&
-           c->state != BCH_FS_RW)
+       if (c->state == BCH_FS_RO)
                return;
 
-       if (test_bit(BCH_FS_ERROR, &c->flags))
-               return;
+       BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
 
        /*
         * Block new foreground-end write operations from starting - any new
@@ -311,13 +294,18 @@ void bch2_fs_read_only(struct bch_fs *c)
 
        __bch2_fs_read_only(c);
 
-       bch_fs_mark_clean(c);
-
        wait_event(bch_read_only_wait,
                   test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
 
        clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
-       c->state = BCH_FS_RO;
+
+       if (!bch2_journal_error(&c->journal) &&
+           !test_bit(BCH_FS_ERROR, &c->flags) &&
+           !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+               bch2_fs_mark_clean(c, true);
+
+       if (c->state != BCH_FS_STOPPING)
+               c->state = BCH_FS_RO;
 }
 
 static void bch2_fs_read_only_work(struct work_struct *work)
@@ -352,10 +340,11 @@ const char *bch2_fs_read_write(struct bch_fs *c)
        const char *err = NULL;
        unsigned i;
 
-       if (c->state != BCH_FS_STARTING &&
-           c->state != BCH_FS_RO)
+       if (c->state == BCH_FS_RW)
                return NULL;
 
+       bch2_fs_mark_clean(c, false);
+
        for_each_rw_member(ca, c, i)
                bch2_dev_allocator_add(c, ca);
        bch2_recalc_capacity(c);
@@ -446,11 +435,6 @@ void bch2_fs_stop(struct bch_fs *c)
        struct bch_dev *ca;
        unsigned i;
 
-       mutex_lock(&c->state_lock);
-       BUG_ON(c->state == BCH_FS_STOPPING);
-       c->state = BCH_FS_STOPPING;
-       mutex_unlock(&c->state_lock);
-
        for_each_member_device(ca, c, i)
                if (ca->kobj.state_in_sysfs &&
                    ca->disk_sb.bdev)
@@ -475,11 +459,9 @@ void bch2_fs_stop(struct bch_fs *c)
        closure_debug_destroy(&c->cl);
 
        mutex_lock(&c->state_lock);
-       __bch2_fs_read_only(c);
+       bch2_fs_read_only(c);
        mutex_unlock(&c->state_lock);
 
-       bch_fs_mark_clean(c);
-
        /* btree prefetch might have kicked off reads in the background: */
        bch2_btree_flush_all_reads(c);
 
@@ -695,9 +677,7 @@ const char *bch2_fs_start(struct bch_fs *c)
        const char *err = "cannot allocate memory";
        struct bch_sb_field_members *mi;
        struct bch_dev *ca;
-       LIST_HEAD(journal);
-       struct jset *j;
-       time64_t now;
+       time64_t now = ktime_get_seconds();
        unsigned i;
        int ret = -EINVAL;
 
@@ -706,157 +686,26 @@ const char *bch2_fs_start(struct bch_fs *c)
        BUG_ON(c->state != BCH_FS_STARTING);
 
        mutex_lock(&c->sb_lock);
+
        for_each_online_member(ca, c, i)
                bch2_sb_from_fs(c, ca);
+
+       mi = bch2_sb_get_members(c->disk_sb.sb);
+       for_each_online_member(ca, c, i)
+               mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
+
        mutex_unlock(&c->sb_lock);
 
        for_each_rw_member(ca, c, i)
                bch2_dev_allocator_add(c, ca);
        bch2_recalc_capacity(c);
 
-       if (BCH_SB_INITIALIZED(c->disk_sb.sb)) {
-               ret = bch2_journal_read(c, &journal);
-               if (ret)
-                       goto err;
-
-               j = &list_entry(journal.prev, struct journal_replay, list)->j;
-
-               c->bucket_clock[READ].hand = le16_to_cpu(j->read_clock);
-               c->bucket_clock[WRITE].hand = le16_to_cpu(j->write_clock);
-
-               for (i = 0; i < BTREE_ID_NR; i++) {
-                       unsigned level;
-                       struct bkey_i *k;
-
-                       k = bch2_journal_find_btree_root(c, j, i, &level);
-                       if (!k)
-                               continue;
-
-                       err = "invalid btree root pointer";
-                       if (IS_ERR(k))
-                               goto err;
-
-                       err = "error reading btree root";
-                       if (bch2_btree_root_read(c, i, k, level)) {
-                               if (i != BTREE_ID_ALLOC)
-                                       goto err;
-
-                               mustfix_fsck_err(c, "error reading btree root");
-                       }
-               }
-
-               for (i = 0; i < BTREE_ID_NR; i++)
-                       if (!c->btree_roots[i].b)
-                               bch2_btree_root_alloc(c, i);
-
-               err = "error reading allocation information";
-               ret = bch2_alloc_read(c, &journal);
-               if (ret)
-                       goto err;
-
-               set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
-
-               bch_verbose(c, "starting mark and sweep:");
-               err = "error in recovery";
-               ret = bch2_initial_gc(c, &journal);
-               if (ret)
-                       goto err;
-               bch_verbose(c, "mark and sweep done");
-
-               if (c->opts.noreplay)
-                       goto recovery_done;
-
-               /*
-                * bch2_fs_journal_start() can't happen sooner, or btree_gc_finish()
-                * will give spurious errors about oldest_gen > bucket_gen -
-                * this is a hack but oh well.
-                */
-               bch2_fs_journal_start(&c->journal);
-
-               err = "error starting allocator";
-               if (bch2_fs_allocator_start(c))
-                       goto err;
-
-               bch_verbose(c, "starting journal replay:");
-               err = "journal replay failed";
-               ret = bch2_journal_replay(c, &journal);
-               if (ret)
-                       goto err;
-               bch_verbose(c, "journal replay done");
-
-               if (c->opts.norecovery)
-                       goto recovery_done;
-
-               bch_verbose(c, "starting fsck:");
-               err = "error in fsck";
-               ret = bch2_fsck(c, !c->opts.nofsck);
-               if (ret)
-                       goto err;
-               bch_verbose(c, "fsck done");
-
-               if (enabled_qtypes(c)) {
-                       bch_verbose(c, "reading quotas:");
-                       ret = bch2_fs_quota_read(c);
-                       if (ret)
-                               goto err;
-                       bch_verbose(c, "quotas done");
-               }
-       } else {
-               struct bch_inode_unpacked inode;
-               struct bkey_inode_buf packed_inode;
-
-               bch_notice(c, "initializing new filesystem");
-
-               set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags);
-
-               ret = bch2_initial_gc(c, &journal);
-               if (ret)
-                       goto err;
-
-               err = "unable to allocate journal buckets";
-               for_each_online_member(ca, c, i)
-                       if (bch2_dev_journal_alloc(ca)) {
-                               percpu_ref_put(&ca->io_ref);
-                               goto err;
-                       }
-
-               for (i = 0; i < BTREE_ID_NR; i++)
-                       bch2_btree_root_alloc(c, i);
-
-               /*
-                * journal_res_get() will crash if called before this has
-                * set up the journal.pin FIFO and journal.cur pointer:
-                */
-               bch2_fs_journal_start(&c->journal);
-               bch2_journal_set_replay_done(&c->journal);
-
-               err = "error starting allocator";
-               if (bch2_fs_allocator_start(c))
-                       goto err;
-
-               bch2_inode_init(c, &inode, 0, 0,
-                              S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL);
-               inode.bi_inum = BCACHEFS_ROOT_INO;
-
-               bch2_inode_pack(&packed_inode, &inode);
-
-               err = "error creating root directory";
-               if (bch2_btree_insert(c, BTREE_ID_INODES,
-                                    &packed_inode.inode.k_i,
-                                    NULL, NULL, NULL, 0))
-                       goto err;
-
-               if (enabled_qtypes(c)) {
-                       ret = bch2_fs_quota_read(c);
-                       if (ret)
-                               goto err;
-               }
+       ret = BCH_SB_INITIALIZED(c->disk_sb.sb)
+               ? bch2_fs_recovery(c)
+               : bch2_fs_initialize(c);
+       if (ret)
+               goto err;
 
-               err = "error writing first journal entry";
-               if (bch2_journal_meta(&c->journal))
-                       goto err;
-       }
-recovery_done:
        err = "dynamic fault";
        if (bch2_fs_init_fault("fs_start"))
                goto err;
@@ -869,28 +718,13 @@ recovery_done:
                        goto err;
        }
 
-       mutex_lock(&c->sb_lock);
-       mi = bch2_sb_get_members(c->disk_sb.sb);
-       now = ktime_get_seconds();
-
-       for_each_member_device(ca, c, i)
-               mi->members[ca->dev_idx].last_mount = cpu_to_le64(now);
-
-       SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
-       SET_BCH_SB_CLEAN(c->disk_sb.sb, false);
-
-       bch2_write_super(c);
-       mutex_unlock(&c->sb_lock);
-
        set_bit(BCH_FS_STARTED, &c->flags);
 
        err = NULL;
 out:
        mutex_unlock(&c->state_lock);
-       bch2_journal_entries_free(&journal);
        return err;
 err:
-fsck_err:
        switch (ret) {
        case BCH_FSCK_ERRORS_NOT_FIXED:
                bch_err(c, "filesystem contains errors: please report this to the developers");
@@ -1091,6 +925,9 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
        ca->mi = bch2_mi_to_cpu(member);
        ca->uuid = member->uuid;
 
+       if (opt_defined(c->opts, discard))
+               ca->mi.discard = opt_get(c->opts, discard);
+
        if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete,
                            0, GFP_KERNEL) ||
            percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete,
@@ -1454,7 +1291,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
         * must flush all existing journal entries, they might have
         * (overwritten) keys that point to the device we're removing:
         */
-       ret = bch2_journal_flush_all_pins(&c->journal);
+       bch2_journal_flush_all_pins(&c->journal);
+       ret = bch2_journal_error(&c->journal);
        if (ret) {
                bch_err(ca, "Remove failed, journal error");
                goto err;
@@ -1615,6 +1453,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
 {
        struct bch_opts opts = bch2_opts_empty();
        struct bch_sb_handle sb = { NULL };
+       struct bch_sb_field_members *mi;
        struct bch_dev *ca;
        unsigned dev_idx;
        const char *err;
@@ -1646,6 +1485,15 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
                        goto err;
        }
 
+       mutex_lock(&c->sb_lock);
+       mi = bch2_sb_get_members(c->disk_sb.sb);
+
+       mi->members[ca->dev_idx].last_mount =
+               cpu_to_le64(ktime_get_seconds());
+
+       bch2_write_super(c);
+       mutex_unlock(&c->sb_lock);
+
        mutex_unlock(&c->state_lock);
        return 0;
 err:
index 66b5b9f933bc4c562d9b0d295480531252030210..4987ee76a08cbdeb4153b04990941643bda39e0d 100644 (file)
@@ -27,6 +27,7 @@
 #include "rebalance.h"
 #include "replicas.h"
 #include "super-io.h"
+#include "tests.h"
 
 #include <linux/blkdev.h>
 #include <linux/sort.h>
@@ -192,6 +193,10 @@ rw_attribute(pd_controllers_update_seconds);
 read_attribute(meta_replicas_have);
 read_attribute(data_replicas_have);
 
+#ifdef CONFIG_BCACHEFS_TESTS
+write_attribute(perf_test);
+#endif /* CONFIG_BCACHEFS_TESTS */
+
 #define BCH_DEBUG_PARAM(name, description)                             \
        rw_attribute(name);
 
@@ -446,7 +451,25 @@ STORE(__bch2_fs)
                sc.nr_to_scan = strtoul_or_return(buf);
                c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc);
        }
-
+#ifdef CONFIG_BCACHEFS_TESTS
+       if (attr == &sysfs_perf_test) {
+               char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
+               char *test              = strsep(&p, " \t\n");
+               char *nr_str            = strsep(&p, " \t\n");
+               char *threads_str       = strsep(&p, " \t\n");
+               unsigned threads;
+               u64 nr;
+               int ret = -EINVAL;
+
+               if (threads_str &&
+                   !(ret = kstrtouint(threads_str, 10, &threads)) &&
+                   !(ret = bch2_strtoull_h(nr_str, &nr)))
+                       bch2_btree_perf_test(c, test, nr, threads);
+               else
+                       size = ret;
+               kfree(tmp);
+       }
+#endif
        return size;
 }
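
The perf_test attribute takes "test-name count threads", with count accepting SI suffixes via bch2_strtoull_h() - e.g. writing "rand_insert 1M 4" to /sys/fs/bcachefs/<uuid>/perf_test. The same strsep() tokenizing in a standalone userspace sketch (suffix parsing stubbed out with plain strtoull):

    #define _DEFAULT_SOURCE         /* for strsep() on glibc */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
            char buf[] = "rand_insert 1000000 4\n";
            char *p = buf;
            char *test        = strsep(&p, " \t\n");
            char *nr_str      = strsep(&p, " \t\n");
            char *threads_str = strsep(&p, " \t\n");

            if (!test || !nr_str || !threads_str)
                    return 1;

            /* bch2_strtoull_h() would also accept "1M"; plain strtoull here */
            unsigned long long nr = strtoull(nr_str, NULL, 10);
            unsigned threads      = (unsigned)strtoul(threads_str, NULL, 10);

            printf("test=%s nr=%llu threads=%u\n", test, nr, threads);
            return 0;
    }
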
 
@@ -477,6 +500,10 @@ struct attribute *bch2_fs_files[] = {
        &sysfs_promote_whole_extents,
 
        &sysfs_compression_stats,
+
+#ifdef CONFIG_BCACHEFS_TESTS
+       &sysfs_perf_test,
+#endif
        NULL
 };
 
diff --git a/libbcachefs/tests.c b/libbcachefs/tests.c
new file mode 100644 (file)
index 0000000..9dcadd2
--- /dev/null
@@ -0,0 +1,289 @@
+#ifdef CONFIG_BCACHEFS_TESTS
+
+#include "bcachefs.h"
+#include "btree_update.h"
+#include "tests.h"
+
+#include "linux/kthread.h"
+#include "linux/random.h"
+
+static void test_delete(struct bch_fs *c, u64 nr)
+{
+       struct btree_iter iter;
+       struct bkey_i_cookie k;
+       int ret;
+
+       bkey_cookie_init(&k.k_i);
+
+       bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS, k.k.p,
+                            BTREE_ITER_INTENT);
+
+       ret = bch2_btree_iter_traverse(&iter);
+       BUG_ON(ret);
+
+       ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
+                                  BTREE_INSERT_ENTRY(&iter, &k.k_i));
+       BUG_ON(ret);
+
+       pr_info("deleting once");
+       ret = bch2_btree_delete_at(&iter, 0);
+       BUG_ON(ret);
+
+       pr_info("deleting twice");
+       ret = bch2_btree_delete_at(&iter, 0);
+       BUG_ON(ret);
+
+       bch2_btree_iter_unlock(&iter);
+}
+
+static u64 test_rand(void)
+{
+       u64 v;
+#if 0
+       v = prandom_u32();
+#else
+       prandom_bytes(&v, sizeof(v));
+#endif
+       return v;
+}
+
+static void rand_insert(struct bch_fs *c, u64 nr)
+{
+       struct bkey_i_cookie k;
+       int ret;
+       u64 i;
+
+       for (i = 0; i < nr; i++) {
+               bkey_cookie_init(&k.k_i);
+               k.k.p.offset = test_rand();
+
+               ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k.k_i,
+                                       NULL, NULL, NULL, 0);
+               BUG_ON(ret);
+       }
+}
+
+static void rand_lookup(struct bch_fs *c, u64 nr)
+{
+       u64 i;
+
+       for (i = 0; i < nr; i++) {
+               struct btree_iter iter;
+               struct bkey_s_c k;
+
+               bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS,
+                                    POS(0, test_rand()), 0);
+
+               k = bch2_btree_iter_peek(&iter);
+               bch2_btree_iter_unlock(&iter);
+       }
+}
+
+static void rand_mixed(struct bch_fs *c, u64 nr)
+{
+       int ret;
+       u64 i;
+
+       for (i = 0; i < nr; i++) {
+               struct btree_iter iter;
+               struct bkey_s_c k;
+
+               bch2_btree_iter_init(&iter, c, BTREE_ID_DIRENTS,
+                                    POS(0, test_rand()), 0);
+
+               k = bch2_btree_iter_peek(&iter);
+
+               if (!(i & 3) && k.k) {
+                       struct bkey_i_cookie k;
+
+                       bkey_cookie_init(&k.k_i);
+                       k.k.p = iter.pos;
+
+                       ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
+                                                  BTREE_INSERT_ENTRY(&iter, &k.k_i));
+                       BUG_ON(ret);
+               }
+
+               bch2_btree_iter_unlock(&iter);
+       }
+
+}
+
+static void rand_delete(struct bch_fs *c, u64 nr)
+{
+       struct bkey_i k;
+       int ret;
+       u64 i;
+
+       for (i = 0; i < nr; i++) {
+               bkey_init(&k.k);
+               k.k.p.offset = test_rand();
+
+               ret = bch2_btree_insert(c, BTREE_ID_DIRENTS, &k,
+                                       NULL, NULL, NULL, 0);
+               BUG_ON(ret);
+       }
+}
+
+static void seq_insert(struct bch_fs *c, u64 nr)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       struct bkey_i_cookie insert;
+       int ret;
+       u64 i = 0;
+
+       bkey_cookie_init(&insert.k_i);
+
+       for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN,
+                          BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k) {
+               insert.k.p = iter.pos;
+
+               ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
+                               BTREE_INSERT_ENTRY(&iter, &insert.k_i));
+               BUG_ON(ret);
+
+               if (++i == nr)
+                       break;
+       }
+       bch2_btree_iter_unlock(&iter);
+}
+
+static void seq_lookup(struct bch_fs *c, u64 nr)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+
+       for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN, 0, k)
+               ;
+       bch2_btree_iter_unlock(&iter);
+}
+
+static void seq_overwrite(struct bch_fs *c, u64 nr)
+{
+       struct btree_iter iter;
+       struct bkey_s_c k;
+       int ret;
+
+       for_each_btree_key(&iter, c, BTREE_ID_DIRENTS, POS_MIN,
+                          BTREE_ITER_INTENT, k) {
+               struct bkey_i_cookie u;
+
+               bkey_reassemble(&u.k_i, k);
+
+               ret = bch2_btree_insert_at(c, NULL, NULL, NULL, 0,
+                                          BTREE_INSERT_ENTRY(&iter, &u.k_i));
+               BUG_ON(ret);
+       }
+       bch2_btree_iter_unlock(&iter);
+}
+
+static void seq_delete(struct bch_fs *c, u64 nr)
+{
+       int ret;
+
+       ret = bch2_btree_delete_range(c, BTREE_ID_DIRENTS,
+                                     POS_MIN, POS_MAX,
+                                     ZERO_VERSION, NULL, NULL, NULL);
+       BUG_ON(ret);
+}
+
+typedef void (*perf_test_fn)(struct bch_fs *, u64);
+
+struct test_job {
+       struct bch_fs                   *c;
+       u64                             nr;
+       unsigned                        nr_threads;
+       perf_test_fn                    fn;
+
+       atomic_t                        ready;
+       wait_queue_head_t               ready_wait;
+
+       atomic_t                        done;
+       struct completion               done_completion;
+
+       u64                             start;
+       u64                             finish;
+};
+
+static int btree_perf_test_thread(void *data)
+{
+       struct test_job *j = data;
+
+       if (atomic_dec_and_test(&j->ready)) {
+               wake_up(&j->ready_wait);
+               j->start = sched_clock();
+       } else {
+               wait_event(j->ready_wait, !atomic_read(&j->ready));
+       }
+
+       j->fn(j->c, j->nr / j->nr_threads);
+
+       if (atomic_dec_and_test(&j->done)) {
+               j->finish = sched_clock();
+               complete(&j->done_completion);
+       }
+
+       return 0;
+}
+
+void bch2_btree_perf_test(struct bch_fs *c, const char *testname,
+                         u64 nr, unsigned nr_threads)
+{
+       struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads };
+       char name_buf[20], nr_buf[20], per_sec_buf[20];
+       unsigned i;
+       u64 time;
+
+       atomic_set(&j.ready, nr_threads);
+       init_waitqueue_head(&j.ready_wait);
+
+       atomic_set(&j.done, nr_threads);
+       init_completion(&j.done_completion);
+
+#define perf_test(_test)                               \
+       if (!strcmp(testname, #_test)) j.fn = _test
+
+       perf_test(rand_insert);
+       perf_test(rand_lookup);
+       perf_test(rand_mixed);
+       perf_test(rand_delete);
+
+       perf_test(seq_insert);
+       perf_test(seq_lookup);
+       perf_test(seq_overwrite);
+       perf_test(seq_delete);
+
+       /* a unit test, not a perf test: */
+       perf_test(test_delete);
+
+       if (!j.fn) {
+               pr_err("unknown test %s", testname);
+               return;
+       }
+
+       //pr_info("running test %s:", testname);
+
+       if (nr_threads == 1)
+               btree_perf_test_thread(&j);
+       else
+               for (i = 0; i < nr_threads; i++)
+                       kthread_run(btree_perf_test_thread, &j,
+                                   "bcachefs perf test[%u]", i);
+
+       while (wait_for_completion_interruptible(&j.done_completion))
+               ;
+
+       time = j.finish - j.start;
+
+       scnprintf(name_buf, sizeof(name_buf), "%s:", testname);
+       bch2_hprint(nr_buf, nr);
+       bch2_hprint(per_sec_buf, nr * NSEC_PER_SEC / time);
+       printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n",
+               name_buf, nr_buf, nr_threads,
+               time / NSEC_PER_SEC,
+               time * nr_threads / nr,
+               per_sec_buf);
+}
+
+#endif /* CONFIG_BCACHEFS_TESTS */
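
In the results line, nsec per iter is time * nr_threads / nr: each of nr_threads threads runs nr / nr_threads iterations concurrently, so the wall time attributable to one iteration on one thread is time / (nr / nr_threads) - the same quantity. A quick check of that arithmetic:

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t nr = 1000000, nr_threads = 4;
            uint64_t time_ns = 2000000000;          /* say, 2 sec of wall time */

            uint64_t per_thread_iters = nr / nr_threads;        /* 250000 */
            uint64_t ns_per_iter      = time_ns / per_thread_iters;
            uint64_t same             = time_ns * nr_threads / nr;

            printf("%" PRIu64 " == %" PRIu64 "\n", ns_per_iter, same);  /* 8000 == 8000 */
            return 0;
    }
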
diff --git a/libbcachefs/tests.h b/libbcachefs/tests.h
new file mode 100644 (file)
index 0000000..3f1b8d1
--- /dev/null
@@ -0,0 +1,14 @@
+#ifndef _BCACHEFS_TEST_H
+#define _BCACHEFS_TEST_H
+
+struct bch_fs;
+
+#ifdef CONFIG_BCACHEFS_TESTS
+
+void bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned);
+
+#else
+
+#endif /* CONFIG_BCACHEFS_TESTS */
+
+#endif /* _BCACHEFS_TEST_H */
index e263dd20540912e53aabee176325e5cbd1078423..24c6cc568762e969521409cdc452da6b85a5ff20 100644 (file)
 #define simple_strtoint(c, end, base)  simple_strtol(c, end, base)
 #define simple_strtouint(c, end, base) simple_strtoul(c, end, base)
 
+static const char si_units[] = "?kMGTPEZY";
+
+static int __bch2_strtoh(const char *cp, u64 *res,
+                        u64 t_max, bool t_signed)
+{
+       bool positive = *cp != '-';
+       unsigned u;
+       u64 v = 0;
+
+       if (*cp == '+' || *cp == '-')
+               cp++;
+
+       if (!isdigit(*cp))
+               return -EINVAL;
+
+       do {
+               if (v > U64_MAX / 10)
+                       return -ERANGE;
+               v *= 10;
+               if (v > U64_MAX - (*cp - '0'))
+                       return -ERANGE;
+               v += *cp - '0';
+               cp++;
+       } while (isdigit(*cp));
+
+       for (u = 1; u < ARRAY_SIZE(si_units); u++)
+               if (*cp == si_units[u]) {
+                       cp++;
+                       goto got_unit;
+               }
+       u = 0;
+got_unit:
+       if (*cp == '\n')
+               cp++;
+       if (*cp)
+               return -EINVAL;
+
+       if (fls64(v) + u * 10 > 64)
+               return -ERANGE;
+
+       v <<= u * 10;
+
+       if (positive) {
+               if (v > t_max)
+                       return -ERANGE;
+       } else {
+               if (v && !t_signed)
+                       return -ERANGE;
+
+               if (v > t_max + 1)
+                       return -ERANGE;
+               v = -v;
+       }
+
+       *res = v;
+       return 0;
+}
+
 #define STRTO_H(name, type)                                    \
 int bch2_ ## name ## _h(const char *cp, type *res)             \
 {                                                              \
-       int u = 0;                                              \
-       char *e;                                                \
-       type i = simple_ ## name(cp, &e, 10);                   \
-                                                               \
-       switch (tolower(*e)) {                                  \
-       default:                                                \
-               return -EINVAL;                                 \
-       case 'y':                                               \
-       case 'z':                                               \
-               u++;                                            \
-       case 'e':                                               \
-               u++;                                            \
-       case 'p':                                               \
-               u++;                                            \
-       case 't':                                               \
-               u++;                                            \
-       case 'g':                                               \
-               u++;                                            \
-       case 'm':                                               \
-               u++;                                            \
-       case 'k':                                               \
-               u++;                                            \
-               if (e++ == cp)                                  \
-                       return -EINVAL;                         \
-       case '\n':                                              \
-       case '\0':                                              \
-               if (*e == '\n')                                 \
-                       e++;                                    \
-       }                                                       \
-                                                               \
-       if (*e)                                                 \
-               return -EINVAL;                                 \
-                                                               \
-       while (u--) {                                           \
-               if ((type) ~0 > 0 &&                            \
-                   (type) ~0 / 1024 <= i)                      \
-                       return -EINVAL;                         \
-               if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) ||  \
-                   (i < 0 && -ANYSINT_MAX(type) / 1024 > i))   \
-                       return -EINVAL;                         \
-               i *= 1024;                                      \
-       }                                                       \
-                                                               \
-       *res = i;                                               \
-       return 0;                                               \
-}                                                              \
+       u64 v;                                                  \
+       int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type),      \
+                       ANYSINT_MAX(type) != ((type) ~0ULL));   \
+       *res = v;                                               \
+       return ret;                                             \
+}
 
 STRTO_H(strtoint, int)
 STRTO_H(strtouint, unsigned int)
@@ -84,7 +102,6 @@ STRTO_H(strtoull, unsigned long long)
 
 ssize_t bch2_hprint(char *buf, s64 v)
 {
-       static const char units[] = "?kMGTPEZY";
        char dec[4] = "";
        int u, t = 0;
 
@@ -103,7 +120,7 @@ ssize_t bch2_hprint(char *buf, s64 v)
        if (v < 100 && v > -100)
                scnprintf(dec, sizeof(dec), ".%i", t / 103);
 
-       return sprintf(buf, "%lli%s%c", v, dec, units[u]);
+       return sprintf(buf, "%lli%s%c", v, dec, si_units[u]);
 }
 
 ssize_t bch2_scnprint_string_list(char *buf, size_t size,
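
__bch2_strtoh() replaces the fallthrough-heavy switch with one helper: parse decimal digits with explicit overflow checks, look the suffix up in si_units, and shift by 10 bits per unit, first verifying fls64(v) + u * 10 <= 64. A standalone userspace version of the same logic (unsigned only; the sign handling is dropped for brevity):

    #include <ctype.h>
    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static const char si_units[] = "?kMGTPEZY";

    static int fls64_(uint64_t v) { return v ? 64 - __builtin_clzll(v) : 0; }

    static int strtou64_h(const char *cp, uint64_t *res)
    {
            uint64_t v = 0;
            unsigned u;

            if (!isdigit((unsigned char)*cp))
                    return -EINVAL;

            do {
                    if (v > UINT64_MAX / 10)
                            return -ERANGE;
                    v *= 10;
                    if (v > UINT64_MAX - (uint64_t)(*cp - '0'))
                            return -ERANGE;
                    v += *cp++ - '0';
            } while (isdigit((unsigned char)*cp));

            for (u = 1; u < strlen(si_units); u++)
                    if (*cp == si_units[u]) {
                            cp++;
                            break;
                    }
            if (u == strlen(si_units))
                    u = 0;                  /* no suffix */

            if (*cp == '\n')
                    cp++;
            if (*cp)
                    return -EINVAL;

            if (fls64_(v) + u * 10 > 64)    /* would overflow after the shift */
                    return -ERANGE;

            *res = v << (u * 10);
            return 0;
    }

    int main(void)
    {
            uint64_t v;

            if (!strtou64_h("16G\n", &v))
                    printf("%llu\n", (unsigned long long)v);   /* 17179869184 */
            return 0;
    }
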
index c89c7200a1b483a33cfc40b8b5d77c9a7aab6382..de95480c8b088aa15e448a4fd9492def4c031a7a 100644 (file)
@@ -15,7 +15,7 @@
 
 static unsigned xattr_val_u64s(unsigned name_len, unsigned val_len)
 {
-       return DIV_ROUND_UP(sizeof(struct bch_xattr) +
+       return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) +
                            name_len + val_len, sizeof(u64));
 }