Update bcachefs sources to fd637ebda0 bcachefs: Journal updates to interior nodes
author     Kent Overstreet <kent.overstreet@gmail.com>
           Wed, 25 Mar 2020 19:56:38 +0000 (15:56 -0400)
committer  Kent Overstreet <kent.overstreet@gmail.com>
           Wed, 25 Mar 2020 19:56:38 +0000 (15:56 -0400)
21 files changed:
.bcachefs_revision
libbcachefs/bcachefs_format.h
libbcachefs/bkey.h
libbcachefs/bkey_methods.c
libbcachefs/bkey_sort.c
libbcachefs/btree_cache.c
libbcachefs/btree_cache.h
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_io.h
libbcachefs/btree_iter.c
libbcachefs/btree_types.h
libbcachefs/btree_update.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_interior.h
libbcachefs/btree_update_leaf.c
libbcachefs/extent_update.c
libbcachefs/fsck.c
libbcachefs/recovery.c
libbcachefs/recovery.h
libbcachefs/super-io.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index e6246606d5a7b6653c2e6e281a03072fc8a000c1..330c6bddd5f4d084b56aa24f791953a9e1f5b583 100644
@@ -1 +1 @@
-3592e42edfaed6a66470fb6a456a5895243ef2f4
+fd637ebda030609b15a473f01f1ef54bbe818f27
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 798f5c9ea16452f58d21af911d5e6a9448c3a513..a78988e3ded7347df61c3d13c074438339d15e40 100644
@@ -1312,7 +1312,8 @@ LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3],  0, 16);
        x(new_extent_overwrite,         9)      \
        x(incompressible,               10)     \
        x(btree_ptr_v2,                 11)     \
-       x(extents_above_btree_updates,  12)
+       x(extents_above_btree_updates,  12)     \
+       x(btree_updates_journalled,     13)
 
 #define BCH_SB_FEATURES_ALL                            \
        ((1ULL << BCH_FEATURE_new_siphash)|             \
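
The feature list above is an x-macro: adding btree_updates_journalled to it gives the feature both an enum constant and a named superblock bit that can be set and tested at runtime. A minimal standalone sketch of the pattern, with simplified stand-in macro names (not the real bcachefs definitions):

#include <stdio.h>

#define FEATURES()                              \
        x(btree_ptr_v2,                 11)     \
        x(extents_above_btree_updates,  12)     \
        x(btree_updates_journalled,     13)

enum feature {
#define x(f, n) FEATURE_##f = n,
        FEATURES()
#undef x
};

static const char * const feature_names[] = {
#define x(f, n) [n] = #f,
        FEATURES()
#undef x
};

int main(void)
{
        /* superblock advertises the new feature: */
        unsigned long long sb_features = 1ULL << FEATURE_btree_updates_journalled;

        for (int i = 0; i < 14; i++)
                if ((sb_features & (1ULL << i)) && feature_names[i])
                        printf("feature bit %d: %s\n", i, feature_names[i]);
        return 0;
}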
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index 9106bea9ac067d2a7eb4c4908ce461f037d5e3fe..cbcfbd26bc581118c9a8f41d8043fc99591e6911 100644
@@ -400,9 +400,8 @@ static inline int bch2_compile_bkey_format(const struct bkey_format *format,
 static inline void bkey_reassemble(struct bkey_i *dst,
                                   struct bkey_s_c src)
 {
-       BUG_ON(bkey_packed(src.k));
        dst->k = *src.k;
-       memcpy_u64s(&dst->v, src.v, bkey_val_u64s(src.k));
+       memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k));
 }
 
 #define bkey_s_null            ((struct bkey_s)   { .k = NULL })
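
bkey_reassemble() drops its BUG_ON and switches the value copy to memcpy_u64s_small(): key values are always a whole number of u64s and usually tiny, so a word-at-a-time loop is a good fit. A plausible standalone version of such a helper, for illustration only (not the kernel implementation):

#include <stdio.h>
#include <stdint.h>

/* Copy a value that is known to be a whole number of u64s; for the
 * small counts bkey values have, a plain loop beats a general memcpy. */
static void memcpy_u64s_small(void *dst, const void *src, unsigned u64s)
{
        uint64_t *d = dst;
        const uint64_t *s = src;

        while (u64s--)
                *d++ = *s++;
}

int main(void)
{
        uint64_t src[3] = { 1, 2, 3 }, dst[3] = { 0 };

        memcpy_u64s_small(dst, src, 3);
        printf("%llu %llu %llu\n",
               (unsigned long long) dst[0],
               (unsigned long long) dst[1],
               (unsigned long long) dst[2]);
        return 0;
}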
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index c064cf468a9b9a2d0570c95ce210e0f3afed75b5..0aa3d3b9a281c7fad15df4914b7f007861e3cad0 100644
@@ -134,7 +134,7 @@ const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
 
 const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
 {
-       if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
+       if (bkey_cmp(k.k->p, b->data->min_key) < 0)
                return "key before start of btree node";
 
        if (bkey_cmp(k.k->p, b->data->max_key) > 0)
diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c
index 68965a0f973acfbb2b2f5dba0145f344ce762ff4..839e78d1dc35fb3e71fdaff3407a9a50d58cd50d 100644
@@ -130,44 +130,21 @@ bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
        return nr;
 }
 
-static void extent_sort_advance_prev(struct bkey_format *f,
-                                    struct btree_nr_keys *nr,
-                                    struct bkey_packed *start,
-                                    struct bkey_packed **prev)
-{
-       if (*prev) {
-               bch2_bkey_pack(*prev, (void *) *prev, f);
-
-               btree_keys_account_key_add(nr, 0, *prev);
-               *prev = bkey_next(*prev);
-       } else {
-               *prev = start;
-       }
-}
-
 static void extent_sort_append(struct bch_fs *c,
                               struct bkey_format *f,
                               struct btree_nr_keys *nr,
-                              struct bkey_packed *start,
-                              struct bkey_packed **prev,
+                              struct bkey_packed **out,
                               struct bkey_s k)
 {
-       if (bkey_whiteout(k.k))
-               return;
-
-       /*
-        * prev is always unpacked, for key merging - until right before we
-        * advance it:
-        */
+       if (!bkey_whiteout(k.k)) {
+               if (!bch2_bkey_pack_key(*out, k.k, f))
+                       memcpy_u64s_small(*out, k.k, BKEY_U64s);
 
-       if (*prev &&
-           bch2_bkey_merge(c, bkey_i_to_s((void *) *prev), k) ==
-           BCH_MERGE_MERGE)
-               return;
+               memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k));
 
-       extent_sort_advance_prev(f, nr, start, prev);
-
-       bkey_reassemble((void *) *prev, k.s_c);
+               btree_keys_account_key_add(nr, 0, *out);
+               *out = bkey_next(*out);
+       }
 }
 
 /* Sort + repack in a new format: */
@@ -201,7 +178,7 @@ bch2_sort_repack(struct bset *dst, struct btree *src,
        return nr;
 }
 
-/* Sort, repack, and merge: */
+/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */
 struct btree_nr_keys
 bch2_sort_repack_merge(struct bch_fs *c,
                       struct bset *dst, struct btree *src,
@@ -209,7 +186,7 @@ bch2_sort_repack_merge(struct bch_fs *c,
                       struct bkey_format *out_f,
                       bool filter_whiteouts)
 {
-       struct bkey_packed *prev = NULL, *k_packed;
+       struct bkey_packed *out = vstruct_last(dst), *k_packed;
        struct bkey_on_stack k;
        struct btree_nr_keys nr;
 
@@ -234,13 +211,10 @@ bch2_sort_repack_merge(struct bch_fs *c,
                    bch2_bkey_normalize(c, bkey_i_to_s(k.k)))
                        continue;
 
-               extent_sort_append(c, out_f, &nr, vstruct_last(dst),
-                                  &prev, bkey_i_to_s(k.k));
+               extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k));
        }
 
-       extent_sort_advance_prev(out_f, &nr, vstruct_last(dst), &prev);
-
-       dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
+       dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
        bkey_on_stack_exit(&k, c);
        return nr;
 }
@@ -337,7 +311,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
        struct btree *b = iter->b;
        struct bkey_format *f = &b->format;
        struct sort_iter_set *_l = iter->data, *_r = iter->data + 1;
-       struct bkey_packed *prev = NULL;
+       struct bkey_packed *out = dst->start;
        struct bkey l_unpacked, r_unpacked;
        struct bkey_s l, r;
        struct btree_nr_keys nr;
@@ -360,7 +334,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
                l = __bkey_disassemble(b, _l->k, &l_unpacked);
 
                if (iter->used == 1) {
-                       extent_sort_append(c, f, &nr, dst->start, &prev, l);
+                       extent_sort_append(c, f, &nr, &out, l);
                        extent_iter_advance(iter, 0);
                        continue;
                }
@@ -369,7 +343,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
 
                /* If current key and next key don't overlap, just append */
                if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
-                       extent_sort_append(c, f, &nr, dst->start, &prev, l);
+                       extent_sort_append(c, f, &nr, &out, l);
                        extent_iter_advance(iter, 0);
                        continue;
                }
@@ -414,17 +388,15 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
                        __sort_iter_sift(iter, 0,
                                         extent_sort_fix_overlapping_cmp);
 
-                       extent_sort_append(c, f, &nr, dst->start,
-                                          &prev, bkey_i_to_s(split.k));
+                       extent_sort_append(c, f, &nr, &out,
+                                          bkey_i_to_s(split.k));
                } else {
                        bch2_cut_back_s(bkey_start_pos(r.k), l);
                        extent_save(b, _l->k, l.k);
                }
        }
 
-       extent_sort_advance_prev(f, &nr, dst->start, &prev);
-
-       dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
+       dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
 
        bkey_on_stack_exit(&split, c);
        return nr;
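
The rewritten extent sort no longer merges extents: extent_sort_append() writes each key straight to an output cursor (packing it when the format allows) and advances past it, instead of staging it in prev. For reference, a toy sketch of that cursor-style append, using a simplified length-prefixed record layout rather than real bkeys:

#include <stdio.h>
#include <stdint.h>

struct rec {
        uint64_t u64s;          /* length of data[], in u64s */
        uint64_t data[];
};

static struct rec *rec_next(struct rec *r)
{
        return (struct rec *) ((uint64_t *) r + 1 + r->u64s);
}

/* Append a record at *out and advance the cursor past it: */
static void append(struct rec **out, const uint64_t *data, uint64_t u64s)
{
        (*out)->u64s = u64s;
        for (uint64_t i = 0; i < u64s; i++)
                (*out)->data[i] = data[i];
        *out = rec_next(*out);
}

int main(void)
{
        uint64_t buf[32];
        struct rec *out = (struct rec *) buf;
        uint64_t a[2] = { 10, 20 }, b[1] = { 30 };

        append(&out, a, 2);
        append(&out, b, 1);
        printf("wrote %td u64s\n", (uint64_t *) out - buf);
        return 0;
}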
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index e9df7e82a7669488c35d1fdfb9aa18a686fdfbbd..5c3e7e165fcfb5fa3e24d5fcb57d26d85f20c92b 100644
@@ -588,6 +588,7 @@ err:
 static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
                                struct btree_iter *iter,
                                const struct bkey_i *k,
+                               enum btree_id btree_id,
                                unsigned level,
                                enum six_lock_type lock_type,
                                bool sync)
@@ -600,7 +601,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
         * Parent node must be locked, else we could read in a btree node that's
         * been freed:
         */
-       if (!bch2_btree_node_relock(iter, level + 1))
+       if (iter && !bch2_btree_node_relock(iter, level + 1))
                return ERR_PTR(-EINTR);
 
        b = bch2_btree_node_mem_alloc(c);
@@ -608,7 +609,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
                return b;
 
        bkey_copy(&b->key, k);
-       if (bch2_btree_node_hash_insert(bc, b, level, iter->btree_id)) {
+       if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
                /* raced with another fill: */
 
                /* mark as unhashed... */
@@ -628,7 +629,7 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
         *
         * XXX: ideally should be dropping all btree node locks here
         */
-       if (btree_node_read_locked(iter, level + 1))
+       if (iter && btree_node_read_locked(iter, level + 1))
                btree_node_unlock(iter, level + 1);
 
        bch2_btree_node_read(c, b, sync);
@@ -676,7 +677,8 @@ retry:
                 * else we could read in a btree node from disk that's been
                 * freed:
                 */
-               b = bch2_btree_node_fill(c, iter, k, level, lock_type, true);
+               b = bch2_btree_node_fill(c, iter, k, iter->btree_id,
+                                        level, lock_type, true);
 
                /* We raced and found the btree node in the cache */
                if (!b)
@@ -762,6 +764,74 @@ lock_node:
        return b;
 }
 
+struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
+                                        const struct bkey_i *k,
+                                        enum btree_id btree_id,
+                                        unsigned level)
+{
+       struct btree_cache *bc = &c->btree_cache;
+       struct btree *b;
+       struct bset_tree *t;
+
+       EBUG_ON(level >= BTREE_MAX_DEPTH);
+
+       b = btree_node_mem_ptr(k);
+       if (b)
+               goto lock_node;
+retry:
+       b = btree_cache_find(bc, k);
+       if (unlikely(!b)) {
+               b = bch2_btree_node_fill(c, NULL, k, btree_id,
+                                        level, SIX_LOCK_read, true);
+
+               /* We raced and found the btree node in the cache */
+               if (!b)
+                       goto retry;
+
+               if (IS_ERR(b))
+                       return b;
+       } else {
+lock_node:
+               six_lock_read(&b->lock);
+
+               if (unlikely(b->hash_val != btree_ptr_hash_val(k) ||
+                            b->btree_id != btree_id ||
+                            b->level != level)) {
+                       six_unlock_read(&b->lock);
+                       goto retry;
+               }
+       }
+
+       /* XXX: waiting on IO with btree locks held: */
+       wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+                      TASK_UNINTERRUPTIBLE);
+
+       prefetch(b->aux_data);
+
+       for_each_bset(b, t) {
+               void *p = (u64 *) b->aux_data + t->aux_data_offset;
+
+               prefetch(p + L1_CACHE_BYTES * 0);
+               prefetch(p + L1_CACHE_BYTES * 1);
+               prefetch(p + L1_CACHE_BYTES * 2);
+       }
+
+       /* avoid atomic set bit if it's not needed: */
+       if (!btree_node_accessed(b))
+               set_btree_node_accessed(b);
+
+       if (unlikely(btree_node_read_error(b))) {
+               six_unlock_read(&b->lock);
+               return ERR_PTR(-EIO);
+       }
+
+       EBUG_ON(b->btree_id != btree_id ||
+               BTREE_NODE_LEVEL(b->data) != level ||
+               bkey_cmp(b->data->max_key, k->k.p));
+
+       return b;
+}
+
 struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
                                          struct btree_iter *iter,
                                          struct btree *b,
@@ -876,7 +946,8 @@ void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
        if (b)
                return;
 
-       bch2_btree_node_fill(c, iter, k, level, SIX_LOCK_read, false);
+       bch2_btree_node_fill(c, iter, k, iter->btree_id,
+                            level, SIX_LOCK_read, false);
 }
 
 void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
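
The new bch2_btree_node_get_noiter() reads a btree node without a btree_iter, which is why bch2_btree_node_fill() now takes the btree_id explicitly and tolerates iter == NULL. Its control flow is the usual lookup-or-fill-with-retry: a fill that loses a race returns NULL and the lookup is retried. A toy single-threaded sketch of that shape, with hypothetical cache_find()/cache_fill() helpers:

#include <stdio.h>

struct node { int id; };

static struct node cached;
static int filled;

static struct node *cache_find(int id)
{
        return filled && cached.id == id ? &cached : NULL;
}

static struct node *cache_fill(int id)
{
        /* pretend another thread won the race to fill the cache: */
        cached.id = id;
        filled = 1;
        return NULL;            /* NULL means "raced, retry the lookup" */
}

static struct node *node_get(int id)
{
        struct node *n;
retry:
        n = cache_find(id);
        if (!n) {
                n = cache_fill(id);
                if (!n)
                        goto retry;
        }
        return n;
}

int main(void)
{
        printf("got node %d\n", node_get(42)->id);
        return 0;
}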
diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h
index bc24d92678d3f2713b0ee3c05eaff3aa80867ab4..132cc95a4c0276b615797fae18a8325990054e3c 100644
@@ -25,6 +25,9 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
                                  const struct bkey_i *, unsigned,
                                  enum six_lock_type);
 
+struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
+                                        enum btree_id, unsigned);
+
 struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
                                struct btree *, enum btree_node_sibling);
 
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index c5a0c0ed22a0084b8688eb17dec15128d2af09dd..7c89a6dd7f5a1302f1e95b6c3cbb31803b75a3e9 100644
@@ -184,16 +184,8 @@ fsck_err:
        return ret;
 }
 
-static bool pos_in_journal_keys(struct journal_keys *journal_keys,
-                               enum btree_id id, struct bpos pos)
-{
-       struct journal_key *k = journal_key_search(journal_keys, id, pos);
-
-       return k && k->btree_id == id && !bkey_cmp(k->k->k.p, pos);
-}
-
 static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
-                             struct journal_keys *journal_keys, bool initial)
+                             bool initial)
 {
        struct btree_node_iter iter;
        struct bkey unpacked;
@@ -207,10 +199,6 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
 
        for_each_btree_node_key_unpack(b, k, &iter,
                                       &unpacked) {
-               if (!b->level && journal_keys &&
-                   pos_in_journal_keys(journal_keys, b->btree_id, k.k->p))
-                       continue;
-
                bch2_bkey_debugcheck(c, b, k);
 
                ret = bch2_gc_mark_key(c, k, max_stale, initial);
@@ -222,7 +210,6 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
 }
 
 static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
-                        struct journal_keys *journal_keys,
                         bool initial, bool metadata_only)
 {
        struct btree_trans trans;
@@ -250,8 +237,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
 
                gc_pos_set(c, gc_pos_btree_node(b));
 
-               ret = btree_gc_mark_node(c, b, &max_stale,
-                                        journal_keys, initial);
+               ret = btree_gc_mark_node(c, b, &max_stale, initial);
                if (ret)
                        break;
 
@@ -287,6 +273,78 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
        return ret;
 }
 
+static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
+                                        struct journal_keys *journal_keys,
+                                        unsigned target_depth)
+{
+       struct btree_and_journal_iter iter;
+       struct bkey_s_c k;
+       u8 max_stale = 0;
+       int ret = 0;
+
+       bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b);
+
+       while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+               bch2_bkey_debugcheck(c, b, k);
+
+               ret = bch2_gc_mark_key(c, k, &max_stale, true);
+               if (ret)
+                       break;
+
+               if (b->level > target_depth) {
+                       struct btree *child;
+                       BKEY_PADDED(k) tmp;
+
+                       bkey_reassemble(&tmp.k, k);
+
+                       child = bch2_btree_node_get_noiter(c, &tmp.k,
+                                               b->btree_id, b->level - 1);
+                       ret = PTR_ERR_OR_ZERO(child);
+                       if (ret)
+                               break;
+
+                       bch2_gc_btree_init_recurse(c, child,
+                                       journal_keys, target_depth);
+                       six_unlock_read(&child->lock);
+               }
+
+               bch2_btree_and_journal_iter_advance(&iter);
+       }
+
+       return ret;
+}
+
+static int bch2_gc_btree_init(struct bch_fs *c,
+                             struct journal_keys *journal_keys,
+                             enum btree_id btree_id,
+                             bool metadata_only)
+{
+       struct btree *b;
+       unsigned target_depth = metadata_only           ? 1
+               : expensive_debug_checks(c)             ? 0
+               : !btree_node_type_needs_gc(btree_id)   ? 1
+               : 0;
+       u8 max_stale = 0;
+       int ret = 0;
+
+       b = c->btree_roots[btree_id].b;
+
+       if (btree_node_fake(b))
+               return 0;
+
+       six_lock_read(&b->lock);
+       if (b->level >= target_depth)
+               ret = bch2_gc_btree_init_recurse(c, b,
+                                       journal_keys, target_depth);
+
+       if (!ret)
+               ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
+                                      &max_stale, true);
+       six_unlock_read(&b->lock);
+
+       return ret;
+}
+
 static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
 {
        return  (int) btree_id_to_gc_phase(l) -
@@ -305,27 +363,12 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
 
        for (i = 0; i < BTREE_ID_NR; i++) {
                enum btree_id id = ids[i];
-               enum btree_node_type type = __btree_node_type(0, id);
-
-               int ret = bch2_gc_btree(c, id, journal_keys,
-                                       initial, metadata_only);
+               int ret = initial
+                       ? bch2_gc_btree_init(c, journal_keys,
+                                            id, metadata_only)
+                       : bch2_gc_btree(c, id, initial, metadata_only);
                if (ret)
                        return ret;
-
-               if (journal_keys && !metadata_only &&
-                   btree_node_type_needs_gc(type)) {
-                       struct journal_key *j;
-                       u8 max_stale;
-                       int ret;
-
-                       for_each_journal_key(*journal_keys, j)
-                               if (j->btree_id == id) {
-                                       ret = bch2_gc_mark_key(c, bkey_i_to_s_c(j->k),
-                                                              &max_stale, initial);
-                                       if (ret)
-                                               return ret;
-                               }
-               }
        }
 
        return 0;
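
Initial (pre-journal-replay) GC now walks each tree itself: bch2_gc_btree_init_recurse() marks the keys in a node, overlaying keys from the journal, and for interior nodes above target_depth reads each child with bch2_btree_node_get_noiter() and recurses. A toy sketch of just the recursion shape, with marking and child I/O abstracted away:

#include <stdio.h>

struct node {
        unsigned level;
        unsigned nr;
        struct node *children[4];
};

static void mark_node(struct node *b)
{
        printf("marking node at level %u\n", b->level);
}

static void gc_recurse(struct node *b, unsigned target_depth)
{
        mark_node(b);

        /* recurse into children until we reach the target depth: */
        if (b->level > target_depth)
                for (unsigned i = 0; i < b->nr; i++)
                        gc_recurse(b->children[i], target_depth);
}

int main(void)
{
        struct node leaf0 = { 0 }, leaf1 = { 0 };
        struct node root  = { 1, 2, { &leaf0, &leaf1 } };

        gc_recurse(&root, 0);
        return 0;
}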
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 3f7c1042004249242663b128a3fb0a13a3836c42..b48d48b8c27df7c67ac000fe2aa5c91a01885553 100644
@@ -1261,7 +1261,6 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
                closure_put(&((struct btree_update *) new)->cl);
 
        bch2_journal_pin_drop(&c->journal, &w->journal);
-       closure_wake_up(&w->wait);
 }
 
 static void btree_node_write_done(struct bch_fs *c, struct btree *b)
@@ -1618,9 +1617,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
        wbio->wbio.bio.bi_end_io        = btree_node_write_endio;
        wbio->wbio.bio.bi_private       = b;
 
-       if (b->level || !b->written)
-               wbio->wbio.bio.bi_opf |= REQ_FUA;
-
        bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9);
 
        /*
@@ -1794,12 +1790,11 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
        rcu_read_lock();
        for_each_cached_btree(b, c, tbl, i, pos) {
                unsigned long flags = READ_ONCE(b->flags);
-               unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0;
 
                if (!(flags & (1 << BTREE_NODE_dirty)))
                        continue;
 
-               pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu c %u p %u\n",
+               pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu\n",
                       b,
                       (flags & (1 << BTREE_NODE_dirty)) != 0,
                       (flags & (1 << BTREE_NODE_need_write)) != 0,
@@ -1807,9 +1802,7 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
                       b->written,
                       !list_empty_careful(&b->write_blocked),
                       b->will_make_reachable != 0,
-                      b->will_make_reachable & 1,
-                      b->writes[ idx].wait.list.first != NULL,
-                      b->writes[!idx].wait.list.first != NULL);
+                      b->will_make_reachable & 1);
        }
        rcu_read_unlock();
 
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index e90e89eee2731f34bb25e4d17087e05ca07386d3..fd719dda7d91696efa4121a3ac4ed717352dfebb 100644
@@ -102,19 +102,20 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
 void bch2_btree_node_write(struct bch_fs *, struct btree *,
                          enum six_lock_type);
 
-static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b)
+static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
+                                           enum six_lock_type lock_held)
 {
        while (b->written &&
               btree_node_need_write(b) &&
               btree_node_may_write(b)) {
                if (!btree_node_write_in_flight(b)) {
-                       bch2_btree_node_write(c, b, SIX_LOCK_read);
+                       bch2_btree_node_write(c, b, lock_held);
                        break;
                }
 
                six_unlock_read(&b->lock);
                btree_node_wait_on_io(b);
-               btree_node_lock_type(c, b, SIX_LOCK_read);
+               btree_node_lock_type(c, b, lock_held);
        }
 }
 
@@ -131,7 +132,7 @@ do {                                                                        \
                new |= (1 << BTREE_NODE_need_write);                    \
        } while ((v = cmpxchg(&(_b)->flags, old, new)) != old);         \
                                                                        \
-       btree_node_write_if_need(_c, _b);                               \
+       btree_node_write_if_need(_c, _b, SIX_LOCK_read);                \
 } while (0)
 
 void bch2_btree_flush_all_reads(struct bch_fs *);
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 2819b9a487f2361e006dc054cb468b4ae1c249e8..6ed688cdcfdee2ca681801559cede96372f1bf6f 100644
@@ -1068,7 +1068,14 @@ retry_all:
                        goto retry_all;
        }
 
-       ret = hweight64(trans->iters_live) > 1 ? -EINTR : 0;
+       if (hweight64(trans->iters_live) > 1)
+               ret = -EINTR;
+       else
+               trans_for_each_iter(trans, iter)
+                       if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) {
+                               ret = -EINTR;
+                               break;
+                       }
 out:
        bch2_btree_cache_cannibalize_unlock(c);
        return ret;
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 51d579a4ffae9fb2587bad891328778714e828c0..31a5c215ca3496eb77d2c2a929ca29ccd159c8c4 100644
@@ -53,7 +53,6 @@ struct bset_tree {
 
 struct btree_write {
        struct journal_entry_pin        journal;
-       struct closure_waitlist         wait;
 };
 
 struct btree_alloc {
@@ -261,6 +260,11 @@ static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter)
        return iter->flags & BTREE_ITER_TYPE;
 }
 
+static inline struct btree_iter_level *iter_l(struct btree_iter *iter)
+{
+       return iter->l + iter->level;
+}
+
 struct btree_insert_entry {
        unsigned                trigger_flags;
        unsigned                trans_triggers_run:1;
@@ -539,8 +543,6 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type)
 struct btree_root {
        struct btree            *b;
 
-       struct btree_update     *as;
-
        /* On disk root - see async splits: */
        __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
        u8                      level;
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index 9f58d47ef5d6abbecfaa703a9fc4e351ea298a86..11f7d02de622b0d8ebeeed37056b983b2ac241fd 100644
@@ -12,6 +12,7 @@ void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *,
                                     struct btree_iter *);
 bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
                                struct btree_node_iter *, struct bkey_i *);
+void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
 
 enum btree_insert_flags {
        __BTREE_INSERT_NOUNLOCK,
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 02f19146326287fb849c495a092d4acaf5e3098f..bc7749c8060abfb94a8e641ab66e15920b91b8bc 100644
@@ -24,7 +24,6 @@
 static void btree_node_will_make_reachable(struct btree_update *,
                                           struct btree *);
 static void btree_update_drop_new_node(struct bch_fs *, struct btree *);
-static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *, int);
 
 /* Debug code: */
 
@@ -260,16 +259,17 @@ void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b,
 }
 
 static void bch2_btree_node_free_ondisk(struct bch_fs *c,
-                                       struct pending_btree_node_free *pending)
+                       struct pending_btree_node_free *pending,
+                       u64 journal_seq)
 {
        BUG_ON(!pending->index_update_done);
 
        bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
-                     0, 0, NULL, 0, BTREE_TRIGGER_OVERWRITE);
+                     0, 0, NULL, journal_seq, BTREE_TRIGGER_OVERWRITE);
 
        if (gc_visited(c, gc_phase(GC_PHASE_PENDING_DELETE)))
                bch2_mark_key(c, bkey_i_to_s_c(&pending->key),
-                             0, 0, NULL, 0,
+                             0, 0, NULL, journal_seq,
                              BTREE_TRIGGER_OVERWRITE|
                              BTREE_TRIGGER_GC);
 }
@@ -585,10 +585,13 @@ static void bch2_btree_update_free(struct btree_update *as)
 {
        struct bch_fs *c = as->c;
 
+       bch2_journal_preres_put(&c->journal, &as->journal_preres);
+
+       bch2_journal_pin_drop(&c->journal, &as->journal);
        bch2_journal_pin_flush(&c->journal, &as->journal);
 
-       BUG_ON(as->nr_new_nodes);
-       BUG_ON(as->nr_pending);
+       BUG_ON((as->nr_new_nodes || as->nr_pending) &&
+              !bch2_journal_error(&c->journal));
 
        if (as->reserve)
                bch2_btree_reserve_put(c, as->reserve);
@@ -603,13 +606,10 @@ static void bch2_btree_update_free(struct btree_update *as)
        mutex_unlock(&c->btree_interior_update_lock);
 }
 
-static void btree_update_nodes_reachable(struct closure *cl)
+static void btree_update_nodes_reachable(struct btree_update *as, u64 seq)
 {
-       struct btree_update *as = container_of(cl, struct btree_update, cl);
        struct bch_fs *c = as->c;
 
-       bch2_journal_pin_drop(&c->journal, &as->journal);
-
        mutex_lock(&c->btree_interior_update_lock);
 
        while (as->nr_new_nodes) {
@@ -630,39 +630,22 @@ static void btree_update_nodes_reachable(struct closure *cl)
        }
 
        while (as->nr_pending)
-               bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]);
+               bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending],
+                                           seq);
 
        mutex_unlock(&c->btree_interior_update_lock);
-
-       closure_wake_up(&as->wait);
-
-       bch2_btree_update_free(as);
-}
-
-static void btree_update_wait_on_journal(struct closure *cl)
-{
-       struct btree_update *as = container_of(cl, struct btree_update, cl);
-       struct bch_fs *c = as->c;
-       int ret;
-
-       ret = bch2_journal_open_seq_async(&c->journal, as->journal_seq, cl);
-       if (ret == -EAGAIN) {
-               continue_at(cl, btree_update_wait_on_journal, system_wq);
-               return;
-       }
-       if (ret < 0)
-               goto err;
-
-       bch2_journal_flush_seq_async(&c->journal, as->journal_seq, cl);
-err:
-       continue_at(cl, btree_update_nodes_reachable, system_wq);
 }
 
 static void btree_update_nodes_written(struct closure *cl)
 {
        struct btree_update *as = container_of(cl, struct btree_update, cl);
+       struct journal_res res = { 0 };
        struct bch_fs *c = as->c;
        struct btree *b;
+       struct bset *i;
+       struct bkey_i *k;
+       unsigned journal_u64s = 0;
+       int ret;
 
        /*
         * We did an update to a parent node where the pointers we added pointed
@@ -671,7 +654,7 @@ static void btree_update_nodes_written(struct closure *cl)
         */
        mutex_lock(&c->btree_interior_update_lock);
        as->nodes_written = true;
-retry:
+again:
        as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
                                      struct btree_update, unwritten_list);
        if (!as || !as->nodes_written) {
@@ -679,31 +662,53 @@ retry:
                return;
        }
 
+       b = as->b;
+       if (b && !six_trylock_intent(&b->lock)) {
+               mutex_unlock(&c->btree_interior_update_lock);
+               btree_node_lock_type(c, b, SIX_LOCK_intent);
+               six_unlock_intent(&b->lock);
+               goto out;
+       }
+
+       journal_u64s = 0;
+
+       if (as->mode != BTREE_INTERIOR_UPDATING_ROOT)
+               for_each_keylist_key(&as->parent_keys, k)
+                       journal_u64s += jset_u64s(k->k.u64s);
+
+       ret = bch2_journal_res_get(&c->journal, &res, journal_u64s,
+                                  JOURNAL_RES_GET_RESERVED);
+       if (ret) {
+               BUG_ON(!bch2_journal_error(&c->journal));
+               /* can't unblock btree writes */
+               goto free_update;
+       }
+
+       if (as->mode != BTREE_INTERIOR_UPDATING_ROOT)
+               for_each_keylist_key(&as->parent_keys, k)
+                       bch2_journal_add_entry(&c->journal, &res,
+                                              BCH_JSET_ENTRY_btree_keys,
+                                              as->btree_id,
+                                              as->level,
+                                              k, k->k.u64s);
+
        switch (as->mode) {
        case BTREE_INTERIOR_NO_UPDATE:
                BUG();
        case BTREE_INTERIOR_UPDATING_NODE:
-               /* The usual case: */
-               b = READ_ONCE(as->b);
-
-               if (!six_trylock_read(&b->lock)) {
-                       mutex_unlock(&c->btree_interior_update_lock);
-                       btree_node_lock_type(c, b, SIX_LOCK_read);
-                       six_unlock_read(&b->lock);
-                       mutex_lock(&c->btree_interior_update_lock);
-                       goto retry;
-               }
-
-               BUG_ON(!btree_node_dirty(b));
-               closure_wait(&btree_current_write(b)->wait, &as->cl);
+               /* @b is the node we did the final insert into: */
+               BUG_ON(!res.ref);
 
+               six_lock_write(&b->lock);
                list_del(&as->write_blocked_list);
 
-               /*
-                * for flush_held_btree_writes() waiting on updates to flush or
-                * nodes to be writeable:
-                */
-               closure_wake_up(&c->btree_interior_update_wait);
+               i = btree_bset_last(b);
+               i->journal_seq = cpu_to_le64(
+                       max(res.seq,
+                           le64_to_cpu(i->journal_seq)));
+
+               bch2_btree_add_journal_pin(c, b, res.seq);
+               six_unlock_write(&b->lock);
 
                list_del(&as->unwritten_list);
                mutex_unlock(&c->btree_interior_update_lock);
@@ -712,82 +717,51 @@ retry:
                 * b->write_blocked prevented it from being written, so
                 * write it now if it needs to be written:
                 */
-               bch2_btree_node_write_cond(c, b, true);
-               six_unlock_read(&b->lock);
-               continue_at(&as->cl, btree_update_nodes_reachable, system_wq);
+               btree_node_write_if_need(c, b, SIX_LOCK_intent);
+               six_unlock_intent(&b->lock);
                break;
 
        case BTREE_INTERIOR_UPDATING_AS:
-               /*
-                * The btree node we originally updated has been freed and is
-                * being rewritten - so we need to write anything here, we just
-                * need to signal to that btree_update that it's ok to make the
-                * new replacement node visible:
-                */
-               closure_put(&as->parent_as->cl);
-
-               /*
-                * and then we have to wait on that btree_update to finish:
-                */
-               closure_wait(&as->parent_as->wait, &as->cl);
+               BUG_ON(b);
 
                list_del(&as->unwritten_list);
                mutex_unlock(&c->btree_interior_update_lock);
-
-               continue_at(&as->cl, btree_update_nodes_reachable, system_wq);
                break;
 
-       case BTREE_INTERIOR_UPDATING_ROOT:
-               /* b is the new btree root: */
-               b = READ_ONCE(as->b);
-
-               if (!six_trylock_read(&b->lock)) {
-                       mutex_unlock(&c->btree_interior_update_lock);
-                       btree_node_lock_type(c, b, SIX_LOCK_read);
-                       six_unlock_read(&b->lock);
-                       mutex_lock(&c->btree_interior_update_lock);
-                       goto retry;
-               }
-
-               BUG_ON(c->btree_roots[b->btree_id].as != as);
-               c->btree_roots[b->btree_id].as = NULL;
+       case BTREE_INTERIOR_UPDATING_ROOT: {
+               struct btree_root *r = &c->btree_roots[as->btree_id];
 
-               bch2_btree_set_root_ondisk(c, b, WRITE);
+               BUG_ON(b);
 
-               /*
-                * We don't have to wait anything anything here (before
-                * btree_update_nodes_reachable frees the old nodes
-                * ondisk) - we've ensured that the very next journal write will
-                * have the pointer to the new root, and before the allocator
-                * can reuse the old nodes it'll have to do a journal commit:
-                */
-               six_unlock_read(&b->lock);
+               mutex_lock(&c->btree_root_lock);
+               bkey_copy(&r->key, as->parent_keys.keys);
+               r->level = as->level;
+               r->alive = true;
+               c->btree_roots_dirty = true;
+               mutex_unlock(&c->btree_root_lock);
 
                list_del(&as->unwritten_list);
                mutex_unlock(&c->btree_interior_update_lock);
-
-               /*
-                * Bit of funny circularity going on here we have to break:
-                *
-                * We have to drop our journal pin before writing the journal
-                * entry that points to the new btree root: else, we could
-                * deadlock if the journal currently happens to be full.
-                *
-                * This mean we're dropping the journal pin _before_ the new
-                * nodes are technically reachable - but this is safe, because
-                * after the bch2_btree_set_root_ondisk() call above they will
-                * be reachable as of the very next journal write:
-                */
-               bch2_journal_pin_drop(&c->journal, &as->journal);
-
-               as->journal_seq = bch2_journal_last_unwritten_seq(&c->journal);
-
-               btree_update_wait_on_journal(&as->cl);
                break;
        }
+       }
 
+       bch2_journal_pin_drop(&c->journal, &as->journal);
+
+       bch2_journal_res_put(&c->journal, &res);
+       bch2_journal_preres_put(&c->journal, &as->journal_preres);
+
+       btree_update_nodes_reachable(as, res.seq);
+free_update:
+       bch2_btree_update_free(as);
+       /*
+        * for flush_held_btree_writes() waiting on updates to flush or
+        * nodes to be writeable:
+        */
+       closure_wake_up(&c->btree_interior_update_wait);
+out:
        mutex_lock(&c->btree_interior_update_lock);
-       goto retry;
+       goto again;
 }
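
This is the heart of the change: instead of waiting on journal flushes, btree_update_nodes_written() journals the new interior node pointers itself. It sizes a journal reservation (pre-reserved via journal_preres at update start) as the sum of jset_u64s() over parent_keys, then emits each key as a BCH_JSET_ENTRY_btree_keys entry. A standalone sketch of that space accounting; the 1-u64 entry-header size is an assumption for illustration, not taken from the bcachefs headers:

#include <stdio.h>

#define JSET_ENTRY_HDR_U64s 1   /* assumed 8-byte jset entry header */

static unsigned jset_u64s(unsigned key_u64s)
{
        return key_u64s + JSET_ENTRY_HDR_U64s;
}

int main(void)
{
        unsigned parent_keys[] = { 5, 8, 5 };   /* key sizes, in u64s */
        unsigned journal_u64s = 0;

        for (unsigned i = 0; i < 3; i++)
                journal_u64s += jset_u64s(parent_keys[i]);

        printf("journal reservation needed: %u u64s\n", journal_u64s);
        return 0;
}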
 
 /*
@@ -804,48 +778,12 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
        BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
        BUG_ON(!btree_node_dirty(b));
 
-       as->mode = BTREE_INTERIOR_UPDATING_NODE;
-       as->b = b;
+       as->mode        = BTREE_INTERIOR_UPDATING_NODE;
+       as->b           = b;
+       as->level       = b->level;
        list_add(&as->write_blocked_list, &b->write_blocked);
 
        mutex_unlock(&c->btree_interior_update_lock);
-
-       /*
-        * In general, when you're staging things in a journal that will later
-        * be written elsewhere, and you also want to guarantee ordering: that
-        * is, if you have updates a, b, c, after a crash you should never see c
-        * and not a or b - there's a problem:
-        *
-        * If the final destination of the update(s) (i.e. btree node) can be
-        * written/flushed _before_ the relevant journal entry - oops, that
-        * breaks ordering, since the various leaf nodes can be written in any
-        * order.
-        *
-        * Normally we use bset->journal_seq to deal with this - if during
-        * recovery we find a btree node write that's newer than the newest
-        * journal entry, we just ignore it - we don't need it, anything we're
-        * supposed to have (that we reported as completed via fsync()) will
-        * still be in the journal, and as far as the state of the journal is
-        * concerned that btree node write never happened.
-        *
-        * That breaks when we're rewriting/splitting/merging nodes, since we're
-        * mixing btree node writes that haven't happened yet with previously
-        * written data that has been reported as completed to the journal.
-        *
-        * Thus, before making the new nodes reachable, we have to wait the
-        * newest journal sequence number we have data for to be written (if it
-        * hasn't been yet).
-        */
-       bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl);
-}
-
-static void interior_update_flush(struct journal *j,
-                       struct journal_entry_pin *pin, u64 seq)
-{
-       struct btree_update *as =
-               container_of(pin, struct btree_update, journal);
-
-       bch2_journal_flush_seq_async(j, as->journal_seq, NULL);
 }
 
 static void btree_update_reparent(struct btree_update *as,
@@ -853,10 +791,10 @@ static void btree_update_reparent(struct btree_update *as,
 {
        struct bch_fs *c = as->c;
 
+       lockdep_assert_held(&c->btree_interior_update_lock);
+
        child->b = NULL;
        child->mode = BTREE_INTERIOR_UPDATING_AS;
-       child->parent_as = as;
-       closure_get(&as->cl);
 
        /*
         * When we write a new btree root, we have to drop our journal pin
@@ -867,46 +805,24 @@ static void btree_update_reparent(struct btree_update *as,
         * just transfer the journal pin to the new interior update so
         * btree_update_nodes_written() can drop it.
         */
-       bch2_journal_pin_copy(&c->journal, &as->journal,
-                             &child->journal, interior_update_flush);
+       bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
        bch2_journal_pin_drop(&c->journal, &child->journal);
-
-       as->journal_seq = max(as->journal_seq, child->journal_seq);
 }
 
-static void btree_update_updated_root(struct btree_update *as)
+static void btree_update_updated_root(struct btree_update *as, struct btree *b)
 {
        struct bch_fs *c = as->c;
-       struct btree_root *r = &c->btree_roots[as->btree_id];
-
-       mutex_lock(&c->btree_interior_update_lock);
-       list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
 
        BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+       BUG_ON(!bch2_keylist_empty(&as->parent_keys));
 
-       /*
-        * Old root might not be persistent yet - if so, redirect its
-        * btree_update operation to point to us:
-        */
-       if (r->as)
-               btree_update_reparent(as, r->as);
-
-       as->mode = BTREE_INTERIOR_UPDATING_ROOT;
-       as->b = r->b;
-       r->as = as;
+       mutex_lock(&c->btree_interior_update_lock);
+       list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
 
+       as->mode        = BTREE_INTERIOR_UPDATING_ROOT;
+       as->level       = b->level;
+       bch2_keylist_add(&as->parent_keys, &b->key);
        mutex_unlock(&c->btree_interior_update_lock);
-
-       /*
-        * When we're rewriting nodes and updating interior nodes, there's an
-        * issue with updates that haven't been written in the journal getting
-        * mixed together with older data - see btree_update_updated_node()
-        * for the explanation.
-        *
-        * However, this doesn't affect us when we're writing a new btree root -
-        * because to make that new root reachable we have to write out a new
-        * journal entry, which must necessarily be newer than as->journal_seq.
-        */
 }
 
 static void btree_node_will_make_reachable(struct btree_update *as,
@@ -983,10 +899,8 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
                                               struct btree *b)
 {
        struct bch_fs *c = as->c;
-       struct closure *cl, *cl_n;
        struct btree_update *p, *n;
        struct btree_write *w;
-       struct bset_tree *t;
 
        set_btree_node_dying(b);
 
@@ -995,18 +909,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
 
        btree_interior_update_add_node_reference(as, b);
 
-       /*
-        * Does this node have data that hasn't been written in the journal?
-        *
-        * If so, we have to wait for the corresponding journal entry to be
-        * written before making the new nodes reachable - we can't just carry
-        * over the bset->journal_seq tracking, since we'll be mixing those keys
-        * in with keys that aren't in the journal anymore:
-        */
-       for_each_bset(b, t)
-               as->journal_seq = max(as->journal_seq,
-                                     le64_to_cpu(bset(b, t)->journal_seq));
-
        mutex_lock(&c->btree_interior_update_lock);
 
        /*
@@ -1030,16 +932,6 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
 
        clear_btree_node_dirty(b);
        clear_btree_node_need_write(b);
-       w = btree_current_write(b);
-
-       /*
-        * Does this node have any btree_update operations waiting on this node
-        * to be written?
-        *
-        * If so, wake them up when this btree_update operation is reachable:
-        */
-       llist_for_each_entry_safe(cl, cl_n, llist_del_all(&w->wait.list), list)
-               llist_add(&cl->list, &as->wait.list);
 
        /*
         * Does this node have unwritten data that has a pin on the journal?
@@ -1049,13 +941,12 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
         * oldest pin of any of the nodes we're freeing. We'll release the pin
         * when the new nodes are persistent and reachable on disk:
         */
-       bch2_journal_pin_copy(&c->journal, &as->journal,
-                             &w->journal, interior_update_flush);
+       w = btree_current_write(b);
+       bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
        bch2_journal_pin_drop(&c->journal, &w->journal);
 
        w = btree_prev_write(b);
-       bch2_journal_pin_copy(&c->journal, &as->journal,
-                             &w->journal, interior_update_flush);
+       bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
        bch2_journal_pin_drop(&c->journal, &w->journal);
 
        mutex_unlock(&c->btree_interior_update_lock);
@@ -1078,6 +969,7 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
 {
        struct btree_reserve *reserve;
        struct btree_update *as;
+       int ret;
 
        reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl);
        if (IS_ERR(reserve))
@@ -1094,6 +986,15 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id,
 
        bch2_keylist_init(&as->parent_keys, as->inline_keys);
 
+       ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
+                                jset_u64s(BKEY_BTREE_PTR_U64s_MAX) * 3, 0);
+       if (ret) {
+               bch2_btree_reserve_put(c, reserve);
+               closure_debug_destroy(&as->cl);
+               mempool_free(as, &c->btree_interior_update_pool);
+               return ERR_PTR(ret);
+       }
+
        mutex_lock(&c->btree_interior_update_lock);
        list_add_tail(&as->list, &c->btree_interior_update_list);
        mutex_unlock(&c->btree_interior_update_lock);
@@ -1153,22 +1054,6 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
        mutex_unlock(&c->btree_interior_update_lock);
 }
 
-static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b, int rw)
-{
-       struct btree_root *r = &c->btree_roots[b->btree_id];
-
-       mutex_lock(&c->btree_root_lock);
-
-       BUG_ON(b != r->b);
-       bkey_copy(&r->key, &b->key);
-       r->level = b->level;
-       r->alive = true;
-       if (rw == WRITE)
-               c->btree_roots_dirty = true;
-
-       mutex_unlock(&c->btree_root_lock);
-}
-
 /**
  * bch_btree_set_root - update the root in memory and on disk
  *
@@ -1201,7 +1086,7 @@ static void bch2_btree_set_root(struct btree_update *as, struct btree *b,
 
        bch2_btree_set_root_inmem(as, b);
 
-       btree_update_updated_root(as);
+       btree_update_updated_root(as, b);
 
        /*
         * Unlock old root after new root is visible:
@@ -1471,7 +1356,8 @@ static void btree_split(struct btree_update *as, struct btree *b,
                bch2_btree_build_aux_trees(n1);
                six_unlock_write(&n1->lock);
 
-               bch2_keylist_add(&as->parent_keys, &n1->key);
+               if (parent)
+                       bch2_keylist_add(&as->parent_keys, &n1->key);
        }
 
        bch2_btree_node_write(c, n1, SIX_LOCK_intent);
@@ -1545,12 +1431,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
               (bkey_cmp_packed(b, k, &insert->k) >= 0))
                ;
 
-       while (!bch2_keylist_empty(keys)) {
-               insert = bch2_keylist_front(keys);
-
+       for_each_keylist_key(keys, insert)
                bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter);
-               bch2_keylist_pop_front(keys);
-       }
 
        btree_update_updated_node(as, b);
 
@@ -1630,7 +1512,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
                          unsigned flags)
 {
        struct btree_trans *trans = iter->trans;
-       struct btree *b = iter->l[0].b;
+       struct btree *b = iter_l(iter)->b;
        struct btree_update *as;
        struct closure cl;
        int ret = 0;
@@ -2107,7 +1989,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
                        bkey_copy(&b->key, new_key);
                }
 
-               btree_update_updated_root(as);
+               btree_update_updated_root(as, b);
                bch2_btree_node_unlock_write(b, iter);
        }
 
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index c90fcd48eeb7f7386602e6accd26244be421b880..0ac95dd80a38e987646c12ca6697fe33c1e48849 100644
@@ -69,8 +69,10 @@ struct btree_update {
        unsigned                        nodes_written:1;
 
        enum btree_id                   btree_id;
+       u8                              level;
 
        struct btree_reserve            *reserve;
+       struct journal_preres           journal_preres;
 
        /*
         * BTREE_INTERIOR_UPDATING_NODE:
@@ -83,18 +85,6 @@ struct btree_update {
        struct btree                    *b;
        struct list_head                write_blocked_list;
 
-       /*
-        * BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now
-        * we're now blocking another btree_update
-        * @parent_as - btree_update that's waiting on our nodes to finish
-        * writing, before it can make new nodes visible on disk
-        * @wait - list of child btree_updates that are waiting on this
-        * btree_update to make all the new nodes visible before they can free
-        * their old btree nodes
-        */
-       struct btree_update             *parent_as;
-       struct closure_waitlist         wait;
-
        /*
         * We may be freeing nodes that were dirty, and thus had journal entries
         * pinned: we need to transfer the oldest of those pins to the
@@ -103,8 +93,6 @@ struct btree_update {
         */
        struct journal_entry_pin        journal;
 
-       u64                             journal_seq;
-
        /*
         * Nodes being freed:
         * Protected by c->btree_node_pending_free_lock
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 7c2f72a3a725821226d59fcec7ee3e4d7cb5a1f1..f94bc6a0b699c52a99a70c7a79c227606e6b4815 100644
@@ -24,7 +24,7 @@ static inline bool same_leaf_as_prev(struct btree_trans *trans,
                                     struct btree_insert_entry *i)
 {
        return i != trans->updates2 &&
-               i[0].iter->l[0].b == i[-1].iter->l[0].b;
+               iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b;
 }
 
 inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
@@ -145,6 +145,17 @@ static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin,
        return __btree_node_flush(j, pin, 1, seq);
 }
 
+inline void bch2_btree_add_journal_pin(struct bch_fs *c,
+                                      struct btree *b, u64 seq)
+{
+       struct btree_write *w = btree_current_write(b);
+
+       bch2_journal_pin_add(&c->journal, seq, &w->journal,
+                            btree_node_write_idx(b) == 0
+                            ? btree_node_flush0
+                            : btree_node_flush1);
+}
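
The new helper centralizes how leaf inserts pin the journal: btree nodes double-buffer their writes, so the pin's flush callback (btree_node_flush0/1) must match the write slot the keys landed in. A toy sketch of the slot-selection idea, with simplified types and pin semantics standing in for the real journal machinery:

#include <stdio.h>

struct btree_write { unsigned long pin_seq; };

struct btree {
        unsigned write_idx;             /* slot the next write goes to */
        struct btree_write writes[2];
};

static void add_journal_pin(struct btree *b, unsigned long seq)
{
        /* pin the slot that will contain these keys: */
        struct btree_write *w = &b->writes[b->write_idx & 1];

        if (!w->pin_seq || seq < w->pin_seq)
                w->pin_seq = seq;       /* keep the oldest dependency */
}

int main(void)
{
        struct btree b = { 0 };

        add_journal_pin(&b, 100);
        add_journal_pin(&b, 90);
        printf("slot 0 pinned at journal seq %lu\n", b.writes[0].pin_seq);
        return 0;
}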
+
 static inline void __btree_journal_key(struct btree_trans *trans,
                                       enum btree_id btree_id,
                                       struct bkey_i *insert)
@@ -172,13 +183,8 @@ static void bch2_btree_journal_key(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
        struct journal *j = &c->journal;
-       struct btree *b = iter->l[0].b;
-       struct btree_write *w = btree_current_write(b);
-       u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
-               ? trans->journal_res.seq
-               : j->replay_journal_seq;
+       struct btree *b = iter_l(iter)->b;
 
-       EBUG_ON(iter->level || b->level);
        EBUG_ON(trans->journal_res.ref !=
                !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));
 
@@ -188,10 +194,10 @@ static void bch2_btree_journal_key(struct btree_trans *trans,
                        cpu_to_le64(trans->journal_res.seq);
        }
 
-       bch2_journal_pin_add(j, seq, &w->journal,
-                            btree_node_write_idx(b) == 0
-                            ? btree_node_flush0
-                            : btree_node_flush1);
+       bch2_btree_add_journal_pin(c, b,
+               likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
+                       ? trans->journal_res.seq
+                       : j->replay_journal_seq);
 
        if (unlikely(!btree_node_dirty(b)))
                set_btree_node_dirty(b);
@@ -205,17 +211,15 @@ static void btree_insert_key_leaf(struct btree_trans *trans,
                                  struct bkey_i *insert)
 {
        struct bch_fs *c = trans->c;
-       struct btree *b = iter->l[0].b;
+       struct btree *b = iter_l(iter)->b;
        struct bset_tree *t = bset_tree_last(b);
        int old_u64s = bset_u64s(t);
        int old_live_u64s = b->nr.live_u64s;
        int live_u64s_added, u64s_added;
 
-       EBUG_ON(iter->level);
-
        insert->k.needs_whiteout = false;
 
-       if (likely(bch2_btree_bset_insert_key(iter, b, &iter->l[0].iter, insert)))
+       if (likely(bch2_btree_bset_insert_key(iter, b, &iter_l(iter)->iter, insert)))
                bch2_btree_journal_key(trans, iter, insert);
 
        live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
@@ -241,7 +245,6 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
 
-       BUG_ON(iter->level);
        BUG_ON(bkey_cmp(insert->k.p, iter->pos));
        BUG_ON(debug_check_bkeys(c) &&
               bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id));
@@ -290,7 +293,7 @@ btree_key_can_insert(struct btree_trans *trans,
                     unsigned *u64s)
 {
        struct bch_fs *c = trans->c;
-       struct btree *b = iter->l[0].b;
+       struct btree *b = iter_l(iter)->b;
        static enum btree_insert_ret ret;
 
        if (unlikely(btree_node_fake(b)))
@@ -345,7 +348,7 @@ static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
        struct btree_insert_entry *i;
 
        trans_for_each_update(trans, i)
-               if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
+               if (gc_visited(c, gc_pos_btree_node(iter_l(i->iter)->b)))
                        bch2_mark_update(trans, i->iter, i->k, NULL,
                                         i->trigger_flags|BTREE_TRIGGER_GC);
 }
@@ -461,7 +464,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
        int ret;
 
        trans_for_each_update2(trans, i)
-               BUG_ON(!btree_node_intent_locked(i->iter, 0));
+               BUG_ON(!btree_node_intent_locked(i->iter, i->iter->level));
 
        ret = bch2_journal_preres_get(&trans->c->journal,
                        &trans->journal_preres, trans->journal_preres_u64s,
@@ -495,13 +498,13 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
        trans_for_each_update2(trans, i)
                if (!same_leaf_as_prev(trans, i))
                        bch2_btree_node_lock_for_insert(trans->c,
-                                               i->iter->l[0].b, i->iter);
+                                       iter_l(i->iter)->b, i->iter);
 
        ret = bch2_trans_commit_write_locked(trans, stopped_at);
 
        trans_for_each_update2(trans, i)
                if (!same_leaf_as_prev(trans, i))
-                       bch2_btree_node_unlock_write_inlined(i->iter->l[0].b,
+                       bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b,
                                                             i->iter);
 
        /*
diff --git a/libbcachefs/extent_update.c b/libbcachefs/extent_update.c
index beb3b694e33c2cabb79feffc8a501711d15cf841..8e5070d5a39b4d72ddba9fcb8fa932c46fa57aa9 100644
@@ -44,6 +44,10 @@ static int count_iters_for_insert(struct btree_trans *trans,
         * extent we're inserting and overwriting:
         */
        *nr_iters += 1;
+       if (*nr_iters >= max_iters) {
+               *end = bpos_min(*end, k.k->p);
+               ret = 1;
+       }
 
        switch (k.k->type) {
        case KEY_TYPE_extent:
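
The added check makes count_iters_for_insert() stop as soon as the iterator budget is exhausted, clamping the end of the insert to the offending key's position with bpos_min() so the transaction stays within its reservation. A toy sketch of the clamping, with integer positions standing in for struct bpos:

#include <stdio.h>

static int min_pos(int a, int b) { return a < b ? a : b; }      /* bpos_min() */

int main(void)
{
        int overlapping_keys[] = { 10, 20, 30, 40 };
        int end = 100, nr_iters = 0, max_iters = 3;

        for (int i = 0; i < 4; i++) {
                nr_iters += 1;                  /* one iter per overwrite */
                if (nr_iters >= max_iters) {
                        end = min_pos(end, overlapping_keys[i]);
                        break;                  /* ret = 1: stop scanning */
                }
        }
        printf("insert clamped to end=%d after %d iters\n", end, nr_iters);
        return 0;
}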
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 902c8da9dc15a259ae44415782808836a4b8b9f2..936e6366cb0470da857456b7d4f7e07097881ab2 100644
@@ -478,7 +478,8 @@ static int check_extents(struct bch_fs *c)
        bch_verbose(c, "checking extents");
 
        iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
-                                  POS(BCACHEFS_ROOT_INO, 0), 0);
+                                  POS(BCACHEFS_ROOT_INO, 0),
+                                  BTREE_ITER_INTENT);
 retry:
        for_each_btree_key_continue(iter, 0, k, ret) {
                if (bkey_cmp(prev.p, bkey_start_pos(k.k)) > 0) {
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 02b381cb567b38e484d4c6c6a3256371e576842e..2b428ee73364cf3e9560d46cae808d1cb4843f0c 100644
 
 /* iterate over keys read from the journal: */
 
-struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
+static struct journal_key *journal_key_search(struct journal_keys *journal_keys,
+                                             enum btree_id id, unsigned level,
+                                             struct bpos pos)
 {
-       while (iter->k) {
-               if (iter->k->btree_id == iter->btree_id)
-                       return bkey_i_to_s_c(iter->k->k);
+       size_t l = 0, r = journal_keys->nr, m;
 
-               iter->k++;
-               if (iter->k == iter->keys->d + iter->keys->nr)
-                       iter->k = NULL;
+       while (l < r) {
+               m = l + ((r - l) >> 1);
+               if ((cmp_int(id,        journal_keys->d[m].btree_id) ?:
+                    cmp_int(level,     journal_keys->d[m].level) ?:
+                    bkey_cmp(pos,      journal_keys->d[m].k->k.p)) > 0)
+                       l = m + 1;
+               else
+                       r = m;
        }
 
-       return bkey_s_c_null;
+       BUG_ON(l < journal_keys->nr &&
+              (cmp_int(id,     journal_keys->d[l].btree_id) ?:
+               cmp_int(level,  journal_keys->d[l].level) ?:
+               bkey_cmp(pos,   journal_keys->d[l].k->k.p)) > 0);
+
+       BUG_ON(l &&
+              (cmp_int(id,     journal_keys->d[l - 1].btree_id) ?:
+               cmp_int(level,  journal_keys->d[l - 1].level) ?:
+               bkey_cmp(pos,   journal_keys->d[l - 1].k->k.p)) <= 0);
+
+       return l < journal_keys->nr ? journal_keys->d + l : NULL;
 }
 
-struct bkey_s_c bch2_journal_iter_next(struct journal_iter *iter)
+static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
 {
-       if (!iter->k)
-               return bkey_s_c_null;
+       if (iter->k &&
+           iter->k < iter->keys->d + iter->keys->nr &&
+           iter->k->btree_id   == iter->btree_id &&
+           iter->k->level      == iter->level)
+               return iter->k->k;
 
-       iter->k++;
-       if (iter->k == iter->keys->d + iter->keys->nr)
-               iter->k = NULL;
+       iter->k = NULL;
+       return NULL;
+}
 
-       return bch2_journal_iter_peek(iter);
+static void bch2_journal_iter_advance(struct journal_iter *iter)
+{
+       if (iter->k)
+               iter->k++;
+}
+
+static void bch2_journal_iter_init(struct journal_iter *iter,
+                                  struct journal_keys *journal_keys,
+                                  enum btree_id id, unsigned level,
+                                  struct bpos pos)
+{
+       iter->btree_id  = id;
+       iter->level     = level;
+       iter->keys      = journal_keys;
+       iter->k         = journal_key_search(journal_keys, id, level, pos);
+}
+
+static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
+{
+       return iter->btree
+               ? bch2_btree_iter_peek(iter->btree)
+               : bch2_btree_node_iter_peek_unpack(&iter->node_iter,
+                                                  iter->b, &iter->unpacked);
+}
+
+static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
+{
+       if (iter->btree)
+               bch2_btree_iter_next(iter->btree);
+       else
+               bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
 }
 
 void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
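journal_key_search() above is a standard lower-bound binary search over keys sorted by (btree_id, level, pos): it returns the first key comparing >= the search tuple, or NULL when none does, and the two BUG_ON()s assert exactly that postcondition. The same pattern in isolation, with simplified illustrative types (not the tree's real ones):

    #include <stddef.h>

    struct jkey { unsigned id, level; unsigned long long pos; };

    static int jkey_cmp(const struct jkey *k, unsigned id, unsigned level,
                        unsigned long long pos)
    {
            if (k->id != id)        return k->id < id ? -1 : 1;
            if (k->level != level)  return k->level < level ? -1 : 1;
            if (k->pos != pos)      return k->pos < pos ? -1 : 1;
            return 0;
    }

    /* Lower bound: index of the first element >= (id, level, pos). */
    static size_t jkey_lower_bound(const struct jkey *d, size_t nr,
                                   unsigned id, unsigned level,
                                   unsigned long long pos)
    {
            size_t l = 0, r = nr;

            while (l < r) {
                    size_t m = l + ((r - l) >> 1);

                    if (jkey_cmp(&d[m], id, level, pos) < 0)
                            l = m + 1;      /* d[m] is below the target */
                    else
                            r = m;
            }
            return l;                       /* == nr when no such element */
    }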
@@ -59,10 +107,10 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
        case none:
                break;
        case btree:
-               bch2_btree_iter_next(iter->btree);
+               bch2_journal_iter_advance_btree(iter);
                break;
        case journal:
-               bch2_journal_iter_next(&iter->journal);
+               bch2_journal_iter_advance(&iter->journal);
                break;
        }
 
@@ -74,14 +122,16 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
        struct bkey_s_c ret;
 
        while (1) {
-               struct bkey_s_c btree_k         = bch2_btree_iter_peek(iter->btree);
-               struct bkey_s_c journal_k       = bch2_journal_iter_peek(&iter->journal);
+               struct bkey_s_c btree_k         =
+                       bch2_journal_iter_peek_btree(iter);
+               struct bkey_s_c journal_k       =
+                       bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal));
 
                if (btree_k.k && journal_k.k) {
                        int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p);
 
                        if (!cmp)
-                               bch2_btree_iter_next(iter->btree);
+                               bch2_journal_iter_advance_btree(iter);
 
                        iter->last = cmp < 0 ? btree : journal;
                } else if (btree_k.k) {
@@ -94,6 +144,14 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
                }
 
                ret = iter->last == journal ? journal_k : btree_k;
+
+               if (iter->b &&
+                   bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) {
+                       iter->journal.k = NULL;
+                       iter->last = none;
+                       return bkey_s_c_null;
+               }
+
                if (!bkey_deleted(ret.k))
                        break;
 
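bch2_btree_and_journal_iter_peek() above is a two-way merge of two position-sorted streams in which the journal shadows the btree: the smaller position wins, and on a tie the btree key is consumed so the newer journal key is returned instead; deleted keys are skipped and, when walking a single node, iteration stops at max_key. The shadowing merge step reduced to a self-contained sketch (types and values are illustrative):

    #include <stdio.h>

    struct kv { int pos; const char *val; };

    /* Merge step: return the entry with the smaller position; on a tie
     * the btree entry is consumed so the journal entry shadows it. The
     * returned entry is consumed from its stream. */
    static const struct kv *overlay_next(const struct kv *bt, size_t nbt, size_t *ib,
                                         const struct kv *jn, size_t njn, size_t *ij)
    {
            const struct kv *b = *ib < nbt ? &bt[*ib] : NULL;
            const struct kv *j = *ij < njn ? &jn[*ij] : NULL;

            if (b && j && b->pos == j->pos) {
                    (*ib)++;                        /* journal wins the tie */
                    b = *ib < nbt ? &bt[*ib] : NULL;
            }

            if (b && (!j || b->pos < j->pos)) {
                    (*ib)++;
                    return b;
            }
            if (j) {
                    (*ij)++;
                    return j;
            }
            return NULL;
    }

    int main(void)
    {
            const struct kv bt[] = {{1, "btree"}, {2, "btree"}, {4, "btree"}};
            const struct kv jn[] = {{2, "journal"}, {3, "journal"}};
            size_t ib = 0, ij = 0;
            const struct kv *k;

            while ((k = overlay_next(bt, 3, &ib, jn, 2, &ij)))
                    printf("%d -> %s\n", k->pos, k->val);   /* 2 comes from journal */
            return 0;
    }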
@@ -110,41 +168,32 @@ struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *
        return bch2_btree_and_journal_iter_peek(iter);
 }
 
-struct journal_key *journal_key_search(struct journal_keys *journal_keys,
-                                      enum btree_id id, struct bpos pos)
-{
-       size_t l = 0, r = journal_keys->nr, m;
-
-       while (l < r) {
-               m = l + ((r - l) >> 1);
-               if ((cmp_int(id, journal_keys->d[m].btree_id) ?:
-                    bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0)
-                       l = m + 1;
-               else
-                       r = m;
-       }
-
-       BUG_ON(l < journal_keys->nr &&
-              (cmp_int(id, journal_keys->d[l].btree_id) ?:
-               bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0);
-
-       BUG_ON(l &&
-              (cmp_int(id, journal_keys->d[l - 1].btree_id) ?:
-               bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0);
-
-       return l < journal_keys->nr ? journal_keys->d + l : NULL;
-}
-
 void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter,
                                      struct btree_trans *trans,
                                      struct journal_keys *journal_keys,
                                      enum btree_id id, struct bpos pos)
 {
-       iter->journal.keys      = journal_keys;
-       iter->journal.k         = journal_key_search(journal_keys, id, pos);
-       iter->journal.btree_id  = id;
+       memset(iter, 0, sizeof(*iter));
 
        iter->btree = bch2_trans_get_iter(trans, id, pos, 0);
+       bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos);
+}
+
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+                                               struct journal_keys *journal_keys,
+                                               struct btree *b)
+{
+       struct bpos start = b->data->min_key;
+
+       if (btree_node_type_is_extents(b->btree_id))
+               start = bkey_successor(start);
+
+       memset(iter, 0, sizeof(*iter));
+
+       iter->b = b;
+       bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b);
+       bch2_journal_iter_init(&iter->journal, journal_keys,
+                              b->btree_id, b->level, start);
 }
 
 /* sort and dedup all keys in the journal: */
@@ -169,7 +218,8 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
        const struct journal_key *l = _l;
        const struct journal_key *r = _r;
 
-       return cmp_int(l->btree_id, r->btree_id) ?:
+       return  cmp_int(l->btree_id,    r->btree_id) ?:
+               cmp_int(l->level,       r->level) ?:
                bkey_cmp(l->k->k.p, r->k->k.p) ?:
                cmp_int(l->journal_seq, r->journal_seq) ?:
                cmp_int(l->journal_offset, r->journal_offset);
@@ -180,9 +230,10 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
        const struct journal_key *l = _l;
        const struct journal_key *r = _r;
 
-       return cmp_int(l->journal_seq, r->journal_seq) ?:
-               cmp_int(l->btree_id, r->btree_id) ?:
-               bkey_cmp(l->k->k.p, r->k->k.p);
+       return  cmp_int(r->level,       l->level) ?:
+               cmp_int(l->journal_seq, r->journal_seq) ?:
+               cmp_int(l->btree_id,    r->btree_id) ?:
+               bkey_cmp(l->k->k.p,     r->k->k.p);
 }
 
 static void journal_keys_free(struct journal_keys *keys)
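Note the reversed first term in journal_sort_seq_cmp() above: cmp_int(r->level, l->level) sorts descending by level, so every interior-node update is replayed before any leaf update, and journal-sequence order only applies within a level. For example, keys (level 1, seq 9), (level 0, seq 3), (level 0, seq 5) replay in exactly that order. A self-contained comparator mirroring the chain (cmp_int and ?: chaining are the kernel/GNU C idioms used throughout this tree):

    #include <stdlib.h>

    struct jk { unsigned level; unsigned long long seq; };

    #define cmp_int(a, b)   ((a) > (b) ? 1 : (a) < (b) ? -1 : 0)

    /* Interior levels first (descending), then journal sequence order. */
    static int replay_cmp(const void *l_, const void *r_)
    {
            const struct jk *l = l_, *r = r_;

            return cmp_int(r->level, l->level) ?: cmp_int(l->seq, r->seq);
    }

    /* usage: qsort(keys, nr, sizeof(keys[0]), replay_cmp); */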
@@ -218,6 +269,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
                for_each_jset_key(k, _n, entry, &p->j)
                        keys.d[keys.nr++] = (struct journal_key) {
                                .btree_id       = entry->btree_id,
+                               .level          = entry->level,
                                .k              = k,
                                .journal_seq    = le64_to_cpu(p->j.seq) -
                                        keys.journal_seq_base,
@@ -229,7 +281,8 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
        src = dst = keys.d;
        while (src < keys.d + keys.nr) {
                while (src + 1 < keys.d + keys.nr &&
-                      src[0].btree_id == src[1].btree_id &&
+                      src[0].btree_id  == src[1].btree_id &&
+                      src[0].level     == src[1].level &&
                       !bkey_cmp(src[0].k->k.p, src[1].k->k.p))
                        src++;
 
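The compaction loop above relies on journal_sort_key_cmp() having placed duplicates of the same (btree_id, level, pos) adjacent, ordered oldest-to-newest by journal_seq/journal_offset: the inner while skips past the older copies, and only the newest survives into dst. The same keep-the-last dedup pattern in isolation:

    #include <stddef.h>

    /* In-place dedup of a sorted array, keeping the last element of each
     * run of equals (the newest, given the sort order described above). */
    static size_t dedup_keep_last(int *d, size_t nr)
    {
            int *src = d, *dst = d;

            while (src < d + nr) {
                    while (src + 1 < d + nr && src[0] == src[1])
                            src++;          /* skip older duplicates */
                    *dst++ = *src++;        /* copy the newest */
            }
            return dst - d;                 /* new length */
    }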
@@ -351,12 +404,15 @@ err:
 }
 
 static int __bch2_journal_replay_key(struct btree_trans *trans,
-                                    enum btree_id id, struct bkey_i *k)
+                                    enum btree_id id, unsigned level,
+                                    struct bkey_i *k)
 {
        struct btree_iter *iter;
        int ret;
 
-       iter = bch2_trans_get_iter(trans, id, k->k.p, BTREE_ITER_INTENT);
+       iter = bch2_trans_get_node_iter(trans, id, k->k.p,
+                                       BTREE_MAX_DEPTH, level,
+                                       BTREE_ITER_INTENT);
        if (IS_ERR(iter))
                return PTR_ERR(iter);
 
@@ -375,13 +431,13 @@ static int __bch2_journal_replay_key(struct btree_trans *trans,
 }
 
 static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id,
-                                  struct bkey_i *k)
+                                  unsigned level, struct bkey_i *k)
 {
        return bch2_trans_do(c, NULL, NULL,
                             BTREE_INSERT_NOFAIL|
                             BTREE_INSERT_LAZY_RW|
                             BTREE_INSERT_JOURNAL_REPLAY,
-                            __bch2_journal_replay_key(&trans, id, k));
+                            __bch2_journal_replay_key(&trans, id, level, k));
 }
 
 static int bch2_journal_replay(struct bch_fs *c,
@@ -393,15 +449,20 @@ static int bch2_journal_replay(struct bch_fs *c,
 
        sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL);
 
+       replay_now_at(j, keys.journal_seq_base);
+
        for_each_journal_key(keys, i) {
-               replay_now_at(j, keys.journal_seq_base + i->journal_seq);
+               if (!i->level)
+                       replay_now_at(j, keys.journal_seq_base + i->journal_seq);
 
+               if (i->level)
+                       ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
-               if (i->btree_id == BTREE_ID_ALLOC)
+               else if (i->btree_id == BTREE_ID_ALLOC)
                        ret = bch2_alloc_replay_key(c, i->k);
                else if (i->k->k.size)
                        ret = bch2_extent_replay_key(c, i->btree_id, i->k);
                else
-                       ret = bch2_journal_replay_key(c, i->btree_id, i->k);
+                       ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
 
                if (ret) {
                        bch_err(c, "journal replay: error %d while replaying key",
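Replay order in the loop above: interior-node keys sort first (see journal_sort_seq_cmp()) and are replayed while the journal is still pinned at journal_seq_base; replay_now_at() only starts advancing once level-0 keys appear. The dispatch must be a single if/else chain (corrected above): without the else, an interior key would fall through and be replayed a second time by the final branch. The intended control flow as a self-contained sketch (the replay_* helpers are stand-ins, not the tree's real API):

    enum btree_id { BTREE_ID_ALLOC = 4 };   /* illustrative value */

    struct jkey { unsigned level; enum btree_id btree_id; unsigned size; };

    static int replay_node(struct jkey *k)   { (void) k; return 0; }
    static int replay_alloc(struct jkey *k)  { (void) k; return 0; }
    static int replay_extent(struct jkey *k) { (void) k; return 0; }
    static int replay_leaf(struct jkey *k)   { (void) k; return 0; }

    /* Each key is replayed exactly once, by exactly one helper. */
    static int replay_one(struct jkey *i)
    {
            if (i->level)
                    return replay_node(i);   /* interior node update */
            else if (i->btree_id == BTREE_ID_ALLOC)
                    return replay_alloc(i);
            else if (i->size)
                    return replay_extent(i); /* leaf extent */
            else
                    return replay_leaf(i);   /* ordinary leaf key */
    }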
@@ -864,7 +925,7 @@ int bch2_fs_recovery(struct bch_fs *c)
                 */
                bch_info(c, "starting metadata mark and sweep");
                err = "error in mark and sweep";
-               ret = bch2_gc(c, NULL, true, true);
+               ret = bch2_gc(c, &journal_keys, true, true);
                if (ret)
                        goto err;
                bch_verbose(c, "mark and sweep done");
index c913093015635a3dde895a9ee629389551779ea6..fa1f2818817d4b10cfd305275c554a773fa92c42 100644 (file)
@@ -5,6 +5,7 @@
 struct journal_keys {
        struct journal_key {
                enum btree_id   btree_id:8;
+               unsigned        level:8;
                struct bkey_i   *k;
                u32             journal_seq;
                u32             journal_offset;
@@ -17,15 +18,23 @@ struct journal_keys {
        for (i = (keys).d; i < (keys).d + (keys).nr; (i)++)
 
 struct journal_iter {
+       enum btree_id           btree_id;
+       unsigned                level;
        struct journal_keys     *keys;
        struct journal_key      *k;
-       enum btree_id           btree_id;
 };
 
-struct btree_and_journal_iter {
-       enum btree_id           btree_id;
+/*
+ * Iterate over keys in the btree, with keys from the journal overlaid on top:
+ */
 
+struct btree_and_journal_iter {
        struct btree_iter       *btree;
+
+       struct btree            *b;
+       struct btree_node_iter  node_iter;
+       struct bkey             unpacked;
+
        struct journal_iter     journal;
 
        enum last_key_returned {
@@ -38,12 +47,14 @@ struct btree_and_journal_iter {
 void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
 struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
 struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *);
-struct journal_key *journal_key_search(struct journal_keys *,
-                                      enum btree_id, struct bpos);
+
 void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *,
                                      struct btree_trans *,
                                      struct journal_keys *,
                                      enum btree_id, struct bpos);
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
+                                               struct journal_keys *,
+                                               struct btree *);
 
 int bch2_fs_recovery(struct bch_fs *);
 int bch2_fs_initialize(struct bch_fs *);
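The header now offers two ways to build the overlay iterator: positioned within a whole btree via a btree_trans (as before), or over a single in-memory node, which is what recovery-time marking needs before the transaction machinery is usable. A plausible usage sketch for the node form (the surrounding setup of b and journal_keys is assumed, not shown):

    struct btree_and_journal_iter iter;
    struct bkey_s_c k;

    bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b);

    while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
            /* k comes from the node or from the journal, whichever is newer */
            bch2_btree_and_journal_iter_advance(&iter);
    }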
index e3cb08d8c976c04dc9bca712c051d8bfa7d8d8d7..6596764c84215eecb1650fdb2ca4ccc39f216e26 100644 (file)
@@ -958,6 +958,7 @@ int bch2_fs_mark_dirty(struct bch_fs *c)
        c->disk_sb.sb->compat[0] &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA);
        c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite;
        c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates;
+       c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled;
        ret = bch2_write_super(c);
        mutex_unlock(&c->sb_lock);
 
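BCH_FEATURE_btree_updates_journalled follows the same discipline as extents_above_btree_updates: it is set while the filesystem is dirty (interior-node updates may exist only in the journal) and cleared again on clean unmount in the hunk below, so older implementations are only prevented from mounting a dirty filesystem they cannot replay. A sketch of the read-side gate under that assumption (the helper shape is illustrative, not this tree's API; the bit number 13 comes from bcachefs_format.h above):

    #include <errno.h>

    #define BCH_FEATURE_btree_updates_journalled 13

    /* Refuse replay when the journal may contain interior-node updates
     * and this implementation predates them. */
    static int check_replay_features(unsigned long long features,
                                     int have_btree_updates_journalled)
    {
            if ((features & (1ULL << BCH_FEATURE_btree_updates_journalled)) &&
                !have_btree_updates_journalled)
                    return -EINVAL;
            return 0;
    }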
@@ -1089,6 +1090,7 @@ void bch2_fs_mark_clean(struct bch_fs *c)
        c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
        c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA;
        c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates);
+       c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled);
 
        u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved;