Update bcachefs sources to 14e9ac5016 bcachefs: btree_iter fastpath

diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index b56b17350d3e9d7aa1e05e9f4b105fc78a1ca7ec..eeb546efd2754f465862106ed4d83e7c1c5323e1 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -56,9 +56,9 @@ static void btree_bounce_free(struct bch_fs *c, unsigned order,
                              bool used_mempool, void *p)
 {
        if (used_mempool)
-               mempool_free(virt_to_page(p), &c->btree_bounce_pool);
+               mempool_free(p, &c->btree_bounce_pool);
        else
-               free_pages((unsigned long) p, order);
+               vpfree(p, PAGE_SIZE << order);
 }
 
 static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
@@ -66,7 +66,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
 {
        void *p;
 
-       BUG_ON(1 << order > btree_pages(c));
+       BUG_ON(order > btree_page_order(c));
 
        *used_mempool = false;
        p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order);
@@ -74,7 +74,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
                return p;
 
        *used_mempool = true;
-       return page_address(mempool_alloc(&c->btree_bounce_pool, GFP_NOIO));
+       return mempool_alloc(&c->btree_bounce_pool, GFP_NOIO);
 }
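The two bounce-buffer helpers above now pass mempool allocations around directly instead of converting through struct page. A minimal caller sketch (illustrative only), mirroring how the "sorted" buffer is used later in bch2_btree_node_read_done(): the order and used_mempool values handed to btree_bounce_free() must be the ones that came back from btree_bounce_alloc().

	bool used_mempool;
	void *buf = btree_bounce_alloc(c, btree_page_order(c), &used_mempool);

	/* ... fill buf, e.g. with sorted keys ... */

	btree_bounce_free(c, btree_page_order(c), used_mempool, buf);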
 
 typedef int (*sort_cmp_fn)(struct btree *,
@@ -872,32 +872,57 @@ static void bset_encrypt(struct bch_fs *c, struct bset *i, struct nonce nonce)
                    vstruct_end(i) - (void *) i->_data);
 }
 
-#define btree_node_error(b, c, ptr, fmt, ...)                          \
-       bch2_fs_inconsistent(c,                                         \
-               "btree node error at btree %u level %u/%u bucket %zu block %u u64s %u: " fmt,\
-               (b)->btree_id, (b)->level, btree_node_root(c, b)        \
-                           ? btree_node_root(c, b)->level : -1,        \
-               PTR_BUCKET_NR(ca, ptr), (b)->written,                   \
-               le16_to_cpu((i)->u64s), ##__VA_ARGS__)
-
-static const char *validate_bset(struct bch_fs *c, struct btree *b,
-                                struct bch_dev *ca,
-                                const struct bch_extent_ptr *ptr,
-                                struct bset *i, unsigned sectors,
-                                unsigned *whiteout_u64s)
+#define btree_node_error(c, b, ptr, msg, ...)                          \
+do {                                                                   \
+       if (write == READ &&                                            \
+           !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {             \
+               mustfix_fsck_err(c,                                     \
+                        "btree node read error at btree %u level %u/%u\n"\
+                       "sector %llu node offset %u bset u64s %u: " msg,\
+                       (b)->btree_id, (b)->level,                      \
+                       (c)->btree_roots[(b)->btree_id].level,          \
+                       (u64) ptr->offset, (b)->written,                \
+                       le16_to_cpu((i)->u64s), ##__VA_ARGS__);         \
+       } else {                                                        \
+               bch_err(c, "%s at btree %u level %u/%u\n"               \
+                       "sector %llu node offset %u bset u64s %u: " msg,\
+                       write == WRITE                                  \
+                       ? "corrupt metadata in btree node write"        \
+                       : "btree node error",                           \
+                       (b)->btree_id, (b)->level,                      \
+                       (c)->btree_roots[(b)->btree_id].level,          \
+                       (u64) ptr->offset, (b)->written,                \
+                       le16_to_cpu((i)->u64s), ##__VA_ARGS__);         \
+               ret = BCH_FSCK_ERRORS_NOT_FIXED;                        \
+               goto fsck_err;                                          \
+       }                                                               \
+} while (0)
+
+static int validate_bset(struct bch_fs *c, struct btree *b,
+                        const struct bch_extent_ptr *ptr,
+                        struct bset *i, unsigned sectors,
+                        unsigned *whiteout_u64s,
+                        int write)
 {
        struct bkey_packed *k, *prev = NULL;
        struct bpos prev_pos = POS_MIN;
        bool seen_non_whiteout = false;
+       int ret = 0;
 
-       if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION)
-               return "unsupported bset version";
+       if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION) {
+               btree_node_error(c, b, ptr, "unsupported bset version");
+               i->u64s = 0;
+               return 0;
+       }
 
-       if (b->written + sectors > c->sb.btree_node_size)
-               return  "bset past end of btree node";
+       if (b->written + sectors > c->sb.btree_node_size) {
+               btree_node_error(c, b, ptr, "bset past end of btree node");
+               i->u64s = 0;
+               return 0;
+       }
 
-       if (i != &b->data->keys && !i->u64s)
-               btree_node_error(b, c, ptr, "empty set");
+       if (b->written && !i->u64s)
+               btree_node_error(c, b, ptr, "empty set");
 
        if (!BSET_SEPARATE_WHITEOUTS(i)) {
                seen_non_whiteout = true;
@@ -911,7 +936,7 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b,
                const char *invalid;
 
                if (!k->u64s) {
-                       btree_node_error(b, c, ptr,
+                       btree_node_error(c, b, ptr,
                                "KEY_U64s 0: %zu bytes of metadata lost",
                                vstruct_end(i) - (void *) k);
 
@@ -920,7 +945,7 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b,
                }
 
                if (bkey_next(k) > vstruct_last(i)) {
-                       btree_node_error(b, c, ptr,
+                       btree_node_error(c, b, ptr,
                                         "key extends past end of bset");
 
                        i->u64s = cpu_to_le16((u64 *) k - i->_data);
@@ -928,7 +953,7 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b,
                }
 
                if (k->format > KEY_FORMAT_CURRENT) {
-                       btree_node_error(b, c, ptr,
+                       btree_node_error(c, b, ptr,
                                         "invalid bkey format %u", k->format);
 
                        i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
@@ -947,8 +972,8 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b,
                        char buf[160];
 
                        bch2_bkey_val_to_text(c, btree_node_type(b),
-                                            buf, sizeof(buf), u);
-                       btree_node_error(b, c, ptr,
+                                             buf, sizeof(buf), u);
+                       btree_node_error(c, b, ptr,
                                         "invalid bkey %s: %s", buf, invalid);
 
                        i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
@@ -969,7 +994,7 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b,
                        *whiteout_u64s = k->_data - i->_data;
                        seen_non_whiteout = true;
                } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) {
-                       btree_node_error(b, c, ptr,
+                       btree_node_error(c, b, ptr,
                                         "keys out of order: %llu:%llu > %llu:%llu",
                                         prev_pos.inode,
                                         prev_pos.offset,
@@ -984,7 +1009,8 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b,
        }
 
        SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
-       return NULL;
+fsck_err:
+       return ret;
 }
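The rewritten btree_node_error() is a statement macro: on a read before initial GC has finished it reports through mustfix_fsck_err() so the damage can be repaired, otherwise it logs and bails out with BCH_FSCK_ERRORS_NOT_FIXED. It therefore depends on names in the enclosing function: a local write (READ or WRITE), an int ret, the bset pointer i, and an fsck_err: label. A minimal skeleton of a checker using it, modelled on validate_bset() above (the function name is illustrative only):

	static int example_validate(struct bch_fs *c, struct btree *b,
				    const struct bch_extent_ptr *ptr,
				    struct bset *i, int write)
	{
		int ret = 0;

		if (b->written && !i->u64s)
			btree_node_error(c, b, ptr, "empty set");
	fsck_err:
		return ret;
	}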
 
 static bool extent_contains_ptr(struct bkey_s_c_extent e,
@@ -999,6 +1025,17 @@ static bool extent_contains_ptr(struct bkey_s_c_extent e,
        return false;
 }
 
+static void bch2_btree_node_read_complete(struct btree_read_bio *rb,
+                                         struct btree *b)
+{
+       struct bch_dev *ca = rb->pick.ca;
+
+       bio_put(&rb->bio);
+       percpu_ref_put(&ca->io_ref);
+       clear_btree_node_read_in_flight(b);
+       wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
+}
+
 void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b,
                              struct bch_dev *ca,
                              const struct bch_extent_ptr *ptr)
@@ -1012,7 +1049,7 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b,
        const char *err;
        struct bch_csum csum;
        struct nonce nonce;
-       int ret;
+       int ret, write = READ;
 
        iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
        __bch2_btree_node_iter_init(iter, btree_node_is_extents(b));
@@ -1115,9 +1152,10 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b,
                        sectors = vstruct_sectors(bne, c->block_bits);
                }
 
-               err = validate_bset(c, b, ca, ptr, i, sectors, &whiteout_u64s);
-               if (err)
-                       goto err;
+               ret = validate_bset(c, b, ptr, i, sectors,
+                                   &whiteout_u64s, READ);
+               if (ret)
+                       goto fsck_err;
 
                b->written += sectors;
 
@@ -1145,7 +1183,7 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b,
                if (bne->keys.seq == b->data->keys.seq)
                        goto err;
 
-       sorted = btree_bounce_alloc(c, ilog2(btree_pages(c)), &used_mempool);
+       sorted = btree_bounce_alloc(c, btree_page_order(c), &used_mempool);
        sorted->keys.u64s = 0;
 
        b->nr = btree_node_is_extents(b)
@@ -1161,7 +1199,7 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b,
 
        BUG_ON(b->nr.live_u64s != u64s);
 
-       btree_bounce_free(c, ilog2(btree_pages(c)), used_mempool, sorted);
+       btree_bounce_free(c, btree_page_order(c), used_mempool, sorted);
 
        bch2_bset_build_aux_tree(b, b->set, false);
 
@@ -1172,16 +1210,49 @@ out:
        mempool_free(iter, &c->fill_iter);
        return;
 err:
+       btree_node_error(c, b, ptr, "%s", err);
+fsck_err:
+       bch2_inconsistent_error(c);
        set_btree_node_read_error(b);
-       btree_node_error(b, c, ptr, "%s", err);
        goto out;
 }
 
-void bch2_btree_node_read(struct bch_fs *c, struct btree *b)
+static void btree_node_read_work(struct work_struct *work)
+{
+       struct btree_read_bio *rb =
+               container_of(work, struct btree_read_bio, work);
+
+       bch2_btree_node_read_done(rb->c, rb->bio.bi_private,
+                                 rb->pick.ca, &rb->pick.ptr);
+       bch2_btree_node_read_complete(rb, rb->bio.bi_private);
+}
+
+static void btree_node_read_endio(struct bio *bio)
+{
+       struct btree *b = bio->bi_private;
+       struct btree_read_bio *rb =
+               container_of(bio, struct btree_read_bio, bio);
+
+       if (bch2_dev_fatal_io_err_on(bio->bi_error,
+                       rb->pick.ca, "IO error reading bucket %zu",
+                       PTR_BUCKET_NR(rb->pick.ca, &rb->pick.ptr)) ||
+           bch2_meta_read_fault("btree")) {
+               set_btree_node_read_error(b);
+               bch2_btree_node_read_complete(rb, rb->bio.bi_private);
+               return;
+       }
+
+       INIT_WORK(&rb->work, btree_node_read_work);
+       schedule_work(&rb->work);
+}
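/*
 * Note on the split above: btree_node_read_endio() runs in bio completion
 * (typically interrupt) context, so on success it only schedules
 * btree_node_read_work(); bch2_btree_node_read_done() has to run from
 * process context since it can sleep (it starts with a GFP_NOIO
 * mempool_alloc() of the fill iterator).
 */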
+
+void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
+                         bool sync)
 {
        uint64_t start_time = local_clock();
-       struct bio *bio;
        struct extent_pick_ptr pick;
+       struct btree_read_bio *rb;
+       struct bio *bio;
 
        trace_btree_read(c, b);
 
@@ -1193,27 +1264,37 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b)
        }
 
        bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio);
+       rb = container_of(bio, struct btree_read_bio, bio);
+       rb->c                   = c;
+       rb->pick                = pick;
+       bio->bi_opf             = REQ_OP_READ|REQ_SYNC|REQ_META;
        bio->bi_bdev            = pick.ca->disk_sb.bdev;
        bio->bi_iter.bi_sector  = pick.ptr.offset;
        bio->bi_iter.bi_size    = btree_bytes(c);
-       bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC);
        bch2_bio_map(bio, b->data);
 
-       submit_bio_wait(bio);
+       set_btree_node_read_in_flight(b);
 
-       if (bch2_dev_fatal_io_err_on(bio->bi_error,
-                                 pick.ca, "IO error reading bucket %zu",
-                                 PTR_BUCKET_NR(pick.ca, &pick.ptr)) ||
-           bch2_meta_read_fault("btree")) {
-               set_btree_node_read_error(b);
-               goto out;
-       }
+       if (sync) {
+               submit_bio_wait(bio);
+
+               if (bch2_dev_fatal_io_err_on(bio->bi_error,
+                               pick.ca, "IO error reading bucket %zu",
+                               PTR_BUCKET_NR(pick.ca, &pick.ptr)) ||
+                   bch2_meta_read_fault("btree")) {
+                       set_btree_node_read_error(b);
+                       goto out;
+               }
 
-       bch2_btree_node_read_done(c, b, pick.ca, &pick.ptr);
-       bch2_time_stats_update(&c->btree_read_time, start_time);
+               bch2_btree_node_read_done(c, b, pick.ca, &pick.ptr);
+               bch2_time_stats_update(&c->btree_read_time, start_time);
 out:
-       bio_put(bio);
-       percpu_ref_put(&pick.ca->io_ref);
+               bch2_btree_node_read_complete(rb, b);
+       } else {
+               bio->bi_end_io  = btree_node_read_endio;
+               bio->bi_private = b;
+               submit_bio(bio);
+       }
 }
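bch2_btree_node_read() now takes a sync flag; the synchronous case is used by bch2_btree_root_read() in the next hunk. A hypothetical asynchronous caller might look roughly like the sketch below (the function name, the wait_on_bit() wait on the read_in_flight bit, and the -EIO mapping are assumptions for illustration, not taken from this diff):

	static int example_read_async(struct bch_fs *c, struct btree *b)
	{
		bch2_btree_node_read(c, b, false);

		/* ... do other work while the read is in flight ... */

		wait_on_bit(&b->flags, BTREE_NODE_read_in_flight,
			    TASK_UNINTERRUPTIBLE);

		return btree_node_read_error(b) ? -EIO : 0;
	}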
 
 int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
@@ -1238,7 +1319,7 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id,
        bkey_copy(&b->key, k);
        BUG_ON(bch2_btree_node_hash_insert(c, b, level, id));
 
-       bch2_btree_node_read(c, b);
+       bch2_btree_node_read(c, b, true);
        six_unlock_write(&b->lock);
 
        if (btree_node_read_error(b)) {
@@ -1263,57 +1344,123 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
 {
        struct btree_write *w = btree_prev_write(b);
 
-       /*
-        * Before calling bch2_btree_complete_write() - if the write errored, we
-        * have to halt new journal writes before they see this btree node
-        * write as completed:
-        */
-       if (btree_node_write_error(b))
-               bch2_journal_halt(&c->journal);
-
        bch2_btree_complete_write(c, b, w);
        btree_node_io_unlock(b);
 }
 
-static void btree_node_write_endio(struct bio *bio)
+static void bch2_btree_node_write_error(struct bch_fs *c,
+                                       struct bch_write_bio *wbio)
 {
-       struct btree *b = bio->bi_private;
-       struct bch_write_bio *wbio = to_wbio(bio);
-       struct bch_fs *c        = wbio->c;
-       struct bio *orig        = wbio->split ? wbio->orig : NULL;
-       struct closure *cl      = !wbio->split ? wbio->cl : NULL;
-       struct bch_dev *ca      = wbio->ca;
+       struct btree *b         = wbio->bio.bi_private;
+       struct closure *cl      = wbio->cl;
+       __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+       struct bkey_i_extent *new_key;
+
+       bkey_copy(&tmp.k, &b->key);
+       new_key = bkey_i_to_extent(&tmp.k);
 
-       if (bch2_dev_fatal_io_err_on(bio->bi_error, ca, "btree write") ||
+       while (wbio->replicas_failed) {
+               unsigned idx = __fls(wbio->replicas_failed);
+
+               bch2_extent_drop_ptr_idx(extent_i_to_s(new_key), idx);
+               wbio->replicas_failed ^= 1 << idx;
+       }
+
+       if (!bch2_extent_nr_ptrs(extent_i_to_s_c(new_key)) ||
+           bch2_btree_node_update_key(c, b, new_key)) {
+               set_btree_node_noevict(b);
+               bch2_fatal_error(c);
+       }
+
+       bio_put(&wbio->bio);
+       btree_node_write_done(c, b);
+       if (cl)
+               closure_put(cl);
+}
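/*
 * Worked example of the loop above, with a hypothetical mask: if
 * replicas_failed == 0x5, the first iteration drops pointer index 2
 * (__fls(0x5) == 2) and clears bit 2, the second drops index 0, and the loop
 * exits with the key containing only the replicas that were written
 * successfully.  If no pointers are left, or updating the key fails, the
 * node is marked noevict and bch2_fatal_error() is raised.
 */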
+
+void bch2_btree_write_error_work(struct work_struct *work)
+{
+       struct bch_fs *c = container_of(work, struct bch_fs,
+                                       btree_write_error_work);
+       struct bio *bio;
+
+       while (1) {
+               spin_lock_irq(&c->read_retry_lock);
+               bio = bio_list_pop(&c->read_retry_list);
+               spin_unlock_irq(&c->read_retry_lock);
+
+               if (!bio)
+                       break;
+
+               bch2_btree_node_write_error(c, to_wbio(bio));
+       }
+}
+
+static void btree_node_write_endio(struct bio *bio)
+{
+       struct btree *b                 = bio->bi_private;
+       struct bch_write_bio *wbio      = to_wbio(bio);
+       struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
+       struct bch_write_bio *orig      = parent ?: wbio;
+       struct closure *cl              = !wbio->split ? wbio->cl : NULL;
+       struct bch_fs *c                = wbio->c;
+       struct bch_dev *ca              = wbio->ca;
+
+       if (bch2_dev_nonfatal_io_err_on(bio->bi_error, ca, "btree write") ||
            bch2_meta_write_fault("btree"))
-               set_btree_node_write_error(b);
+               set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed);
 
        if (wbio->have_io_ref)
                percpu_ref_put(&ca->io_ref);
 
-       if (wbio->bounce)
-               btree_bounce_free(c,
-                       wbio->order,
-                       wbio->used_mempool,
-                       page_address(bio->bi_io_vec[0].bv_page));
-
-       if (wbio->put_bio)
+       if (parent) {
                bio_put(bio);
+               bio_endio(&parent->bio);
+               return;
+       }
 
-       if (orig) {
-               bio_endio(orig);
-       } else {
-               btree_node_write_done(c, b);
-               if (cl)
-                       closure_put(cl);
+       btree_bounce_free(c,
+               wbio->order,
+               wbio->used_mempool,
+               wbio->data);
+
+       if (wbio->replicas_failed) {
+               unsigned long flags;
+
+               spin_lock_irqsave(&c->btree_write_error_lock, flags);
+               bio_list_add(&c->read_retry_list, &wbio->bio);
+               spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+               queue_work(c->wq, &c->btree_write_error_work);
+               return;
        }
+
+       bio_put(bio);
+       btree_node_write_done(c, b);
+       if (cl)
+               closure_put(cl);
+}
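/*
 * Summary of the error path above: a failing replica only sets its ptr_idx
 * bit in the original wbio's replicas_failed instead of failing the node
 * outright.  When the write completes, the bounce buffer is freed and, if
 * any replica failed, the bio is queued and bch2_btree_write_error_work()
 * is scheduled to drop the failed pointers from the node's key in process
 * context.
 */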
+
+static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
+                                  struct bset *i, unsigned sectors)
+{
+       const struct bch_extent_ptr *ptr;
+       unsigned whiteout_u64s = 0;
+       int ret;
+
+       extent_for_each_ptr(bkey_i_to_s_c_extent(&b->key), ptr)
+               break;
+
+       ret = validate_bset(c, b, ptr, i, sectors, &whiteout_u64s, WRITE);
+       if (ret)
+               bch2_inconsistent_error(c);
+
+       return ret;
 }
 
 void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                            struct closure *parent,
                            enum six_lock_type lock_type_held)
 {
-       struct bio *bio;
        struct bch_write_bio *wbio;
        struct bset_tree *t;
        struct bset *i;
@@ -1343,18 +1490,24 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                if (!(old & (1 << BTREE_NODE_dirty)))
                        return;
 
+               if (b->written &&
+                   !btree_node_may_write(b))
+                       return;
+
                if (old & (1 << BTREE_NODE_write_in_flight)) {
                        btree_node_wait_on_io(b);
                        continue;
                }
 
                new &= ~(1 << BTREE_NODE_dirty);
+               new &= ~(1 << BTREE_NODE_need_write);
                new |=  (1 << BTREE_NODE_write_in_flight);
                new |=  (1 << BTREE_NODE_just_written);
                new ^=  (1 << BTREE_NODE_write_idx);
        } while (cmpxchg_acquire(&b->flags, old, new) != old);
 
        BUG_ON(!list_empty(&b->write_blocked));
+       BUG_ON((b->will_make_reachable != NULL) != !b->written);
 
        BUG_ON(b->written >= c->sb.btree_node_size);
        BUG_ON(bset_written(b, btree_bset_last(b)));
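The dirty -> write_in_flight transition a few lines up is done with a lock-free cmpxchg loop on b->flags. A generic sketch of that pattern, assuming b->flags is an unsigned long and that the loop re-reads it at the top of every retry (the re-read is outside this hunk):

	unsigned long old, new;

	do {
		old = new = READ_ONCE(b->flags);

		if (!(old & (1 << BTREE_NODE_dirty)))
			return;			/* nothing to write */

		new &= ~(1 << BTREE_NODE_dirty);
		new |=  (1 << BTREE_NODE_write_in_flight);
	} while (cmpxchg_acquire(&b->flags, old, new) != old);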
@@ -1430,13 +1583,17 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 
        clear_needs_whiteout(i);
 
-       if (b->written && !i->u64s) {
-               /* Nothing to write: */
-               btree_bounce_free(c, order, used_mempool, data);
-               btree_node_write_done(c, b);
-               return;
-       }
+       /* do we have data to write? */
+       if (b->written && !i->u64s)
+               goto nowrite;
 
+       bytes_to_write = vstruct_end(i) - data;
+       sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
+
+       memset(data + bytes_to_write, 0,
+              (sectors_to_write << 9) - bytes_to_write);
+
+       BUG_ON(b->written + sectors_to_write > c->sb.btree_node_size);
        BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN);
        BUG_ON(i->seq != b->data->keys.seq);
 
@@ -1445,6 +1602,11 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 
        nonce = btree_nonce(b, i, b->written << 9);
 
+       /* if we're going to be encrypting, check metadata validity first: */
+       if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
+           validate_bset_for_write(c, b, i, sectors_to_write))
+               goto err;
+
        if (bn) {
                bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce,
                            &bn->flags,
@@ -1464,15 +1626,10 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
        }
 
-       bytes_to_write = vstruct_end(i) - data;
-       sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9;
-
-       memset(data + bytes_to_write, 0,
-              (sectors_to_write << 9) - bytes_to_write);
-
-       BUG_ON(b->written + sectors_to_write > c->sb.btree_node_size);
-
-       trace_btree_write(b, bytes_to_write, sectors_to_write);
+       /* if we're not encrypting, check metadata after checksumming: */
+       if (!bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i)) &&
+           validate_bset_for_write(c, b, i, sectors_to_write))
+               goto err;
 
        /*
         * We handle btree write errors by immediately halting the journal -
@@ -1488,32 +1645,25 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
         * break:
         */
        if (bch2_journal_error(&c->journal) ||
-           c->opts.nochanges) {
-               set_btree_node_noevict(b);
-               b->written += sectors_to_write;
-
-               btree_bounce_free(c, order, used_mempool, data);
-               btree_node_write_done(c, b);
-               return;
-       }
+           c->opts.nochanges)
+               goto err;
 
-       bio = bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write);
+       trace_btree_write(b, bytes_to_write, sectors_to_write);
 
-       wbio                    = to_wbio(bio);
+       wbio = wbio_init(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write));
        wbio->cl                = parent;
-       wbio->bounce            = true;
-       wbio->put_bio           = true;
        wbio->order             = order;
        wbio->used_mempool      = used_mempool;
-       bio->bi_iter.bi_size    = sectors_to_write << 9;
-       bio->bi_end_io          = btree_node_write_endio;
-       bio->bi_private         = b;
-       bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META|WRITE_SYNC|REQ_FUA);
+       wbio->data              = data;
+       wbio->bio.bi_opf        = REQ_OP_WRITE|REQ_META|REQ_FUA;
+       wbio->bio.bi_iter.bi_size = sectors_to_write << 9;
+       wbio->bio.bi_end_io     = btree_node_write_endio;
+       wbio->bio.bi_private    = b;
 
        if (parent)
                closure_get(parent);
 
-       bch2_bio_map(bio, data);
+       bch2_bio_map(&wbio->bio, data);
 
        /*
         * If we're appending to a leaf node, we don't technically need FUA -
@@ -1543,6 +1693,13 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
        b->written += sectors_to_write;
 
        bch2_submit_wbio_replicas(wbio, c, &k.key);
+       return;
+err:
+       set_btree_node_noevict(b);
+       b->written += sectors_to_write;
+nowrite:
+       btree_bounce_free(c, order, used_mempool, data);
+       btree_node_write_done(c, b);
 }
 
 /*
@@ -1630,82 +1787,19 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
        }
 }
 
-/*
- * Write all dirty btree nodes to disk, including roots
- */
-void bch2_btree_flush(struct bch_fs *c)
+void bch2_btree_verify_flushed(struct bch_fs *c)
 {
-       struct closure cl;
-       struct btree *b;
        struct bucket_table *tbl;
        struct rhash_head *pos;
-       bool saw_dirty;
+       struct btree *b;
        unsigned i;
 
-       closure_init_stack(&cl);
-
        rcu_read_lock();
+       tbl = rht_dereference_rcu(c->btree_cache_table.tbl,
+                                 &c->btree_cache_table);
 
-       do {
-               saw_dirty = false;
-               i = 0;
-restart:
-               tbl = rht_dereference_rcu(c->btree_cache_table.tbl,
-                                         &c->btree_cache_table);
-
-               for (; i < tbl->size; i++)
-                       rht_for_each_entry_rcu(b, pos, tbl, i, hash) {
-                               saw_dirty |= btree_node_dirty(b);
-
-                               if (btree_node_dirty(b) &&
-                                   btree_node_may_write(b)) {
-                                       rcu_read_unlock();
-                                       six_lock_read(&b->lock);
-                                       bch2_btree_node_write_dirty(c, b, &cl, 1);
-                                       six_unlock_read(&b->lock);
-                                       rcu_read_lock();
-                                       goto restart;
-                               }
-                       }
-       } while (saw_dirty);
-
+       for (i = 0; i < tbl->size; i++)
+               rht_for_each_entry_rcu(b, pos, tbl, i, hash)
+                       BUG_ON(btree_node_dirty(b));
        rcu_read_unlock();
-
-       closure_sync(&cl);
-}
-
-/**
- * bch_btree_node_flush_journal - flush any journal entries that contain keys
- * from this node
- *
- * The bset's journal sequence number is used for preserving ordering of index
- * updates across unclean shutdowns - it's used to ignore bsets newer than the
- * most recent journal entry.
- *
- * But when rewriting btree nodes we compact all the bsets in a btree node - and
- * if we compacted a bset that should be ignored with bsets we do need, that
- * would be bad. So to avoid that, prior to making the new node visible ensure
- * that the journal has been flushed so that all the bsets we compacted should
- * be visible.
- */
-void bch2_btree_node_flush_journal_entries(struct bch_fs *c,
-                                         struct btree *b,
-                                         struct closure *cl)
-{
-       int i = b->nsets;
-
-       /*
-        * Journal sequence numbers in the different bsets will always be in
-        * ascending order, we only need to flush the highest - except that the
-        * most recent bset might not have a journal sequence number yet, so we
-        * need to loop:
-        */
-       while (i--) {
-               u64 seq = le64_to_cpu(bset(b, &b->set[i])->journal_seq);
-
-               if (seq) {
-                       bch2_journal_flush_seq_async(&c->journal, seq, cl);
-                       break;
-               }
-       }
 }