New upstream release
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 0ff835e8d1b4fbc49bac8426d016098f0c57a27e..5bacc6a9dd8fb57cc6e58610db92b006fb0e3f80 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
 #include "journal.h"
 #include "keylist.h"
 #include "move.h"
+#include "nocow_locking.h"
 #include "rebalance.h"
 #include "subvolume.h"
 #include "super.h"
 #include "super-io.h"
+#include "trace.h"
 
 #include <linux/blkdev.h>
+#include <linux/prefetch.h>
 #include <linux/random.h>
 #include <linux/sched/mm.h>
 
-#include <trace/events/bcachefs.h>
-
 const char *bch2_blk_status_to_str(blk_status_t status)
 {
        if (status == BLK_STS_REMOVED)
@@ -45,6 +46,8 @@ const char *bch2_blk_status_to_str(blk_status_t status)
        return blk_status_to_str(status);
 }
 
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
+
 static bool bch2_target_congested(struct bch_fs *c, u16 target)
 {
        const struct bch_devs_mask *devs;
@@ -133,6 +136,15 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
        __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
 }
 
+#else
+
+static bool bch2_target_congested(struct bch_fs *c, u16 target)
+{
+       return false;
+}
+
+#endif
+
 /* Allocate, free from mempool: */
 
 void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
@@ -151,7 +163,7 @@ static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
        struct page *page;
 
        if (likely(!*using_mempool)) {
-               page = alloc_page(GFP_NOIO);
+               page = alloc_page(GFP_NOFS);
                if (unlikely(!page)) {
                        mutex_lock(&c->bio_bounce_pages_lock);
                        *using_mempool = true;
@@ -160,7 +172,7 @@ static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool)
                }
        } else {
 pool_alloc:
-               page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO);
+               page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS);
        }
 
        return page;
@@ -205,7 +217,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
 
        bch2_trans_copy_iter(&iter, extent_iter);
 
-       for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, old, ret) {
+       for_each_btree_key_upto_continue_norestart(iter,
+                               new->k.p, BTREE_ITER_SLOTS, old, ret) {
                s64 sectors = min(new->k.p.offset, old.k->p.offset) -
                        max(bkey_start_offset(&new->k),
                            bkey_start_offset(old.k));
@@ -225,7 +238,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
                     (!new_compressed && bch2_bkey_sectors_compressed(old))))
                        *usage_increasing = true;
 
-               if (bkey_cmp(old.k->p, new->k.p) >= 0)
+               if (bkey_ge(old.k->p, new->k.p))
                        break;
        }
 
@@ -233,57 +246,54 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
        return ret;
 }
 
-static int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
-                                            struct btree_iter *extent_iter,
-                                            u64 new_i_size,
-                                            s64 i_sectors_delta)
+static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans,
+                                                   struct btree_iter *extent_iter,
+                                                   u64 new_i_size,
+                                                   s64 i_sectors_delta)
 {
        struct btree_iter iter;
-       struct bkey_s_c inode_k;
-       struct bkey_s_c_inode_v3 inode;
-       struct bkey_i_inode_v3 *new_inode;
+       struct bkey_i *k;
+       struct bkey_i_inode_v3 *inode;
+       unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL;
        int ret;
 
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
-                            SPOS(0,
-                                 extent_iter->pos.inode,
-                                 extent_iter->snapshot),
-                            BTREE_ITER_INTENT|BTREE_ITER_CACHED);
-       inode_k = bch2_btree_iter_peek_slot(&iter);
-       ret = bkey_err(inode_k);
+       k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes,
+                             SPOS(0,
+                                  extent_iter->pos.inode,
+                                  extent_iter->snapshot),
+                             BTREE_ITER_CACHED);
+       ret = PTR_ERR_OR_ZERO(k);
        if (unlikely(ret))
-               goto err;
-
-       ret = bkey_is_inode(inode_k.k) ? 0 : -ENOENT;
-       if (unlikely(ret))
-               goto err;
+               return ret;
 
-       if (unlikely(inode_k.k->type != KEY_TYPE_inode_v3)) {
-               inode_k = bch2_inode_to_v3(trans, inode_k);
-               ret = bkey_err(inode_k);
+       if (unlikely(k->k.type != KEY_TYPE_inode_v3)) {
+               k = bch2_inode_to_v3(trans, k);
+               ret = PTR_ERR_OR_ZERO(k);
                if (unlikely(ret))
                        goto err;
        }
 
-       inode = bkey_s_c_to_inode_v3(inode_k);
-
-       new_inode = bch2_trans_kmalloc(trans, bkey_bytes(inode_k.k));
-       ret = PTR_ERR_OR_ZERO(new_inode);
-       if (unlikely(ret))
-               goto err;
-
-       bkey_reassemble(&new_inode->k_i, inode.s_c);
+       inode = bkey_i_to_inode_v3(k);
 
-       if (!(le64_to_cpu(inode.v->bi_flags) & BCH_INODE_I_SIZE_DIRTY) &&
-           new_i_size > le64_to_cpu(inode.v->bi_size))
-               new_inode->v.bi_size = cpu_to_le64(new_i_size);
+       if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) &&
+           new_i_size > le64_to_cpu(inode->v.bi_size)) {
+               inode->v.bi_size = cpu_to_le64(new_i_size);
+               inode_update_flags = 0;
+       }
 
-       le64_add_cpu(&new_inode->v.bi_sectors, i_sectors_delta);
+       if (i_sectors_delta) {
+               le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta);
+               inode_update_flags = 0;
+       }
 
-       new_inode->k.p.snapshot = iter.snapshot;
+       if (inode->k.p.snapshot != iter.snapshot) {
+               inode->k.p.snapshot = iter.snapshot;
+               inode_update_flags = 0;
+       }
 
-       ret = bch2_trans_update(trans, &iter, &new_inode->k_i,
-                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+       ret = bch2_trans_update(trans, &iter, &inode->k_i,
+                               BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
+                               inode_update_flags);
 err:
        bch2_trans_iter_exit(trans, &iter);
        return ret;
@@ -373,6 +383,7 @@ int bch2_extent_fallocate(struct btree_trans *trans,
        struct open_buckets open_buckets;
        struct bkey_s_c k;
        struct bkey_buf old, new;
+       unsigned sectors_allocated;
        bool have_reservation = false;
        bool unwritten = opts.nocow &&
            c->sb.version >= bcachefs_metadata_version_unwritten_extents;
@@ -383,6 +394,8 @@ int bch2_extent_fallocate(struct btree_trans *trans,
        closure_init_stack(&cl);
        open_buckets.nr = 0;
 retry:
+       sectors_allocated = 0;
+
        k = bch2_btree_iter_peek_slot(iter);
        ret = bkey_err(k);
        if (ret)
@@ -438,16 +451,17 @@ retry:
                                &devs_have,
                                opts.data_replicas,
                                opts.data_replicas,
-                               RESERVE_none, 0, &cl, &wp);
-               if (ret == -EAGAIN) {
+                               BCH_WATERMARK_normal, 0, &cl, &wp);
+               if (ret) {
                        bch2_trans_unlock(trans);
                        closure_sync(&cl);
-                       goto retry;
-               }
-               if (ret)
+                       if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
+                               goto retry;
                        return ret;
+               }
 
                sectors = min(sectors, wp->sectors_free);
+               sectors_allocated = sectors;
 
                bch2_key_resize(&e->k, sectors);
 
@@ -474,6 +488,9 @@ out:
                goto retry;
        }
 
+       if (!ret && sectors_allocated)
+               bch2_increment_clock(c, sectors_allocated, WRITE);
+
        bch2_open_buckets_put(c, &open_buckets);
        bch2_disk_reservation_put(c, &disk_res);
        bch2_bkey_buf_exit(&new, c);
@@ -513,11 +530,12 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
 
                bch2_btree_iter_set_snapshot(iter, snapshot);
 
-               k = bch2_btree_iter_peek(iter);
-               if (bkey_cmp(iter->pos, end_pos) >= 0) {
-                       bch2_btree_iter_set_pos(iter, end_pos);
+               /*
+                * peek_upto() doesn't have ideal semantics for extents:
+                */
+               k = bch2_btree_iter_peek_upto(iter, end_pos);
+               if (!k.k)
                        break;
-               }
 
                ret = bkey_err(k);
                if (ret)
@@ -608,7 +626,7 @@ static int bch2_write_index_default(struct bch_write_op *op)
                if (ret)
                        break;
 
-               if (bkey_cmp(iter.pos, k->k.p) >= 0)
+               if (bkey_ge(iter.pos, k->k.p))
                        bch2_keylist_pop_front(&op->insert_keys);
                else
                        bch2_cut_front(iter.pos, k);
@@ -642,7 +660,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
 
                if (to_entry(ptr + 1) < ptrs.end) {
                        n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
-                                               GFP_NOIO, &ca->replica_set));
+                                               GFP_NOFS, &ca->replica_set));
 
                        n->bio.bi_end_io        = wbio->bio.bi_end_io;
                        n->bio.bi_private       = wbio->bio.bi_private;
@@ -671,6 +689,12 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
                                     bio_sectors(&n->bio));
 
                        bio_set_dev(&n->bio, ca->disk_sb.bdev);
+
+                       if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) {
+                               bio_endio(&n->bio);
+                               continue;
+                       }
+
                        submit_bio(&n->bio);
                } else {
                        n->bio.bi_status        = BLK_STS_REMOVED;
@@ -687,11 +711,13 @@ static void bch2_write_done(struct closure *cl)
        struct bch_fs *c = op->c;
 
        bch2_disk_reservation_put(c, &op->res);
-       percpu_ref_put(&c->writes);
+       if (!(op->flags & BCH_WRITE_MOVE))
+               bch2_write_ref_put(c, BCH_WRITE_REF_write);
        bch2_keylist_free(&op->insert_keys, op->inline_keys);
 
        bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time);
 
+       EBUG_ON(cl->parent);
        closure_debug_destroy(cl);
        if (op->end_io)
                op->end_io(op);
@@ -715,7 +741,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op)
                }
 
                if (dst != src)
-                       memmove_u64s_down(dst, src, src->u64s);
+                       memmove_u64s_down(dst, src, src->k.u64s);
                dst = bkey_next(dst);
        }
 
@@ -745,14 +771,9 @@ static void __bch2_write_index(struct bch_write_op *op)
         * particularly want to plumb io_opts all the way through the btree
         * update stack right now
         */
-       for_each_keylist_key(keys, k) {
+       for_each_keylist_key(keys, k)
                bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts);
 
-               if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k)))
-                       bch2_check_set_feature(op->c, BCH_FEATURE_incompressible);
-
-       }
-
        if (!bch2_keylist_empty(keys)) {
                u64 sectors_start = keylist_sectors(keys);
 
@@ -765,15 +786,17 @@ static void __bch2_write_index(struct bch_write_op *op)
 
                op->written += sectors_start - keylist_sectors(keys);
 
-               if (ret) {
+               if (ret && !bch2_err_matches(ret, EROFS)) {
                        struct bkey_i *k = bch2_keylist_front(&op->insert_keys);
 
                        bch_err_inum_offset_ratelimited(c,
                                k->k.p.inode, k->k.p.offset << 9,
                                "write error while doing btree update: %s",
                                bch2_err_str(ret));
-                       goto err;
                }
+
+               if (ret)
+                       goto err;
        }
 out:
        /* If some a bucket wasn't written, we can't erasure code it: */
@@ -789,17 +812,61 @@ err:
        goto out;
 }
 
+static inline void __wp_update_state(struct write_point *wp, enum write_point_state state)
+{
+       if (state != wp->state) {
+               u64 now = ktime_get_ns();
+
+               if (wp->last_state_change &&
+                   time_after64(now, wp->last_state_change))
+                       wp->time[wp->state] += now - wp->last_state_change;
+               wp->state = state;
+               wp->last_state_change = now;
+       }
+}
+
+static inline void wp_update_state(struct write_point *wp, bool running)
+{
+       enum write_point_state state;
+
+       state = running                  ? WRITE_POINT_running :
+               !list_empty(&wp->writes) ? WRITE_POINT_waiting_io
+                                        : WRITE_POINT_stopped;
+
+       __wp_update_state(wp, state);
+}
+
 static void bch2_write_index(struct closure *cl)
 {
        struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
        struct write_point *wp = op->wp;
        struct workqueue_struct *wq = index_update_wq(op);
+       unsigned long flags;
+
+       if ((op->flags & BCH_WRITE_DONE) &&
+           (op->flags & BCH_WRITE_MOVE))
+               bch2_bio_free_pages_pool(op->c, &op->wbio.bio);
+
+       spin_lock_irqsave(&wp->writes_lock, flags);
+       if (wp->state == WRITE_POINT_waiting_io)
+               __wp_update_state(wp, WRITE_POINT_waiting_work);
+       list_add_tail(&op->wp_list, &wp->writes);
+       spin_unlock_irqrestore (&wp->writes_lock, flags);
 
-       barrier();
-       op->btree_update_ready = true;
        queue_work(wq, &wp->index_update_work);
 }
 
+static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp)
+{
+       op->wp = wp;
+
+       if (wp->state == WRITE_POINT_stopped) {
+               spin_lock_irq(&wp->writes_lock);
+               __wp_update_state(wp, WRITE_POINT_waiting_io);
+               spin_unlock_irq(&wp->writes_lock);
+       }
+}
+
 void bch2_write_point_do_index_updates(struct work_struct *work)
 {
        struct write_point *wp =
@@ -807,17 +874,18 @@ void bch2_write_point_do_index_updates(struct work_struct *work)
        struct bch_write_op *op;
 
        while (1) {
-               spin_lock(&wp->writes_lock);
+               spin_lock_irq(&wp->writes_lock);
                op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list);
-               if (op && !op->btree_update_ready)
-                       op = NULL;
                if (op)
                        list_del(&op->wp_list);
-               spin_unlock(&wp->writes_lock);
+               wp_update_state(wp, op != NULL);
+               spin_unlock_irq(&wp->writes_lock);
 
                if (!op)
                        break;
 
+               op->flags |= BCH_WRITE_IN_WORKER;
+
                __bch2_write_index(op);
 
                if (!(op->flags & BCH_WRITE_DONE))
@@ -859,12 +927,10 @@ static void bch2_write_endio(struct bio *bio)
        if (wbio->put_bio)
                bio_put(bio);
 
-       if (parent) {
+       if (parent)
                bio_endio(&parent->bio);
-               return;
-       }
-
-       closure_put(cl);
+       else
+               closure_put(cl);
 }
 
 static void init_append_extent(struct bch_write_op *op,
@@ -872,7 +938,6 @@ static void init_append_extent(struct bch_write_op *op,
                               struct bversion version,
                               struct bch_extent_crc_unpacked crc)
 {
-       struct bch_fs *c = op->c;
        struct bkey_i_extent *e;
 
        op->pos.offset += crc.uncompressed_size;
@@ -887,7 +952,7 @@ static void init_append_extent(struct bch_write_op *op,
            crc.nonce)
                bch2_extent_crc_append(&e->k_i, crc);
 
-       bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, crc.compressed_size,
+       bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size,
                                       op->flags & BCH_WRITE_CACHED);
 
        bch2_keylist_push(&op->insert_keys);
@@ -911,7 +976,7 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
        pages = min(pages, BIO_MAX_VECS);
 
        bio = bio_alloc_bioset(NULL, pages, 0,
-                              GFP_NOIO, &c->bio_write);
+                              GFP_NOFS, &c->bio_write);
        wbio                    = wbio_init(bio);
        wbio->put_bio           = true;
        /* copy WRITE_SYNC flag */
@@ -1013,11 +1078,12 @@ static enum prep_encoded_ret {
        /* Can we just write the entire extent as is? */
        if (op->crc.uncompressed_size == op->crc.live_size &&
            op->crc.compressed_size <= wp->sectors_free &&
-           (op->crc.compression_type == op->compression_type ||
+           (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) ||
             op->incompressible)) {
                if (!crc_is_compressed(op->crc) &&
                    op->csum_type != op->crc.csum_type &&
-                   bch2_write_rechecksum(c, op, op->csum_type))
+                   bch2_write_rechecksum(c, op, op->csum_type) &&
+                   !c->opts.no_data_io)
                        return PREP_ENCODED_CHECKSUM_ERR;
 
                return PREP_ENCODED_DO_WRITE;
@@ -1037,7 +1103,7 @@ static enum prep_encoded_ret {
                csum = bch2_checksum_bio(c, op->crc.csum_type,
                                         extent_nonce(op->version, op->crc),
                                         bio);
-               if (bch2_crc_cmp(op->crc.csum, csum))
+               if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io)
                        return PREP_ENCODED_CHECKSUM_ERR;
 
                if (bch2_bio_uncompress_inplace(c, bio, &op->crc))
@@ -1055,13 +1121,14 @@ static enum prep_encoded_ret {
         */
        if ((op->crc.live_size != op->crc.uncompressed_size ||
             op->crc.csum_type != op->csum_type) &&
-           bch2_write_rechecksum(c, op, op->csum_type))
+           bch2_write_rechecksum(c, op, op->csum_type) &&
+           !c->opts.no_data_io)
                return PREP_ENCODED_CHECKSUM_ERR;
 
        /*
         * If we want to compress the data, it has to be decrypted:
         */
-       if ((op->compression_type ||
+       if ((op->compression_opt ||
             bch2_csum_type_is_encryption(op->crc.csum_type) !=
             bch2_csum_type_is_encryption(op->csum_type)) &&
            bch2_write_decrypt(op))
@@ -1108,7 +1175,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
        }
 
        if (ec_buf ||
-           op->compression_type ||
+           op->compression_opt ||
            (op->csum_type &&
             !(op->flags & BCH_WRITE_PAGES_STABLE)) ||
            (bch2_csum_type_is_encryption(op->csum_type) &&
@@ -1131,16 +1198,16 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
                    dst->bi_iter.bi_size < c->opts.encoded_extent_max)
                        break;
 
-               BUG_ON(op->compression_type &&
+               BUG_ON(op->compression_opt &&
                       (op->flags & BCH_WRITE_DATA_ENCODED) &&
                       bch2_csum_type_is_encryption(op->crc.csum_type));
-               BUG_ON(op->compression_type && !bounce);
+               BUG_ON(op->compression_opt && !bounce);
 
                crc.compression_type = op->incompressible
                        ? BCH_COMPRESSION_TYPE_incompressible
-                       : op->compression_type
+                       : op->compression_opt
                        ? bch2_bio_compress(c, dst, &dst_len, src, &src_len,
-                                           op->compression_type)
+                                           op->compression_opt)
                        : 0;
                if (!crc_is_compressed(crc)) {
                        dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size);
@@ -1249,7 +1316,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp,
                BUG_ON(total_output != total_input);
 
                dst = bio_split(src, total_input >> 9,
-                               GFP_NOIO, &c->bio_write);
+                               GFP_NOFS, &c->bio_write);
                wbio_init(dst)->put_bio = true;
                /* copy WRITE_SYNC flag */
                dst->bi_opf             = src->bi_opf;
@@ -1328,13 +1395,11 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
                return 0;
        }
 
-       new = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
+       new = bch2_bkey_make_mut_noupdate(trans, k);
        ret = PTR_ERR_OR_ZERO(new);
        if (ret)
                return ret;
 
-       bkey_reassemble(new, k);
-
        bch2_cut_front(bkey_start_pos(&orig->k), new);
        bch2_cut_back(orig->k.p, new);
 
@@ -1367,23 +1432,23 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
        bch2_trans_init(&trans, c, 0, 0);
 
        for_each_keylist_key(&op->insert_keys, orig) {
-               ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents,
-                                    bkey_start_pos(&orig->k),
+               ret = for_each_btree_key_upto_commit(&trans, iter, BTREE_ID_extents,
+                                    bkey_start_pos(&orig->k), orig->k.p,
                                     BTREE_ITER_INTENT, k,
                                     NULL, NULL, BTREE_INSERT_NOFAIL, ({
-                       if (bkey_cmp(bkey_start_pos(k.k), orig->k.p) >= 0)
-                               break;
-
                        bch2_nocow_write_convert_one_unwritten(&trans, &iter, orig, k, op->new_i_size);
                }));
 
-               if (ret) {
+               if (ret && !bch2_err_matches(ret, EROFS)) {
                        struct bkey_i *k = bch2_keylist_front(&op->insert_keys);
 
                        bch_err_inum_offset_ratelimited(c,
                                k->k.p.inode, k->k.p.offset << 9,
                                "write error while doing btree update: %s",
                                bch2_err_str(ret));
+               }
+
+               if (ret) {
                        op->error = ret;
                        break;
                }
@@ -1417,9 +1482,15 @@ static void bch2_nocow_write(struct bch_write_op *op)
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bkey_ptrs_c ptrs;
-       const struct bch_extent_ptr *ptr, *ptr2;
+       const struct bch_extent_ptr *ptr;
+       struct {
+               struct bpos     b;
+               unsigned        gen;
+               struct nocow_lock_bucket *l;
+       } buckets[BCH_REPLICAS_MAX];
+       unsigned nr_buckets = 0;
        u32 snapshot;
-       int ret;
+       int ret, i;
 
        if (op->flags & BCH_WRITE_MOVE)
                return;
@@ -1438,6 +1509,8 @@ retry:
        while (1) {
                struct bio *bio = &op->wbio.bio;
 
+               nr_buckets = 0;
+
                k = bch2_btree_iter_peek_slot(&iter);
                ret = bkey_err(k);
                if (ret)
@@ -1456,27 +1529,47 @@ retry:
 
                /* Get iorefs before dropping btree locks: */
                ptrs = bch2_bkey_ptrs_c(k);
-               bkey_for_each_ptr(ptrs, ptr)
+               bkey_for_each_ptr(ptrs, ptr) {
+                       buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr);
+                       buckets[nr_buckets].gen = ptr->gen;
+                       buckets[nr_buckets].l =
+                               bucket_nocow_lock(&c->nocow_locks,
+                                                 bucket_to_u64(buckets[nr_buckets].b));
+
+                       prefetch(buckets[nr_buckets].l);
+
                        if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE)))
                                goto err_get_ioref;
 
+                       nr_buckets++;
+
+                       if (ptr->unwritten)
+                               op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
+               }
+
                /* Unlock before taking nocow locks, doing IO: */
                bkey_reassemble(op->insert_keys.top, k);
                bch2_trans_unlock(&trans);
 
                bch2_cut_front(op->pos, op->insert_keys.top);
-               bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
+               if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN)
+                       bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top);
 
-               ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(op->insert_keys.top));
-               bkey_for_each_ptr(ptrs, ptr) {
-                       bch2_bucket_nocow_lock(&c->nocow_locks,
-                                              PTR_BUCKET_POS(c, ptr),
-                                              BUCKET_NOCOW_LOCK_UPDATE);
-                       if (unlikely(ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
-                               goto err_bucket_stale;
+               for (i = 0; i < nr_buckets; i++) {
+                       struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode);
+                       struct nocow_lock_bucket *l = buckets[i].l;
+                       bool stale;
 
-                       if (ptr->unwritten)
-                               op->flags |= BCH_WRITE_CONVERT_UNWRITTEN;
+                       __bch2_bucket_nocow_lock(&c->nocow_locks, l,
+                                                bucket_to_u64(buckets[i].b),
+                                                BUCKET_NOCOW_LOCK_UPDATE);
+
+                       rcu_read_lock();
+                       stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen);
+                       rcu_read_unlock();
+
+                       if (unlikely(stale))
+                               goto err_bucket_stale;
                }
 
                bio = &op->wbio.bio;
@@ -1540,29 +1633,21 @@ err:
        }
        return;
 err_get_ioref:
-       bkey_for_each_ptr(ptrs, ptr2) {
-               if (ptr2 == ptr)
-                       break;
-
-               percpu_ref_put(&bch_dev_bkey_exists(c, ptr2->dev)->io_ref);
-       }
+       for (i = 0; i < nr_buckets; i++)
+               percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);
 
        /* Fall back to COW path: */
        goto out;
 err_bucket_stale:
-       bkey_for_each_ptr(ptrs, ptr2) {
+       while (--i >= 0)
                bch2_bucket_nocow_unlock(&c->nocow_locks,
-                                        PTR_BUCKET_POS(c, ptr2),
+                                        buckets[i].b,
                                         BUCKET_NOCOW_LOCK_UPDATE);
-               if (ptr2 == ptr)
-                       break;
-       }
-
-       bkey_for_each_ptr(ptrs, ptr2)
-               percpu_ref_put(&bch_dev_bkey_exists(c, ptr2->dev)->io_ref);
+       for (i = 0; i < nr_buckets; i++)
+               percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref);
 
        /* We can retry this: */
-       ret = BCH_ERR_transaction_restart;
+       ret = -BCH_ERR_transaction_restart;
        goto out;
 }
 
@@ -1576,14 +1661,13 @@ static void __bch2_write(struct bch_write_op *op)
 
        nofs_flags = memalloc_nofs_save();
 
-       if (unlikely(op->opts.nocow)) {
+       if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) {
                bch2_nocow_write(op);
                if (op->flags & BCH_WRITE_DONE)
                        goto out_nofs_restore;
        }
 again:
        memset(&op->failed, 0, sizeof(op->failed));
-       op->btree_update_ready = false;
 
        do {
                struct bkey_i *key_to_write;
@@ -1614,33 +1698,33 @@ again:
                                &op->devs_have,
                                op->nr_replicas,
                                op->nr_replicas_required,
-                               op->alloc_reserve,
+                               op->watermark,
                                op->flags,
                                (op->flags & (BCH_WRITE_ALLOC_NOWAIT|
                                              BCH_WRITE_ONLY_SPECIFIED_DEVS))
                                ? NULL : &op->cl, &wp));
                if (unlikely(ret)) {
-                       if (unlikely(ret != -EAGAIN)) {
-                               op->error = ret;
-                               op->flags |= BCH_WRITE_DONE;
-                       }
+                       if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
+                               break;
 
-                       break;
+                       goto err;
                }
 
+               EBUG_ON(!wp);
+
                bch2_open_bucket_get(c, wp, &op->open_buckets);
                ret = bch2_write_extent(op, wp, &bio);
 
-               bch2_alloc_sectors_done(c, wp);
-
-               if (ret < 0) {
-                       op->error = ret;
+               bch2_alloc_sectors_done_inlined(c, wp);
+err:
+               if (ret <= 0) {
                        op->flags |= BCH_WRITE_DONE;
-                       break;
-               }
 
-               if (!ret)
-                       op->flags |= BCH_WRITE_DONE;
+                       if (ret < 0) {
+                               op->error = ret;
+                               break;
+                       }
+               }
 
                bio->bi_end_io  = bch2_write_endio;
                bio->bi_private = &op->cl;
@@ -1662,7 +1746,9 @@ again:
         * synchronously here if we weren't able to submit all of the IO at
         * once, as that signals backpressure to the caller.
         */
-       if ((op->flags & BCH_WRITE_SYNC) || !(op->flags & BCH_WRITE_DONE)) {
+       if ((op->flags & BCH_WRITE_SYNC) ||
+           (!(op->flags & BCH_WRITE_DONE) &&
+            !(op->flags & BCH_WRITE_IN_WORKER))) {
                closure_sync(&op->cl);
                __bch2_write_index(op);
 
@@ -1670,11 +1756,7 @@ again:
                        goto again;
                bch2_write_done(&op->cl);
        } else {
-               spin_lock(&wp->writes_lock);
-               op->wp = wp;
-               list_add_tail(&op->wp_list, &wp->writes);
-               spin_unlock(&wp->writes_lock);
-
+               bch2_write_queue(op, wp);
                continue_at(&op->cl, bch2_write_index, NULL);
        }
 out_nofs_restore:
@@ -1689,6 +1771,9 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
        unsigned sectors;
        int ret;
 
+       op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
+       op->flags |= BCH_WRITE_DONE;
+
        bch2_check_set_feature(op->c, BCH_FEATURE_inline_data);
 
        ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
@@ -1716,9 +1801,6 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
        set_bkey_val_bytes(&id->k, data_len);
        bch2_keylist_push(&op->insert_keys);
 
-       op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
-       op->flags |= BCH_WRITE_DONE;
-
        __bch2_write_index(op);
 err:
        bch2_write_done(&op->cl);
@@ -1750,7 +1832,7 @@ void bch2_write(struct closure *cl)
        EBUG_ON(op->cl.parent);
        BUG_ON(!op->nr_replicas);
        BUG_ON(!op->write_point.v);
-       BUG_ON(!bkey_cmp(op->pos, POS_MAX));
+       BUG_ON(bkey_eq(op->pos, POS_MAX));
 
        op->start_time = local_clock();
        bch2_keylist_init(&op->insert_keys, op->inline_keys);
@@ -1765,9 +1847,14 @@ void bch2_write(struct closure *cl)
                goto err;
        }
 
-       if (c->opts.nochanges ||
-           !percpu_ref_tryget_live(&c->writes)) {
-               op->error = -EROFS;
+       if (c->opts.nochanges) {
+               op->error = -BCH_ERR_erofs_no_writes;
+               goto err;
+       }
+
+       if (!(op->flags & BCH_WRITE_MOVE) &&
+           !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) {
+               op->error = -BCH_ERR_erofs_no_writes;
                goto err;
        }
 
@@ -1793,6 +1880,34 @@ err:
                op->end_io(op);
 }
 
+static const char * const bch2_write_flags[] = {
+#define x(f)   #f,
+       BCH_WRITE_FLAGS()
+#undef x
+       NULL
+};
+
+void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op)
+{
+       prt_str(out, "pos: ");
+       bch2_bpos_to_text(out, op->pos);
+       prt_newline(out);
+       printbuf_indent_add(out, 2);
+
+       prt_str(out, "started: ");
+       bch2_pr_time_units(out, local_clock() - op->start_time);
+       prt_newline(out);
+
+       prt_str(out, "flags: ");
+       prt_bitflags(out, bch2_write_flags, op->flags);
+       prt_newline(out);
+
+       prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl));
+       prt_newline(out);
+
+       printbuf_indent_sub(out, 2);
+}
+
 /* Cache promotion on read */
 
 struct promote_op {
@@ -1845,10 +1960,12 @@ static void promote_free(struct bch_fs *c, struct promote_op *op)
 {
        int ret;
 
+       bch2_data_update_exit(&op->write);
+
        ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
                                     bch_promote_params);
        BUG_ON(ret);
-       percpu_ref_put(&c->writes);
+       bch2_write_ref_put(c, BCH_WRITE_REF_promote);
        kfree_rcu(op, rcu);
 }
 
@@ -1860,8 +1977,6 @@ static void promote_done(struct bch_write_op *wop)
 
        bch2_time_stats_update(&c->times[BCH_TIME_data_promote],
                               op->start_time);
-
-       bch2_data_update_exit(&op->write);
        promote_free(c, op);
 }
 
@@ -1882,7 +1997,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
        bch2_data_update_read_done(&op->write, rbio->pick.crc);
 }
 
-static struct promote_op *__promote_alloc(struct bch_fs *c,
+static struct promote_op *__promote_alloc(struct btree_trans *trans,
                                          enum btree_id btree_id,
                                          struct bkey_s_c k,
                                          struct bpos pos,
@@ -1891,15 +2006,16 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
                                          unsigned sectors,
                                          struct bch_read_bio **rbio)
 {
+       struct bch_fs *c = trans->c;
        struct promote_op *op = NULL;
        struct bio *bio;
        unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
        int ret;
 
-       if (!percpu_ref_tryget_live(&c->writes))
+       if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
                return NULL;
 
-       op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
+       op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS);
        if (!op)
                goto err;
 
@@ -1912,7 +2028,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
         */
        *rbio = kzalloc(sizeof(struct bch_read_bio) +
                        sizeof(struct bio_vec) * pages,
-                       GFP_NOIO);
+                       GFP_NOFS);
        if (!*rbio)
                goto err;
 
@@ -1920,7 +2036,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
        bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
 
        if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
-                                GFP_NOIO))
+                                GFP_NOFS))
                goto err;
 
        (*rbio)->bounce         = true;
@@ -1934,7 +2050,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
        bio = &op->write.op.wbio.bio;
        bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
 
-       ret = bch2_data_update_init(c, &op->write,
+       ret = bch2_data_update_init(trans, NULL, &op->write,
                        writepoint_hashed((unsigned long) current),
                        opts,
                        (struct data_update_opts) {
@@ -1943,7 +2059,17 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
                                .write_flags    = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED,
                        },
                        btree_id, k);
-       BUG_ON(ret);
+       /*
+        * possible errors: -BCH_ERR_nocow_lock_blocked,
+        * -BCH_ERR_ENOSPC_disk_reservation:
+        */
+       if (ret) {
+               ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
+                                       bch_promote_params);
+               BUG_ON(ret);
+               goto err;
+       }
+
        op->write.op.end_io = promote_done;
 
        return op;
@@ -1953,21 +2079,22 @@ err:
        kfree(*rbio);
        *rbio = NULL;
        kfree(op);
-       percpu_ref_put(&c->writes);
+       bch2_write_ref_put(c, BCH_WRITE_REF_promote);
        return NULL;
 }
 
 noinline
-static struct promote_op *promote_alloc(struct bch_fs *c,
-                                              struct bvec_iter iter,
-                                              struct bkey_s_c k,
-                                              struct extent_ptr_decoded *pick,
-                                              struct bch_io_opts opts,
-                                              unsigned flags,
-                                              struct bch_read_bio **rbio,
-                                              bool *bounce,
-                                              bool *read_full)
+static struct promote_op *promote_alloc(struct btree_trans *trans,
+                                       struct bvec_iter iter,
+                                       struct bkey_s_c k,
+                                       struct extent_ptr_decoded *pick,
+                                       struct bch_io_opts opts,
+                                       unsigned flags,
+                                       struct bch_read_bio **rbio,
+                                       bool *bounce,
+                                       bool *read_full)
 {
+       struct bch_fs *c = trans->c;
        bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents);
        /* data might have to be decompressed in the write path: */
        unsigned sectors = promote_full
@@ -1981,7 +2108,7 @@ static struct promote_op *promote_alloc(struct bch_fs *c,
        if (!should_promote(c, k, pos, opts, flags))
                return NULL;
 
-       promote = __promote_alloc(c,
+       promote = __promote_alloc(trans,
                                  k.k->type == KEY_TYPE_reflink_v
                                  ? BTREE_ID_reflink
                                  : BTREE_ID_extents,
@@ -2189,9 +2316,8 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
        if (crc_is_compressed(rbio->pick.crc))
                return 0;
 
-       bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->data_pos,
-                            BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-       k = bch2_btree_iter_peek_slot(&iter);
+       k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos,
+                              BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
        if ((ret = bkey_err(k)))
                goto out;
 
@@ -2267,7 +2393,7 @@ static void __bch2_read_endio(struct work_struct *work)
        }
 
        csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
-       if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
+       if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
                goto csum_err;
 
        /*
@@ -2292,7 +2418,8 @@ static void __bch2_read_endio(struct work_struct *work)
                if (ret)
                        goto decrypt_err;
 
-               if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
+               if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) &&
+                   !c->opts.no_data_io)
                        goto decompression_err;
        } else {
                /* don't need to decrypt the entire bio: */
@@ -2427,10 +2554,8 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
        reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
                *offset_into_extent;
 
-       bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink,
-                            POS(0, reflink_offset),
-                            BTREE_ITER_SLOTS);
-       k = bch2_btree_iter_peek_slot(&iter);
+       k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink,
+                              POS(0, reflink_offset), 0);
        ret = bkey_err(k);
        if (ret)
                goto err;
@@ -2588,7 +2713,7 @@ retry_pick:
        }
 
        if (orig->opts.promote_target)
-               promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
+               promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
                                        &rbio, &bounce, &read_full);
 
        if (!read_full) {
@@ -2627,7 +2752,7 @@ get_bio:
                rbio = rbio_init(bio_alloc_bioset(NULL,
                                                  DIV_ROUND_UP(sectors, PAGE_SECTORS),
                                                  0,
-                                                 GFP_NOIO,
+                                                 GFP_NOFS,
                                                  &c->bio_read_split),
                                 orig->opts);
 
@@ -2643,7 +2768,7 @@ get_bio:
                 * from the whole bio, in which case we don't want to retry and
                 * lose the error)
                 */
-               rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOIO,
+               rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS,
                                                 &c->bio_read_split),
                                 orig->opts);
                rbio->bio.bi_iter = iter;
@@ -2718,10 +2843,21 @@ get_bio:
                             bio_sectors(&rbio->bio));
                bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
 
-               if (likely(!(flags & BCH_READ_IN_RETRY)))
-                       submit_bio(&rbio->bio);
-               else
-                       submit_bio_wait(&rbio->bio);
+               if (unlikely(c->opts.no_data_io)) {
+                       if (likely(!(flags & BCH_READ_IN_RETRY)))
+                               bio_endio(&rbio->bio);
+               } else {
+                       if (likely(!(flags & BCH_READ_IN_RETRY)))
+                               submit_bio(&rbio->bio);
+                       else
+                               submit_bio_wait(&rbio->bio);
+               }
+
+               /*
+                * We just submitted IO which may block, we expect relock fail
+                * events and shouldn't count them:
+                */
+               trans->notrace_relock_fail = true;
        } else {
                /* Attempting reconstruct read: */
                if (bch2_ec_read_extent(c, rbio)) {
@@ -2897,24 +3033,27 @@ void bch2_fs_io_exit(struct bch_fs *c)
 
 int bch2_fs_io_init(struct bch_fs *c)
 {
-       unsigned i;
+       if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
+                       BIOSET_NEED_BVECS))
+               return -BCH_ERR_ENOMEM_bio_read_init;
 
-       for (i = 0; i < ARRAY_SIZE(c->nocow_locks.l); i++)
-               two_state_lock_init(&c->nocow_locks.l[i]);
+       if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
+                       BIOSET_NEED_BVECS))
+               return -BCH_ERR_ENOMEM_bio_read_split_init;
 
-       if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
-                       BIOSET_NEED_BVECS) ||
-           bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
-                       BIOSET_NEED_BVECS) ||
-           bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
-                       BIOSET_NEED_BVECS) ||
-           mempool_init_page_pool(&c->bio_bounce_pages,
+       if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
+                       BIOSET_NEED_BVECS))
+               return -BCH_ERR_ENOMEM_bio_write_init;
+
+       if (mempool_init_page_pool(&c->bio_bounce_pages,
                                   max_t(unsigned,
                                         c->opts.btree_node_size,
                                         c->opts.encoded_extent_max) /
-                                  PAGE_SIZE, 0) ||
-           rhashtable_init(&c->promote_table, &bch_promote_params))
-               return -ENOMEM;
+                                  PAGE_SIZE, 0))
+               return -BCH_ERR_ENOMEM_bio_bounce_pages_init;
+
+       if (rhashtable_init(&c->promote_table, &bch_promote_params))
+               return -BCH_ERR_ENOMEM_promote_table_init;
 
        return 0;
 }