Update bcachefs sources to 9e7ae5219c bcachefs: Make write points more dynamic
[bcachefs-tools-debian] / libbcachefs / io.c
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index da06845cd1a775105883bff5db6e9d0eac5c2784..e5fc72da83bde530874a73574b62b7bde4d22db1 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
 
 #include <trace/events/bcachefs.h>
 
-static inline void __bio_inc_remaining(struct bio *bio)
-{
-       bio_set_flag(bio, BIO_CHAIN);
-       smp_mb__before_atomic();
-       atomic_inc(&bio->__bi_remaining);
-}
-
 /* Allocate, free from mempool: */
 
 void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
@@ -85,39 +78,23 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
 
 /* Bios with headers */
 
-static void bch2_submit_wbio(struct bch_fs *c, struct bch_write_bio *wbio,
-                           struct bch_dev *ca, const struct bch_extent_ptr *ptr)
-{
-       wbio->ca                = ca;
-       wbio->submit_time_us    = local_clock_us();
-       wbio->bio.bi_iter.bi_sector = ptr->offset;
-       wbio->bio.bi_bdev       = ca ? ca->disk_sb.bdev : NULL;
-
-       if (!ca)
-               bcache_io_error(c, &wbio->bio, "device has been removed");
-       else
-               generic_make_request(&wbio->bio);
-}
-
 void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
+                              enum bch_data_type type,
                               const struct bkey_i *k)
 {
        struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
        const struct bch_extent_ptr *ptr;
        struct bch_write_bio *n;
        struct bch_dev *ca;
+       unsigned ptr_idx = 0;
 
        BUG_ON(c->opts.nochanges);
 
-       wbio->split = false;
-       wbio->c = c;
-
        extent_for_each_ptr(e, ptr) {
+               BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
+                      !c->devs[ptr->dev]);
+
                ca = c->devs[ptr->dev];
-               if (!percpu_ref_tryget(&ca->io_ref)) {
-                       bch2_submit_wbio(c, wbio, NULL, ptr);
-                       break;
-               }
 
                if (ptr + 1 < &extent_entry_last(e)->ptr) {
                        n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
@@ -125,21 +102,38 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
 
                        n->bio.bi_end_io        = wbio->bio.bi_end_io;
                        n->bio.bi_private       = wbio->bio.bi_private;
-                       n->c                    = c;
-                       n->orig                 = &wbio->bio;
-                       n->bounce               = false;
+                       n->parent               = wbio;
                        n->split                = true;
+                       n->bounce               = false;
                        n->put_bio              = true;
                        n->bio.bi_opf           = wbio->bio.bi_opf;
-                       __bio_inc_remaining(n->orig);
+                       bio_inc_remaining(&wbio->bio);
                } else {
                        n = wbio;
+                       n->split                = false;
                }
 
+               n->c                    = c;
+               n->ca                   = ca;
+               n->ptr_idx              = ptr_idx++;
+               n->submit_time_us       = local_clock_us();
+               n->bio.bi_iter.bi_sector = ptr->offset;
+
                if (!journal_flushes_device(ca))
                        n->bio.bi_opf |= REQ_FUA;
 
-               bch2_submit_wbio(c, n, ca, ptr);
+               if (likely(percpu_ref_tryget(&ca->io_ref))) {
+                       this_cpu_add(ca->io_done->sectors[WRITE][type],
+                                    bio_sectors(&n->bio));
+
+                       n->have_io_ref          = true;
+                       n->bio.bi_bdev          = ca->disk_sb.bdev;
+                       submit_bio(&n->bio);
+               } else {
+                       n->have_io_ref          = false;
+                       bcache_io_error(c, &n->bio, "device has been removed");
+                       bio_endio(&n->bio);
+               }
        }
 }
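
The hunk above fans a single write out to every replica pointer: each pointer but the last gets a bio_clone_fast() clone whose completion is chained back to the parent via bio_inc_remaining(), so the parent's end_io only runs once every per-device submission has finished (or been failed with bcache_io_error()). The standalone C sketch below models just that reference-counting pattern in userspace; fake_bio, submit_replicas() and the collapsed submit/complete step are invented for illustration and are not bcachefs or block-layer code.

/*
 * Simplified userspace model of the replica fan-out in
 * bch2_submit_wbio_replicas(): one extra reference is taken on the
 * parent per clone (bio_inc_remaining() in the kernel), and the
 * parent's end_io runs only after the last completion. Hypothetical
 * names throughout; submission and device completion are collapsed
 * into one step for brevity.
 */
#include <stdatomic.h>
#include <stdio.h>

struct fake_bio {
	atomic_int	remaining;	/* models bio->__bi_remaining */
	void		(*end_io)(struct fake_bio *);
};

static void parent_end_io(struct fake_bio *bio)
{
	printf("all replicas written, completing parent\n");
}

/* models bio_endio(): only the last completion runs end_io */
static void fake_bio_endio(struct fake_bio *bio)
{
	if (atomic_fetch_sub(&bio->remaining, 1) == 1)
		bio->end_io(bio);
}

static void submit_replicas(struct fake_bio *parent, unsigned nr_replicas)
{
	unsigned i;

	for (i = 0; i < nr_replicas; i++) {
		/*
		 * every pointer but the last gets a clone, and each
		 * clone holds a reference on the parent:
		 */
		if (i + 1 < nr_replicas)
			atomic_fetch_add(&parent->remaining, 1);

		printf("submitting replica %u\n", i);
		fake_bio_endio(parent);	/* pretend the device completed it */
	}
}

int main(void)
{
	struct fake_bio parent = {
		.remaining	= 1,
		.end_io		= parent_end_io,
	};

	submit_replicas(&parent, 3);
	return 0;
}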
 
@@ -188,8 +182,9 @@ static int bch2_write_index_default(struct bch_write_op *op)
        struct btree_iter iter;
        int ret;
 
-       bch2_btree_iter_init_intent(&iter, op->c, BTREE_ID_EXTENTS,
-               bkey_start_pos(&bch2_keylist_front(keys)->k));
+       bch2_btree_iter_init(&iter, op->c, BTREE_ID_EXTENTS,
+                            bkey_start_pos(&bch2_keylist_front(keys)->k),
+                            BTREE_ITER_INTENT);
 
        ret = bch2_btree_insert_list_at(&iter, keys, &op->res,
                                       NULL, op_journal_seq(op),
@@ -246,99 +241,68 @@ static void bch2_write_index(struct closure *cl)
        }
 }
 
-/**
- * bch_write_discard - discard range of keys
- *
- * Used to implement discard, and to handle when writethrough write hits
- * a write error on the cache device.
- */
-static void bch2_write_discard(struct closure *cl)
-{
-       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
-       struct bio *bio = &op->bio->bio;
-       struct bpos end = op->pos;
-
-       end.offset += bio_sectors(bio);
-
-       op->error = bch2_discard(op->c, op->pos, end, op->version,
-                               &op->res, NULL, NULL);
-}
-
-/*
- * Convert extents to be inserted to discards after an error:
- */
 static void bch2_write_io_error(struct closure *cl)
 {
        struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+       struct keylist *keys = &op->insert_keys;
+       struct bch_fs *c = op->c;
+       struct bch_extent_ptr *ptr;
+       struct bkey_i *k;
+       int ret;
 
-       if (op->flags & BCH_WRITE_DISCARD_ON_ERROR) {
-               struct bkey_i *src = bch2_keylist_front(&op->insert_keys);
-               struct bkey_i *dst = bch2_keylist_front(&op->insert_keys);
-
-               /*
-                * Our data write just errored, which means we've got a bunch
-                * of keys to insert that point to data that wasn't
-                * successfully written.
-                *
-                * We don't have to insert those keys but we still have to
-                * invalidate that region of the cache - so, if we just strip
-                * off all the pointers from the keys we'll accomplish just
-                * that.
-                */
+       for_each_keylist_key(keys, k) {
+               struct bkey_i *n = bkey_next(k);
+               struct bkey_s_extent e = bkey_i_to_s_extent(k);
 
-               while (src != op->insert_keys.top) {
-                       struct bkey_i *n = bkey_next(src);
+               extent_for_each_ptr_backwards(e, ptr)
+                       if (test_bit(ptr->dev, op->failed.d))
+                               bch2_extent_drop_ptr(e, ptr);
 
-                       set_bkey_val_u64s(&src->k, 0);
-                       src->k.type = KEY_TYPE_DISCARD;
-                       bkey_copy(dst, src);
+               memmove(bkey_next(k), n, (void *) keys->top - (void *) n);
+               keys->top_p -= (u64 *) n - (u64 *) bkey_next(k);
 
-                       dst = bkey_next(dst);
-                       src = n;
+               ret = bch2_extent_nr_ptrs(e.c)
+                       ? bch2_check_mark_super(c, e.c, BCH_DATA_USER)
+                       : -EIO;
+               if (ret) {
+                       keys->top = keys->keys;
+                       op->error = ret;
+                       op->flags |= BCH_WRITE_DONE;
+                       break;
                }
-
-               op->insert_keys.top = dst;
-               op->flags |= BCH_WRITE_DISCARD;
-       } else {
-               /* TODO: We could try to recover from this. */
-               while (!bch2_keylist_empty(&op->insert_keys))
-                       bch2_keylist_pop_front(&op->insert_keys);
-
-               op->error = -EIO;
-               op->flags |= BCH_WRITE_DONE;
        }
 
+       memset(&op->failed, 0, sizeof(op->failed));
+
        bch2_write_index(cl);
+       return;
 }
 
 static void bch2_write_endio(struct bio *bio)
 {
-       struct closure *cl = bio->bi_private;
-       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
-       struct bch_write_bio *wbio = to_wbio(bio);
-       struct bch_fs *c = wbio->c;
-       struct bio *orig = wbio->orig;
-       struct bch_dev *ca = wbio->ca;
-
-       if (bch2_dev_nonfatal_io_err_on(bio->bi_error, ca,
-                                      "data write")) {
+       struct closure *cl              = bio->bi_private;
+       struct bch_write_op *op         = container_of(cl, struct bch_write_op, cl);
+       struct bch_write_bio *wbio      = to_wbio(bio);
+       struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
+       struct bch_fs *c                = wbio->c;
+       struct bch_dev *ca              = wbio->ca;
+
+       if (bch2_dev_io_err_on(bio->bi_error, ca, "data write")) {
+               set_bit(ca->dev_idx, op->failed.d);
                set_closure_fn(cl, bch2_write_io_error, index_update_wq(op));
        }
 
-       if (ca)
+       if (wbio->have_io_ref)
                percpu_ref_put(&ca->io_ref);
 
-       if (bio->bi_error && orig)
-               orig->bi_error = bio->bi_error;
-
        if (wbio->bounce)
                bch2_bio_free_pages_pool(c, bio);
 
        if (wbio->put_bio)
                bio_put(bio);
 
-       if (orig)
-               bio_endio(orig);
+       if (parent)
+               bio_endio(&parent->bio);
        else
                closure_put(cl);
 }
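
In the rewritten error path above, bch2_write_endio() records the failing device in op->failed and redirects the index update to bch2_write_io_error(), which walks the keylist, drops extent pointers that landed on failed devices, and memmove()s the rest of the packed list down over the space freed inside each shrunken key. The sketch below models only that compaction step on a toy packed key format ([length][payload...] in u64s); fake_keylist, key_next() and key_shrink() are made-up names, not bcachefs APIs.

/*
 * Minimal model (not bcachefs code) of the keylist compaction in
 * bch2_write_io_error(): when a key loses pointers it shrinks in
 * place, and the tail of the packed list is memmove()d down.
 */
#include <stdio.h>
#include <string.h>

typedef unsigned long long u64;

struct fake_keylist {
	u64	buf[32];
	u64	*top;		/* one past the last used u64 */
};

/* each fake key is [nr_payload_u64s][payload...]; return the next key */
static u64 *key_next(u64 *k)
{
	return k + 1 + k[0];
}

/* drop @drop u64s of payload from key @k, compacting the list */
static void key_shrink(struct fake_keylist *l, u64 *k, u64 drop)
{
	u64 *old_next = key_next(k);

	k[0] -= drop;			/* key is now smaller */
	memmove(key_next(k), old_next,
		(l->top - old_next) * sizeof(u64));
	l->top -= drop;
}

int main(void)
{
	struct fake_keylist l;
	u64 *k;

	/* two keys: 3 payload u64s, then 2 payload u64s */
	u64 init[] = { 3, 10, 11, 12, 2, 20, 21 };
	memcpy(l.buf, init, sizeof(init));
	l.top = l.buf + 7;

	key_shrink(&l, l.buf, 1);	/* drop one "pointer" from key 0 */

	for (k = l.buf; k < l.top; k = key_next(k))
		printf("key with %llu payload u64s\n", k[0]);
	return 0;
}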
@@ -386,11 +350,10 @@ static void init_append_extent(struct bch_write_op *op,
        bch2_keylist_push(&op->insert_keys);
 }
 
-static int bch2_write_extent(struct bch_write_op *op,
-                           struct open_bucket *ob,
-                           struct bio *orig)
+static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
 {
        struct bch_fs *c = op->c;
+       struct bio *orig = &op->wbio.bio;
        struct bio *bio;
        struct bch_write_bio *wbio;
        unsigned key_to_write_offset = op->insert_keys.top_p -
@@ -398,15 +361,17 @@ static int bch2_write_extent(struct bch_write_op *op,
        struct bkey_i *key_to_write;
        unsigned csum_type = op->csum_type;
        unsigned compression_type = op->compression_type;
-       int ret;
+       int ret, more;
 
        /* don't refetch csum type/compression type */
        barrier();
 
+       BUG_ON(!bio_sectors(orig));
+
        /* Need to decompress data? */
        if ((op->flags & BCH_WRITE_DATA_COMPRESSED) &&
            (crc_uncompressed_size(NULL, &op->crc) != op->size ||
-            crc_compressed_size(NULL, &op->crc) > ob->sectors_free)) {
+            crc_compressed_size(NULL, &op->crc) > wp->sectors_free)) {
                int ret;
 
                ret = bch2_bio_uncompress_inplace(c, orig, op->size, op->crc);
@@ -424,19 +389,16 @@ static int bch2_write_extent(struct bch_write_op *op,
                                   op->crc.nonce,
                                   op->crc.csum,
                                   op->crc.csum_type,
-                                  ob);
+                                  wp->ob);
 
                bio                     = orig;
-               wbio                    = to_wbio(bio);
-               wbio->orig              = NULL;
-               wbio->bounce            = false;
-               wbio->put_bio           = false;
-               ret                     = 0;
+               wbio                    = wbio_init(bio);
+               more                    = 0;
        } else if (csum_type != BCH_CSUM_NONE ||
                   compression_type != BCH_COMPRESSION_NONE) {
                /* all units here in bytes */
                unsigned total_output = 0, output_available =
-                       min(ob->sectors_free << 9, orig->bi_iter.bi_size);
+                       min(wp->sectors_free << 9, orig->bi_iter.bi_size);
                unsigned crc_nonce = bch2_csum_type_is_encryption(csum_type)
                        ? op->nonce : 0;
                struct bch_csum csum;
@@ -445,19 +407,18 @@ static int bch2_write_extent(struct bch_write_op *op,
                bio = bio_alloc_bioset(GFP_NOIO,
                                       DIV_ROUND_UP(output_available, PAGE_SIZE),
                                       &c->bio_write);
+               wbio                    = wbio_init(bio);
+               wbio->bounce            = true;
+               wbio->put_bio           = true;
+               /* copy WRITE_SYNC flag */
+               wbio->bio.bi_opf        = orig->bi_opf;
+
                /*
                 * XXX: can't use mempool for more than
                 * BCH_COMPRESSED_EXTENT_MAX worth of pages
                 */
                bch2_bio_alloc_pages_pool(c, bio, output_available);
 
-               /* copy WRITE_SYNC flag */
-               bio->bi_opf             = orig->bi_opf;
-               wbio                    = to_wbio(bio);
-               wbio->orig              = NULL;
-               wbio->bounce            = true;
-               wbio->put_bio           = true;
-
                do {
                        unsigned fragment_compression_type = compression_type;
                        size_t dst_len, src_len;
@@ -466,17 +427,12 @@ static int bch2_write_extent(struct bch_write_op *op,
                                         orig, &src_len,
                                         &fragment_compression_type);
 
-                       BUG_ON(!dst_len || dst_len > bio->bi_iter.bi_size);
-                       BUG_ON(!src_len || src_len > orig->bi_iter.bi_size);
-                       BUG_ON(dst_len & (block_bytes(c) - 1));
-                       BUG_ON(src_len & (block_bytes(c) - 1));
-
-                       swap(bio->bi_iter.bi_size, dst_len);
                        nonce = extent_nonce(op->version,
                                             crc_nonce,
                                             src_len >> 9,
-                                            fragment_compression_type),
+                                            fragment_compression_type);
 
+                       swap(bio->bi_iter.bi_size, dst_len);
                        bch2_encrypt_bio(c, csum_type, nonce, bio);
 
                        csum = bch2_checksum_bio(c, csum_type, nonce, bio);
@@ -485,7 +441,7 @@ static int bch2_write_extent(struct bch_write_op *op,
                        init_append_extent(op,
                                           dst_len >> 9, src_len >> 9,
                                           fragment_compression_type,
-                                          crc_nonce, csum, csum_type, ob);
+                                          crc_nonce, csum, csum_type, wp->ob);
 
                        total_output += dst_len;
                        bio_advance(bio, dst_len);
@@ -510,67 +466,50 @@ static int bch2_write_extent(struct bch_write_op *op,
                        mempool_free(bio->bi_io_vec[--bio->bi_vcnt].bv_page,
                                     &c->bio_bounce_pages);
 
-               ret = orig->bi_iter.bi_size != 0;
+               more = orig->bi_iter.bi_size != 0;
        } else {
-               bio = bio_next_split(orig, ob->sectors_free, GFP_NOIO,
+               bio = bio_next_split(orig, wp->sectors_free, GFP_NOIO,
                                     &c->bio_write);
-
-               wbio                    = to_wbio(bio);
-               wbio->orig              = NULL;
-               wbio->bounce            = false;
+               wbio                    = wbio_init(bio);
                wbio->put_bio           = bio != orig;
 
                init_append_extent(op, bio_sectors(bio), bio_sectors(bio),
                                   compression_type, 0,
-                                  (struct bch_csum) { 0 }, csum_type, ob);
+                                  (struct bch_csum) { 0 }, csum_type, wp->ob);
 
-               ret = bio != orig;
+               more = bio != orig;
        }
 
+       /* might have done a realloc... */
+
+       key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
+
+       ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write),
+                                   BCH_DATA_USER);
+       if (ret)
+               return ret;
+
        bio->bi_end_io  = bch2_write_endio;
        bio->bi_private = &op->cl;
        bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 
        closure_get(bio->bi_private);
 
-       /* might have done a realloc... */
-
-       key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
-
-       bch2_check_mark_super(c, key_to_write, false);
-
-       bch2_submit_wbio_replicas(to_wbio(bio), c, key_to_write);
-       return ret;
+       bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER,
+                                 key_to_write);
+       return more;
 }
 
 static void __bch2_write(struct closure *cl)
 {
        struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
        struct bch_fs *c = op->c;
-       struct bio *bio = &op->bio->bio;
        unsigned open_bucket_nr = 0;
-       struct open_bucket *b;
+       struct write_point *wp;
+       struct open_bucket *ob;
        int ret;
 
-       memset(op->open_buckets, 0, sizeof(op->open_buckets));
-
-       if (op->flags & BCH_WRITE_DISCARD) {
-               op->flags |= BCH_WRITE_DONE;
-               bch2_write_discard(cl);
-               bio_put(bio);
-               continue_at(cl, bch2_write_done, index_update_wq(op));
-       }
-
-       /*
-        * Journal writes are marked REQ_PREFLUSH; if the original write was a
-        * flush, it'll wait on the journal write.
-        */
-       bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA);
-
        do {
-               EBUG_ON(bio->bi_iter.bi_sector != op->pos.offset);
-               EBUG_ON(!bio_sectors(bio));
-
                if (open_bucket_nr == ARRAY_SIZE(op->open_buckets))
                        continue_at(cl, bch2_write_index, index_update_wq(op));
 
@@ -581,16 +520,19 @@ static void __bch2_write(struct closure *cl)
                                        BKEY_EXTENT_U64s_MAX))
                        continue_at(cl, bch2_write_index, index_update_wq(op));
 
-               b = bch2_alloc_sectors_start(c, op->wp,
+               wp = bch2_alloc_sectors_start(c, BCH_DATA_USER,
+                       op->devs,
+                       op->write_point,
                        op->nr_replicas,
                        c->opts.data_replicas_required,
                        op->alloc_reserve,
+                       op->flags,
                        (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
-               EBUG_ON(!b);
+               EBUG_ON(!wp);
 
-               if (unlikely(IS_ERR(b))) {
-                       if (unlikely(PTR_ERR(b) != -EAGAIN)) {
-                               ret = PTR_ERR(b);
+               if (unlikely(IS_ERR(wp))) {
+                       if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
+                               ret = PTR_ERR(wp);
                                goto err;
                        }
 
@@ -623,13 +565,15 @@ static void __bch2_write(struct closure *cl)
                        continue;
                }
 
-               BUG_ON(b - c->open_buckets == 0 ||
-                      b - c->open_buckets > U8_MAX);
-               op->open_buckets[open_bucket_nr++] = b - c->open_buckets;
+               ob = wp->ob;
+
+               BUG_ON(ob - c->open_buckets == 0 ||
+                      ob - c->open_buckets > U8_MAX);
+               op->open_buckets[open_bucket_nr++] = ob - c->open_buckets;
 
-               ret = bch2_write_extent(op, b, bio);
+               ret = bch2_write_extent(op, wp);
 
-               bch2_alloc_sectors_done(c, op->wp, b);
+               bch2_alloc_sectors_done(c, wp);
 
                if (ret < 0)
                        goto err;
@@ -638,27 +582,15 @@ static void __bch2_write(struct closure *cl)
        op->flags |= BCH_WRITE_DONE;
        continue_at(cl, bch2_write_index, index_update_wq(op));
 err:
-       if (op->flags & BCH_WRITE_DISCARD_ON_ERROR) {
-               /*
-                * If we were writing cached data, not doing the write is fine
-                * so long as we discard whatever would have been overwritten -
-                * then it's equivalent to doing the write and immediately
-                * reclaiming it.
-                */
-
-               bch2_write_discard(cl);
-       } else {
-               /*
-                * Right now we can only error here if we went RO - the
-                * allocation failed, but we already checked for -ENOSPC when we
-                * got our reservation.
-                *
-                * XXX capacity might have changed, but we don't check for that
-                * yet:
-                */
-               op->error = ret;
-       }
-
+       /*
+        * Right now we can only error here if we went RO - the
+        * allocation failed, but we already checked for -ENOSPC when we
+        * got our reservation.
+        *
+        * XXX capacity might have changed, but we don't check for that
+        * yet:
+        */
+       op->error = ret;
        op->flags |= BCH_WRITE_DONE;
 
        /*
@@ -708,16 +640,13 @@ void bch2_wake_delayed_writes(unsigned long data)
  * after the data is written it calls bch_journal, and after the keys have been
  * added to the next journal write they're inserted into the btree.
  *
- * It inserts the data in op->bio; bi_sector is used for the key offset, and
- * op->inode is used for the key inode.
- *
  * If op->discard is true, instead of inserting the data it invalidates the
  * region of the cache represented by op->bio and op->inode.
  */
 void bch2_write(struct closure *cl)
 {
        struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
-       struct bio *bio = &op->bio->bio;
+       struct bio *bio = &op->wbio.bio;
        struct bch_fs *c = op->c;
        u64 inode = op->pos.inode;
 
@@ -734,20 +663,19 @@ void bch2_write(struct closure *cl)
                op->version.lo =
                        atomic64_inc_return(&c->key_version) + 1;
 
-       if (!(op->flags & BCH_WRITE_DISCARD))
-               bch2_increment_clock(c, bio_sectors(bio), WRITE);
+       bch2_increment_clock(c, bio_sectors(bio), WRITE);
 
        /* Don't call bch2_next_delay() if rate is >= 1 GB/sec */
 
-       if (c->foreground_write_ratelimit_enabled &&
-           c->foreground_write_pd.rate.rate < (1 << 30) &&
-           !(op->flags & BCH_WRITE_DISCARD) && op->wp->throttle) {
+       if ((op->flags & BCH_WRITE_THROTTLE) &&
+           c->foreground_write_ratelimit_enabled &&
+           c->foreground_write_pd.rate.rate < (1 << 30)) {
                unsigned long flags;
                u64 delay;
 
                spin_lock_irqsave(&c->foreground_write_pd_lock, flags);
                bch2_ratelimit_increment(&c->foreground_write_pd.rate,
-                                       bio->bi_iter.bi_size);
+                                        bio->bi_iter.bi_size);
 
                delay = bch2_ratelimit_delay(&c->foreground_write_pd.rate);
 
@@ -781,27 +709,30 @@ void bch2_write(struct closure *cl)
 }
 
 void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
-                      struct bch_write_bio *bio, struct disk_reservation res,
-                      struct write_point *wp, struct bpos pos,
-                      u64 *journal_seq, unsigned flags)
+                       struct disk_reservation res,
+                       struct bch_devs_mask *devs,
+                       unsigned long write_point,
+                       struct bpos pos,
+                       u64 *journal_seq, unsigned flags)
 {
        EBUG_ON(res.sectors && !res.nr_replicas);
 
        op->c           = c;
        op->io_wq       = index_update_wq(op);
-       op->bio         = bio;
        op->written     = 0;
        op->error       = 0;
        op->flags       = flags;
        op->csum_type   = bch2_data_checksum_type(c);
-       op->compression_type = c->opts.compression;
+       op->compression_type =
+               bch2_compression_opt_to_type(c->opts.compression);
        op->nr_replicas = res.nr_replicas;
        op->alloc_reserve = RESERVE_NONE;
        op->nonce       = 0;
        op->pos         = pos;
        op->version     = ZERO_VERSION;
        op->res         = res;
-       op->wp          = wp;
+       op->devs        = devs;
+       op->write_point = write_point;
 
        if (journal_seq) {
                op->journal_seq_p = journal_seq;
@@ -812,6 +743,9 @@ void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
 
        op->index_update_fn = bch2_write_index_default;
 
+       memset(op->open_buckets, 0, sizeof(op->open_buckets));
+       memset(&op->failed, 0, sizeof(op->failed));
+
        bch2_keylist_init(&op->insert_keys,
                          op->inline_keys,
                          ARRAY_SIZE(op->inline_keys));
@@ -820,53 +754,230 @@ void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
                get_random_bytes(&op->version, sizeof(op->version));
 }
 
-/* Discard */
-
-/* bch_discard - discard a range of keys from start_key to end_key.
- * @c          filesystem
- * @start_key  pointer to start location
- *             NOTE: discard starts at bkey_start_offset(start_key)
- * @end_key    pointer to end location
- *             NOTE: discard ends at KEY_OFFSET(end_key)
- * @version    version of discard (0ULL if none)
- *
- * Returns:
- *      0 on success
- *     <0 on error
- *
- * XXX: this needs to be refactored with inode_truncate, or more
- *     appropriately inode_truncate should call this
- */
-int bch2_discard(struct bch_fs *c, struct bpos start,
-                struct bpos end, struct bversion version,
-                struct disk_reservation *disk_res,
-                struct extent_insert_hook *hook,
-                u64 *journal_seq)
-{
-       return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, start, end, version,
-                                     disk_res, hook, journal_seq);
-}
-
 /* Cache promotion on read */
 
-struct cache_promote_op {
+struct promote_op {
        struct closure          cl;
        struct migrate_write    write;
        struct bio_vec          bi_inline_vecs[0]; /* must be last */
 };
 
+static void promote_done(struct closure *cl)
+{
+       struct promote_op *op =
+               container_of(cl, struct promote_op, cl);
+       struct bch_fs *c = op->write.op.c;
+
+       percpu_ref_put(&c->writes);
+       bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
+       kfree(op);
+}
+
+static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
+{
+       struct bch_fs *c = rbio->c;
+       struct closure *cl = &op->cl;
+       struct bio *bio = &op->write.op.wbio.bio;
+
+       BUG_ON(!rbio->split || !rbio->bounce);
+
+       if (!percpu_ref_tryget(&c->writes))
+               return;
+
+       trace_promote(&rbio->bio);
+
+       /* we now own pages: */
+       swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
+       memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
+              sizeof(struct bio_vec) * bio->bi_vcnt);
+       rbio->promote = NULL;
+
+       closure_init(cl, NULL);
+       closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
+       closure_return_with_destructor(cl, promote_done);
+}
+
+/*
+ * XXX: multiple promotes can race with each other, wastefully. Keep a list of
+ * outstanding promotes?
+ */
+static struct promote_op *promote_alloc(struct bch_fs *c,
+                                       struct bvec_iter iter,
+                                       struct bkey_s_c k,
+                                       struct extent_pick_ptr *pick,
+                                       bool read_full)
+{
+       struct promote_op *op;
+       struct bio *bio;
+       /*
+        * biovec needs to be big enough to hold decompressed data, if
+        * bch2_write_extent() has to decompress/recompress it:
+        */
+       unsigned sectors = max_t(unsigned, k.k->size,
+                     crc_uncompressed_size(NULL, &pick->crc));
+       unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
+
+       op = kmalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
+       if (!op)
+               return NULL;
+
+       bio = &op->write.op.wbio.bio;
+       bio_init(bio, bio->bi_inline_vecs, pages);
+
+       bio->bi_iter = iter;
+
+       if (pick->crc.compression_type) {
+               op->write.op.flags     |= BCH_WRITE_DATA_COMPRESSED;
+               op->write.op.crc        = pick->crc;
+               op->write.op.size       = k.k->size;
+       } else if (read_full) {
+               /*
+                * Adjust bio to correspond to _live_ portion of @k -
+                * which might be less than what we're actually reading:
+                */
+               bio->bi_iter.bi_size = sectors << 9;
+               bio_advance(bio, pick->crc.offset << 9);
+               BUG_ON(bio_sectors(bio) < k.k->size);
+               bio->bi_iter.bi_size = k.k->size << 9;
+       } else {
+               /*
+                * Set insert pos to correspond to what we're actually
+                * reading:
+                */
+               op->write.op.pos.offset = iter.bi_sector;
+       }
+       bch2_migrate_write_init(c, &op->write,
+                               c->fastest_devs,
+                               k, NULL,
+                               BCH_WRITE_ALLOC_NOWAIT|
+                               BCH_WRITE_CACHED);
+       op->write.promote = true;
+
+       return op;
+}
+
+/* only promote if we're not reading from the fastest tier: */
+static bool should_promote(struct bch_fs *c,
+                          struct extent_pick_ptr *pick, unsigned flags)
+{
+       if (!(flags & BCH_READ_MAY_PROMOTE))
+               return false;
+
+       if (flags & BCH_READ_IN_RETRY)
+               return false;
+
+       if (percpu_ref_is_dying(&c->writes))
+               return false;
+
+       return c->fastest_tier &&
+               c->fastest_tier < c->tiers + pick->ca->mi.tier;
+}
+
 /* Read */
 
-static int bio_checksum_uncompress(struct bch_fs *c,
-                                  struct bch_read_bio *rbio)
+#define READ_RETRY_AVOID       1
+#define READ_RETRY             2
+#define READ_ERR               3
+
+static inline struct bch_read_bio *
+bch2_rbio_parent(struct bch_read_bio *rbio)
+{
+       return rbio->split ? rbio->parent : rbio;
+}
+
+__always_inline
+static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
+                          struct workqueue_struct *wq)
+{
+       if (!wq || rbio->process_context) {
+               fn(&rbio->work);
+       } else {
+               rbio->work.func         = fn;
+               rbio->process_context   = true;
+               queue_work(wq, &rbio->work);
+       }
+}
+
+static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
 {
+       struct bch_read_bio *parent = rbio->parent;
+
+       BUG_ON(!rbio->split);
+
+       if (rbio->promote)
+               kfree(rbio->promote);
+       if (rbio->bounce)
+               bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
+       bio_put(&rbio->bio);
+
+       return parent;
+}
+
+static void bch2_rbio_done(struct bch_read_bio *rbio)
+{
+       if (rbio->promote)
+               kfree(rbio->promote);
+       rbio->promote = NULL;
+
+       if (rbio->split)
+               rbio = bch2_rbio_free(rbio);
+       bio_endio(&rbio->bio);
+}
+
+static void bch2_rbio_retry(struct work_struct *work)
+{
+       struct bch_read_bio *rbio =
+               container_of(work, struct bch_read_bio, work);
+       struct bch_fs *c                = rbio->c;
+       struct bvec_iter iter           = rbio->bvec_iter;
+       unsigned flags                  = rbio->flags;
+       u64 inode                       = rbio->inode;
+       struct bch_devs_mask avoid;
+
+       trace_read_retry(&rbio->bio);
+
+       memset(&avoid, 0, sizeof(avoid));
+
+       if (rbio->retry == READ_RETRY_AVOID)
+               __set_bit(rbio->pick.ca->dev_idx, avoid.d);
+
+       if (rbio->split)
+               rbio = bch2_rbio_free(rbio);
+       else
+               rbio->bio.bi_error = 0;
+
+       flags |= BCH_READ_MUST_CLONE;
+       flags |= BCH_READ_IN_RETRY;
+
+       __bch2_read(c, rbio, iter, inode, &avoid, flags);
+}
+
+static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error)
+{
+       rbio->retry = retry;
+
+       if (rbio->flags & BCH_READ_IN_RETRY)
+               return;
+
+       if (retry == READ_ERR) {
+               bch2_rbio_parent(rbio)->bio.bi_error = error;
+               bch2_rbio_done(rbio);
+       } else {
+               bch2_rbio_punt(rbio, bch2_rbio_retry, rbio->c->wq);
+       }
+}
+
+static int bch2_rbio_checksum_uncompress(struct bio *dst,
+                                        struct bch_read_bio *rbio)
+{
+       struct bch_fs *c = rbio->c;
        struct bio *src = &rbio->bio;
-       struct bio *dst = &bch2_rbio_parent(rbio)->bio;
-       struct bvec_iter dst_iter = rbio->parent_iter;
+       struct bvec_iter dst_iter = rbio->bvec_iter;
        struct nonce nonce = extent_nonce(rbio->version,
-                               rbio->crc.nonce,
-                               crc_uncompressed_size(NULL, &rbio->crc),
-                               rbio->crc.compression_type);
+                               rbio->pick.crc.nonce,
+                               crc_uncompressed_size(NULL, &rbio->pick.crc),
+                               rbio->pick.crc.compression_type);
        struct bch_csum csum;
        int ret = 0;
 
@@ -877,130 +988,64 @@ static int bio_checksum_uncompress(struct bch_fs *c,
         * in order to promote
         */
        if (rbio->bounce) {
-               src->bi_iter.bi_size    = crc_compressed_size(NULL, &rbio->crc) << 9;
+               src->bi_iter.bi_size    = crc_compressed_size(NULL, &rbio->pick.crc) << 9;
                src->bi_iter.bi_idx     = 0;
                src->bi_iter.bi_bvec_done = 0;
        } else {
-               src->bi_iter = rbio->parent_iter;
+               src->bi_iter = rbio->bvec_iter;
        }
 
-       csum = bch2_checksum_bio(c, rbio->crc.csum_type, nonce, src);
-       if (bch2_dev_nonfatal_io_err_on(bch2_crc_cmp(rbio->crc.csum, csum),
-                                       rbio->ca,
+       csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type, nonce, src);
+       if (bch2_dev_io_err_on(bch2_crc_cmp(rbio->pick.crc.csum, csum),
+                              rbio->pick.ca,
                        "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)",
-                       rbio->inode, (u64) rbio->parent_iter.bi_sector << 9,
-                       rbio->crc.csum.hi, rbio->crc.csum.lo, csum.hi, csum.lo,
-                       rbio->crc.csum_type))
+                       rbio->inode, (u64) rbio->bvec_iter.bi_sector << 9,
+                       rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
+                       csum.hi, csum.lo,
+                       rbio->pick.crc.csum_type))
                ret = -EIO;
 
        /*
         * If there was a checksum error, still copy the data back - unless it
         * was compressed, we don't want to decompress bad data:
         */
-       if (rbio->crc.compression_type != BCH_COMPRESSION_NONE) {
+       if (rbio->pick.crc.compression_type != BCH_COMPRESSION_NONE) {
                if (!ret) {
-                       bch2_encrypt_bio(c, rbio->crc.csum_type, nonce, src);
+                       bch2_encrypt_bio(c, rbio->pick.crc.csum_type, nonce, src);
                        ret = bch2_bio_uncompress(c, src, dst,
-                                                dst_iter, rbio->crc);
+                                                dst_iter, rbio->pick.crc);
                        if (ret)
                                __bcache_io_error(c, "decompression error");
                }
        } else if (rbio->bounce) {
-               bio_advance(src, rbio->crc.offset << 9);
+               bio_advance(src, rbio->pick.crc.offset << 9);
 
                /* don't need to decrypt the entire bio: */
                BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
                src->bi_iter.bi_size = dst_iter.bi_size;
 
-               nonce = nonce_add(nonce, rbio->crc.offset << 9);
+               nonce = nonce_add(nonce, rbio->pick.crc.offset << 9);
 
-               bch2_encrypt_bio(c, rbio->crc.csum_type,
+               bch2_encrypt_bio(c, rbio->pick.crc.csum_type,
                                nonce, src);
 
-               bio_copy_data_iter(dst, dst_iter,
-                                  src, src->bi_iter);
+               bio_copy_data_iter(dst, &dst_iter,
+                                  src, &src->bi_iter);
        } else {
-               bch2_encrypt_bio(c, rbio->crc.csum_type, nonce, src);
+               bch2_encrypt_bio(c, rbio->pick.crc.csum_type, nonce, src);
        }
 
        return ret;
 }
 
-static void bch2_rbio_free(struct bch_read_bio *rbio)
-{
-       struct bch_fs *c = rbio->c;
-       struct bio *bio = &rbio->bio;
-
-       BUG_ON(rbio->ca);
-       BUG_ON(!rbio->split);
-
-       if (rbio->promote)
-               kfree(rbio->promote);
-       if (rbio->bounce)
-               bch2_bio_free_pages_pool(c, bio);
-
-       bio_put(bio);
-}
-
-static void bch2_rbio_done(struct bch_read_bio *rbio)
-{
-       struct bio *orig = &bch2_rbio_parent(rbio)->bio;
-
-       percpu_ref_put(&rbio->ca->io_ref);
-       rbio->ca = NULL;
-
-       if (rbio->split) {
-               if (rbio->bio.bi_error)
-                       orig->bi_error = rbio->bio.bi_error;
-
-               bio_endio(orig);
-               bch2_rbio_free(rbio);
-       } else {
-               if (rbio->promote)
-                       kfree(rbio->promote);
-
-               orig->bi_end_io = rbio->orig_bi_end_io;
-               bio_endio_nodec(orig);
-       }
-}
-
-static void bch2_rbio_error(struct bch_read_bio *rbio, int error)
-{
-       bch2_rbio_parent(rbio)->bio.bi_error = error;
-       bch2_rbio_done(rbio);
-}
-
-static void bch2_rbio_retry(struct bch_fs *c, struct bch_read_bio *rbio)
-{
-       unsigned long flags;
-
-       percpu_ref_put(&rbio->ca->io_ref);
-       rbio->ca = NULL;
-
-       spin_lock_irqsave(&c->read_retry_lock, flags);
-       bio_list_add(&c->read_retry_list, &rbio->bio);
-       spin_unlock_irqrestore(&c->read_retry_lock, flags);
-       queue_work(c->wq, &c->read_retry_work);
-}
-
-static void cache_promote_done(struct closure *cl)
-{
-       struct cache_promote_op *op =
-               container_of(cl, struct cache_promote_op, cl);
-
-       bch2_bio_free_pages_pool(op->write.op.c, &op->write.wbio.bio);
-       kfree(op);
-}
-
 /* Inner part that may run in process context */
 static void __bch2_read_endio(struct work_struct *work)
 {
        struct bch_read_bio *rbio =
                container_of(work, struct bch_read_bio, work);
-       struct bch_fs *c = rbio->c;
        int ret;
 
-       ret = bio_checksum_uncompress(c, rbio);
+       ret = bch2_rbio_checksum_uncompress(&bch2_rbio_parent(rbio)->bio, rbio);
        if (ret) {
                /*
                 * Checksum error: if the bio wasn't bounced, we may have been
@@ -1008,34 +1053,19 @@ static void __bch2_read_endio(struct work_struct *work)
                 * scribble over) - retry the read, bouncing it this time:
                 */
                if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
-                       rbio->flags |= BCH_READ_FORCE_BOUNCE;
-                       bch2_rbio_retry(c, rbio);
+                       rbio->flags |= BCH_READ_MUST_BOUNCE;
+                       bch2_rbio_error(rbio, READ_RETRY, ret);
                } else {
-                       bch2_rbio_error(rbio, -EIO);
+                       bch2_rbio_error(rbio, READ_RETRY_AVOID, ret);
                }
                return;
        }
 
-       if (rbio->promote) {
-               struct cache_promote_op *promote = rbio->promote;
-               struct closure *cl = &promote->cl;
-
-               BUG_ON(!rbio->split || !rbio->bounce);
-
-               trace_promote(&rbio->bio);
-
-               /* we now own pages: */
-               swap(promote->write.wbio.bio.bi_vcnt, rbio->bio.bi_vcnt);
-               rbio->promote = NULL;
-
-               bch2_rbio_done(rbio);
+       if (rbio->promote)
+               promote_start(rbio->promote, rbio);
 
-               closure_init(cl, &c->cl);
-               closure_call(&promote->write.op.cl, bch2_write, c->wq, cl);
-               closure_return_with_destructor(cl, cache_promote_done);
-       } else {
+       if (likely(!(rbio->flags & BCH_READ_IN_RETRY)))
                bch2_rbio_done(rbio);
-       }
 }
 
 static void bch2_read_endio(struct bio *bio)
@@ -1043,90 +1073,55 @@ static void bch2_read_endio(struct bio *bio)
        struct bch_read_bio *rbio =
                container_of(bio, struct bch_read_bio, bio);
        struct bch_fs *c = rbio->c;
+       struct workqueue_struct *wq = NULL;
+
+       percpu_ref_put(&rbio->pick.ca->io_ref);
 
-       if (bch2_dev_nonfatal_io_err_on(bio->bi_error, rbio->ca, "data read")) {
-               /* XXX: retry IO errors when we have another replica */
-               bch2_rbio_error(rbio, bio->bi_error);
+       if (!rbio->split)
+               rbio->bio.bi_end_io = rbio->end_io;
+
+       if (bch2_dev_io_err_on(bio->bi_error, rbio->pick.ca, "data read")) {
+               bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_error);
                return;
        }
 
-       if (rbio->ptr.cached &&
+       if (rbio->pick.ptr.cached &&
            (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
-            ptr_stale(rbio->ca, &rbio->ptr))) {
-               atomic_long_inc(&c->cache_read_races);
+            ptr_stale(rbio->pick.ca, &rbio->pick.ptr))) {
+               atomic_long_inc(&c->read_realloc_races);
 
                if (rbio->flags & BCH_READ_RETRY_IF_STALE)
-                       bch2_rbio_retry(c, rbio);
+                       bch2_rbio_error(rbio, READ_RETRY, -EINTR);
                else
-                       bch2_rbio_error(rbio, -EINTR);
+                       bch2_rbio_error(rbio, READ_ERR, -EINTR);
                return;
        }
 
-       if (rbio->crc.compression_type ||
-           bch2_csum_type_is_encryption(rbio->crc.csum_type))
-               queue_work(system_unbound_wq, &rbio->work);
-       else if (rbio->crc.csum_type)
-               queue_work(system_highpri_wq, &rbio->work);
-       else
-               __bch2_read_endio(&rbio->work);
-}
-
-static bool should_promote(struct bch_fs *c,
-                          struct extent_pick_ptr *pick, unsigned flags)
-{
-       if (!(flags & BCH_READ_PROMOTE))
-               return false;
-
-       if (percpu_ref_is_dying(&c->writes))
-               return false;
+       if (rbio->pick.crc.compression_type ||
+           bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
+               wq = system_unbound_wq;
+       else if (rbio->pick.crc.csum_type)
+               wq = system_highpri_wq;
 
-       return c->fastest_tier &&
-               c->fastest_tier < c->tiers + pick->ca->mi.tier;
+       bch2_rbio_punt(rbio, __bch2_read_endio, wq);
 }
 
-void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig,
-                         struct bvec_iter iter, struct bkey_s_c k,
-                         struct extent_pick_ptr *pick, unsigned flags)
+int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
+                      struct bvec_iter iter, struct bkey_s_c k,
+                      struct extent_pick_ptr *pick, unsigned flags)
 {
        struct bch_read_bio *rbio;
-       struct cache_promote_op *promote_op = NULL;
+       struct promote_op *promote_op = NULL;
        unsigned skip = iter.bi_sector - bkey_start_offset(k.k);
        bool bounce = false, split, read_full = false;
+       int ret = 0;
 
        bch2_increment_clock(c, bio_sectors(&orig->bio), READ);
+       PTR_BUCKET(pick->ca, &pick->ptr)->prio[READ] = c->prio_clock[READ].hand;
 
        EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector ||
                k.k->p.offset < bvec_iter_end_sector(iter));
 
-       /* only promote if we're not reading from the fastest tier: */
-
-       /*
-        * XXX: multiple promotes can race with each other, wastefully. Keep a
-        * list of outstanding promotes?
-        */
-       if (should_promote(c, pick, flags)) {
-               /*
-                * biovec needs to be big enough to hold decompressed data, if
-                * the bch2_write_extent() has to decompress/recompress it:
-                */
-               unsigned sectors =
-                       max_t(unsigned, k.k->size,
-                             crc_uncompressed_size(NULL, &pick->crc));
-               unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
-
-               promote_op = kmalloc(sizeof(*promote_op) +
-                               sizeof(struct bio_vec) * pages, GFP_NOIO);
-               if (promote_op) {
-                       struct bio *promote_bio = &promote_op->write.wbio.bio;
-
-                       bio_init(promote_bio);
-                       promote_bio->bi_max_vecs = pages;
-                       promote_bio->bi_io_vec  = promote_bio->bi_inline_vecs;
-                       bounce = true;
-                       /* could also set read_full */
-               }
-       }
-
        /*
         * note: if compression_type and crc_type both == none, then
         * compressed/uncompressed size is zero
@@ -1136,25 +1131,30 @@ void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig,
             (bvec_iter_sectors(iter) != crc_uncompressed_size(NULL, &pick->crc) ||
              (bch2_csum_type_is_encryption(pick->crc.csum_type) &&
               (flags & BCH_READ_USER_MAPPED)) ||
-             (flags & BCH_READ_FORCE_BOUNCE)))) {
+             (flags & BCH_READ_MUST_BOUNCE)))) {
                read_full = true;
                bounce = true;
        }
 
+       if (should_promote(c, pick, flags))
+               promote_op = promote_alloc(c, iter, k, pick, read_full);
+
+       /* could also set read_full */
+       if (promote_op)
+               bounce = true;
+
        if (bounce) {
                unsigned sectors = read_full
                        ? (crc_compressed_size(NULL, &pick->crc) ?: k.k->size)
                        : bvec_iter_sectors(iter);
 
-               rbio = container_of(bio_alloc_bioset(GFP_NOIO,
+               rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
                                        DIV_ROUND_UP(sectors, PAGE_SECTORS),
-                                       &c->bio_read_split),
-                                   struct bch_read_bio, bio);
+                                       &c->bio_read_split));
 
                bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
                split = true;
-       } else if (!(flags & BCH_READ_MAY_REUSE_BIO) ||
-                  !(flags & BCH_READ_IS_LAST)) {
+       } else if (flags & BCH_READ_MUST_CLONE) {
                /*
                 * Have to clone if there were any splits, due to error
                 * reporting issues (if a split errored, and retrying didn't
@@ -1163,9 +1163,8 @@ void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig,
                 * from the whole bio, in which case we don't want to retry and
                 * lose the error)
                 */
-               rbio = container_of(bio_clone_fast(&orig->bio,
-                                       GFP_NOIO, &c->bio_read_split),
-                                   struct bch_read_bio, bio);
+               rbio = rbio_init(bio_clone_fast(&orig->bio,
+                                             GFP_NOIO, &c->bio_read_split));
                rbio->bio.bi_iter = iter;
                split = true;
        } else {
@@ -1175,80 +1174,39 @@ void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig,
                BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
        }
 
-       if (!(flags & BCH_READ_IS_LAST))
-               __bio_inc_remaining(&orig->bio);
+       rbio->c                 = c;
 
        if (split)
                rbio->parent    = orig;
        else
-               rbio->orig_bi_end_io = orig->bio.bi_end_io;
-       rbio->parent_iter       = iter;
+               rbio->end_io    = orig->bio.bi_end_io;
 
+       rbio->bvec_iter         = iter;
        rbio->flags             = flags;
        rbio->bounce            = bounce;
        rbio->split             = split;
-       rbio->c                 = c;
-       rbio->ca                = pick->ca;
-       rbio->ptr               = pick->ptr;
-       rbio->crc               = pick->crc;
+       rbio->process_context   = false;
+       rbio->retry             = 0;
+       rbio->pick              = *pick;
        /*
         * crc.compressed_size will be 0 if there wasn't any checksum
         * information, also we need to stash the original size of the bio if we
         * bounced (which isn't necessarily the original key size, if we bounced
         * only for promoting)
         */
-       rbio->crc._compressed_size = bio_sectors(&rbio->bio) - 1;
+       rbio->pick.crc._compressed_size = bio_sectors(&rbio->bio) - 1;
        rbio->version           = k.k->version;
        rbio->promote           = promote_op;
        rbio->inode             = k.k->p.inode;
-       INIT_WORK(&rbio->work, __bch2_read_endio);
+       INIT_WORK(&rbio->work, NULL);
 
        rbio->bio.bi_bdev       = pick->ca->disk_sb.bdev;
        rbio->bio.bi_opf        = orig->bio.bi_opf;
        rbio->bio.bi_iter.bi_sector = pick->ptr.offset;
        rbio->bio.bi_end_io     = bch2_read_endio;
 
-       if (promote_op) {
-               struct bio *promote_bio = &promote_op->write.wbio.bio;
-
-               promote_bio->bi_iter = rbio->bio.bi_iter;
-               memcpy(promote_bio->bi_io_vec, rbio->bio.bi_io_vec,
-                      sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
-
-               bch2_migrate_write_init(c, &promote_op->write,
-                                      &c->promote_write_point,
-                                      k, NULL,
-                                      BCH_WRITE_ALLOC_NOWAIT|
-                                      BCH_WRITE_CACHED);
-               promote_op->write.promote = true;
-
-               if (rbio->crc.compression_type) {
-                       promote_op->write.op.flags |= BCH_WRITE_DATA_COMPRESSED;
-                       promote_op->write.op.crc = rbio->crc;
-                       promote_op->write.op.size = k.k->size;
-               } else if (read_full) {
-                       /*
-                        * Adjust bio to correspond to _live_ portion of @k -
-                        * which might be less than what we're actually reading:
-                        */
-                       bio_advance(promote_bio, rbio->crc.offset << 9);
-                       BUG_ON(bio_sectors(promote_bio) < k.k->size);
-                       promote_bio->bi_iter.bi_size = k.k->size << 9;
-               } else {
-                       /*
-                        * Set insert pos to correspond to what we're actually
-                        * reading:
-                        */
-                       promote_op->write.op.pos.offset = iter.bi_sector;
-               }
-
-               promote_bio->bi_iter.bi_sector =
-                       promote_op->write.op.pos.offset;
-       }
-
-       /* _after_ promete stuff has looked at rbio->crc.offset */
        if (read_full)
-               rbio->crc.offset += skip;
+               rbio->pick.crc.offset += skip;
        else
                rbio->bio.bi_iter.bi_sector += skip;
 
@@ -1257,27 +1215,39 @@ void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig,
        if (bounce)
                trace_read_bounce(&rbio->bio);
 
-       if (!(flags & BCH_READ_IS_LAST))
-               trace_read_split(&rbio->bio);
+       this_cpu_add(pick->ca->io_done->sectors[READ][BCH_DATA_USER],
+                    bio_sectors(&rbio->bio));
+
+       if (likely(!(flags & BCH_READ_IN_RETRY))) {
+               submit_bio(&rbio->bio);
+       } else {
+               submit_bio_wait(&rbio->bio);
+
+               rbio->process_context = true;
+               bch2_read_endio(&rbio->bio);
 
-       generic_make_request(&rbio->bio);
+               ret = rbio->retry;
+               if (!ret)
+                       bch2_rbio_done(rbio);
+       }
+
+       return ret;
 }
 
-static void bch2_read_iter(struct bch_fs *c, struct bch_read_bio *rbio,
-                         struct bvec_iter bvec_iter, u64 inode,
-                         unsigned flags)
+void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
+                struct bvec_iter bvec_iter, u64 inode,
+                struct bch_devs_mask *avoid, unsigned flags)
 {
-       struct bio *bio = &rbio->bio;
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;
-
-       for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS,
-                                     POS(inode, bvec_iter.bi_sector), k) {
+retry:
+       for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+                          POS(inode, bvec_iter.bi_sector),
+                          BTREE_ITER_WITH_HOLES, k) {
                BKEY_PADDED(k) tmp;
                struct extent_pick_ptr pick;
-               unsigned bytes, sectors;
-               bool is_last;
+               struct bvec_iter fragment;
 
                /*
                 * Unlock the iterator while the btree node's lock is still in
@@ -1287,43 +1257,47 @@ static void bch2_read_iter(struct bch_fs *c, struct bch_read_bio *rbio,
                k = bkey_i_to_s_c(&tmp.k);
                bch2_btree_iter_unlock(&iter);
 
-               bch2_extent_pick_ptr(c, k, &pick);
+               bch2_extent_pick_ptr(c, k, avoid, &pick);
                if (IS_ERR(pick.ca)) {
-                       bcache_io_error(c, bio, "no device to read from");
-                       bio_endio(bio);
+                       bcache_io_error(c, &rbio->bio, "no device to read from");
+                       bio_endio(&rbio->bio);
                        return;
                }
 
-               sectors = min_t(u64, k.k->p.offset,
-                               bvec_iter_end_sector(bvec_iter)) -
-                       bvec_iter.bi_sector;
-               bytes = sectors << 9;
-               is_last = bytes == bvec_iter.bi_size;
-               swap(bvec_iter.bi_size, bytes);
-
-               if (is_last)
-                       flags |= BCH_READ_IS_LAST;
+               fragment = bvec_iter;
+               fragment.bi_size = (min_t(u64, k.k->p.offset,
+                                         bvec_iter_end_sector(bvec_iter)) -
+                                   bvec_iter.bi_sector) << 9;
 
                if (pick.ca) {
-                       PTR_BUCKET(pick.ca, &pick.ptr)->read_prio =
-                               c->prio_clock[READ].hand;
-
-                       bch2_read_extent_iter(c, rbio, bvec_iter,
-                                            k, &pick, flags);
+                       if (fragment.bi_size != bvec_iter.bi_size) {
+                               bio_inc_remaining(&rbio->bio);
+                               flags |= BCH_READ_MUST_CLONE;
+                               trace_read_split(&rbio->bio);
+                       }
 
-                       flags &= ~BCH_READ_MAY_REUSE_BIO;
+                       ret = __bch2_read_extent(c, rbio, fragment,
+                                                k, &pick, flags);
+                       switch (ret) {
+                       case READ_RETRY_AVOID:
+                               __set_bit(pick.ca->dev_idx, avoid->d);
+                       case READ_RETRY:
+                               goto retry;
+                       case READ_ERR:
+                               bio_endio(&rbio->bio);
+                               return;
+                       };
                } else {
-                       zero_fill_bio_iter(bio, bvec_iter);
+                       zero_fill_bio_iter(&rbio->bio, fragment);
 
-                       if (is_last)
-                               bio_endio(bio);
+                       if (fragment.bi_size == bvec_iter.bi_size)
+                               bio_endio(&rbio->bio);
                }
 
-               if (is_last)
+               if (fragment.bi_size == bvec_iter.bi_size)
                        return;
 
-               swap(bvec_iter.bi_size, bytes);
-               bio_advance_iter(bio, &bvec_iter, bytes);
+               bio_advance_iter(&rbio->bio, &bvec_iter, fragment.bi_size);
        }
 
        /*
@@ -1332,56 +1306,6 @@ static void bch2_read_iter(struct bch_fs *c, struct bch_read_bio *rbio,
         */
        ret = bch2_btree_iter_unlock(&iter);
        BUG_ON(!ret);
-       bcache_io_error(c, bio, "btree IO error %i", ret);
-       bio_endio(bio);
-}
-
-void bch2_read(struct bch_fs *c, struct bch_read_bio *bio, u64 inode)
-{
-       bch2_read_iter(c, bio, bio->bio.bi_iter, inode,
-                     BCH_READ_RETRY_IF_STALE|
-                     BCH_READ_PROMOTE|
-                     BCH_READ_MAY_REUSE_BIO|
-                     BCH_READ_USER_MAPPED);
-}
-
-/**
- * bch_read_retry - re-submit a bio originally from bch2_read()
- */
-static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio)
-{
-       struct bch_read_bio *parent = bch2_rbio_parent(rbio);
-       struct bvec_iter iter = rbio->parent_iter;
-       unsigned flags = rbio->flags;
-       u64 inode = rbio->inode;
-
-       trace_read_retry(&rbio->bio);
-
-       if (rbio->split)
-               bch2_rbio_free(rbio);
-       else
-               rbio->bio.bi_end_io = rbio->orig_bi_end_io;
-
-       bch2_read_iter(c, parent, iter, inode, flags);
-}
-
-void bch2_read_retry_work(struct work_struct *work)
-{
-       struct bch_fs *c = container_of(work, struct bch_fs,
-                                          read_retry_work);
-       struct bch_read_bio *rbio;
-       struct bio *bio;
-       unsigned long flags;
-
-       while (1) {
-               spin_lock_irqsave(&c->read_retry_lock, flags);
-               bio = bio_list_pop(&c->read_retry_list);
-               spin_unlock_irqrestore(&c->read_retry_lock, flags);
-
-               if (!bio)
-                       break;
-
-               rbio = container_of(bio, struct bch_read_bio, bio);
-               bch2_read_retry(c, rbio);
-       }
+       bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
+       bio_endio(&rbio->bio);
 }
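
The retry machinery introduced above classifies each failed read as READ_RETRY_AVOID (the device reported an error: retry, but skip that device), READ_RETRY (e.g. a stale cached pointer: retry anywhere) or READ_ERR (give up), and __bch2_read() feeds the accumulated avoid mask back into bch2_extent_pick_ptr() on the next pass. A minimal userspace model of that policy, with invented helpers (read_from_dev(), pick_replica()) standing in for the real I/O and pointer-picking code:

/*
 * Toy model of the read-retry policy, not bcachefs code: a failed read
 * either marks its device in the avoid mask and retries, retries
 * as-is, or fails hard.
 */
#include <stdbool.h>
#include <stdio.h>

#define READ_RETRY_AVOID	1
#define READ_RETRY		2
#define READ_ERR		3
#define NR_DEVS			4

struct avoid_mask { bool d[NR_DEVS]; };

/* pretend device 1 throws an I/O error; everything else succeeds */
static int read_from_dev(unsigned dev)
{
	return dev == 1 ? READ_RETRY_AVOID : 0;
}

/* pick the first replica not in the avoid mask, or -1 */
static int pick_replica(const unsigned *replicas, unsigned nr,
			const struct avoid_mask *avoid)
{
	unsigned i;

	for (i = 0; i < nr; i++)
		if (!avoid->d[replicas[i]])
			return replicas[i];
	return -1;
}

int main(void)
{
	unsigned replicas[] = { 1, 3 };	/* extent has copies on devs 1 and 3 */
	struct avoid_mask avoid = { { false } };
	int dev, ret;

retry:
	dev = pick_replica(replicas, 2, &avoid);
	if (dev < 0) {
		printf("no device to read from\n");
		return 1;
	}

	ret = read_from_dev(dev);
	switch (ret) {
	case READ_RETRY_AVOID:
		avoid.d[dev] = true;
		/* fall through */
	case READ_RETRY:
		goto retry;
	case READ_ERR:
		printf("read error\n");
		return 1;
	}

	printf("read succeeded from dev %d\n", dev);
	return 0;
}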