Update bcachefs sources to 9e7ae5219c bcachefs: Make write points more dynamic
[bcachefs-tools-debian] / libbcachefs / io.c
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index da06845cd1a775105883bff5db6e9d0eac5c2784..e5fc72da83bde530874a73574b62b7bde4d22db1 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
 
 #include <trace/events/bcachefs.h>
 
-static inline void __bio_inc_remaining(struct bio *bio)
-{
-       bio_set_flag(bio, BIO_CHAIN);
-       smp_mb__before_atomic();
-       atomic_inc(&bio->__bi_remaining);
-}
-
 /* Allocate, free from mempool: */
 
 void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
@@ -85,39 +78,23 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
 
 /* Bios with headers */
 
-static void bch2_submit_wbio(struct bch_fs *c, struct bch_write_bio *wbio,
-                           struct bch_dev *ca, const struct bch_extent_ptr *ptr)
-{
-       wbio->ca                = ca;
-       wbio->submit_time_us    = local_clock_us();
-       wbio->bio.bi_iter.bi_sector = ptr->offset;
-       wbio->bio.bi_bdev       = ca ? ca->disk_sb.bdev : NULL;
-
-       if (!ca)
-               bcache_io_error(c, &wbio->bio, "device has been removed");
-       else
-               generic_make_request(&wbio->bio);
-}
-
 void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
+                              enum bch_data_type type,
                               const struct bkey_i *k)
 {
        struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
        const struct bch_extent_ptr *ptr;
        struct bch_write_bio *n;
        struct bch_dev *ca;
+       unsigned ptr_idx = 0;
 
        BUG_ON(c->opts.nochanges);
 
-       wbio->split = false;
-       wbio->c = c;
-
        extent_for_each_ptr(e, ptr) {
+               BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
+                      !c->devs[ptr->dev]);
+
                ca = c->devs[ptr->dev];
-               if (!percpu_ref_tryget(&ca->io_ref)) {
-                       bch2_submit_wbio(c, wbio, NULL, ptr);
-                       break;
-               }
 
                if (ptr + 1 < &extent_entry_last(e)->ptr) {
                        n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
@@ -125,21 +102,38 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
 
                        n->bio.bi_end_io        = wbio->bio.bi_end_io;
                        n->bio.bi_private       = wbio->bio.bi_private;
-                       n->c                    = c;
-                       n->orig                 = &wbio->bio;
-                       n->bounce               = false;
+                       n->parent               = wbio;
                        n->split                = true;
+                       n->bounce               = false;
                        n->put_bio              = true;
                        n->bio.bi_opf           = wbio->bio.bi_opf;
-                       __bio_inc_remaining(n->orig);
+                       bio_inc_remaining(&wbio->bio);
                } else {
                        n = wbio;
+                       n->split                = false;
                }
 
+               n->c                    = c;
+               n->ca                   = ca;
+               n->ptr_idx              = ptr_idx++;
+               n->submit_time_us       = local_clock_us();
+               n->bio.bi_iter.bi_sector = ptr->offset;
+
                if (!journal_flushes_device(ca))
                        n->bio.bi_opf |= REQ_FUA;
 
-               bch2_submit_wbio(c, n, ca, ptr);
+               if (likely(percpu_ref_tryget(&ca->io_ref))) {
+                       this_cpu_add(ca->io_done->sectors[WRITE][type],
+                                    bio_sectors(&n->bio));
+
+                       n->have_io_ref          = true;
+                       n->bio.bi_bdev          = ca->disk_sb.bdev;
+                       submit_bio(&n->bio);
+               } else {
+                       n->have_io_ref          = false;
+                       bcache_io_error(c, &n->bio, "device has been removed");
+                       bio_endio(&n->bio);
+               }
        }
 }
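
The hunk above fans a single write out to every replica pointer: each pointer but the last gets a bio_clone_fast() clone whose completion is chained back to the parent via bio_inc_remaining(), so the parent's end_io only runs once every per-device submission has finished (or been failed with bcache_io_error()). The standalone C sketch below models just that reference-counting pattern in userspace; fake_bio, submit_replicas() and the collapsed submit/complete step are invented for illustration and are not bcachefs or block-layer code.

/*
 * Simplified userspace model of the replica fan-out in
 * bch2_submit_wbio_replicas(): one extra reference is taken on the
 * parent per clone (bio_inc_remaining() in the kernel), and the
 * parent's end_io runs only after the last completion. Hypothetical
 * names throughout; submission and device completion are collapsed
 * into one step for brevity.
 */
#include <stdatomic.h>
#include <stdio.h>

struct fake_bio {
	atomic_int	remaining;	/* models bio->__bi_remaining */
	void		(*end_io)(struct fake_bio *);
};

static void parent_end_io(struct fake_bio *bio)
{
	printf("all replicas written, completing parent\n");
}

/* models bio_endio(): only the last completion runs end_io */
static void fake_bio_endio(struct fake_bio *bio)
{
	if (atomic_fetch_sub(&bio->remaining, 1) == 1)
		bio->end_io(bio);
}

static void submit_replicas(struct fake_bio *parent, unsigned nr_replicas)
{
	unsigned i;

	for (i = 0; i < nr_replicas; i++) {
		/*
		 * every pointer but the last gets a clone, and each
		 * clone holds a reference on the parent:
		 */
		if (i + 1 < nr_replicas)
			atomic_fetch_add(&parent->remaining, 1);

		printf("submitting replica %u\n", i);
		fake_bio_endio(parent);	/* pretend the device completed it */
	}
}

int main(void)
{
	struct fake_bio parent = {
		.remaining	= 1,
		.end_io		= parent_end_io,
	};

	submit_replicas(&parent, 3);
	return 0;
}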
 
@@ -188,8 +182,9 @@ static int bch2_write_index_default(struct bch_write_op *op)
        struct btree_iter iter;
        int ret;
 
-       bch2_btree_iter_init_intent(&iter, op->c, BTREE_ID_EXTENTS,
-               bkey_start_pos(&bch2_keylist_front(keys)->k));
+       bch2_btree_iter_init(&iter, op->c, BTREE_ID_EXTENTS,
+                            bkey_start_pos(&bch2_keylist_front(keys)->k),
+                            BTREE_ITER_INTENT);
 
        ret = bch2_btree_insert_list_at(&iter, keys, &op->res,
                                       NULL, op_journal_seq(op),
@@ -246,99 +241,68 @@ static void bch2_write_index(struct closure *cl)
        }
 }
 
-/**
- * bch_write_discard - discard range of keys
- *
- * Used to implement discard, and to handle when writethrough write hits
- * a write error on the cache device.
- */
-static void bch2_write_discard(struct closure *cl)
-{
-       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
-       struct bio *bio = &op->bio->bio;
-       struct bpos end = op->pos;
-
-       end.offset += bio_sectors(bio);
-
-       op->error = bch2_discard(op->c, op->pos, end, op->version,
-                               &op->res, NULL, NULL);
-}
-
-/*
- * Convert extents to be inserted to discards after an error:
- */
 static void bch2_write_io_error(struct closure *cl)
 {
        struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+       struct keylist *keys = &op->insert_keys;
+       struct bch_fs *c = op->c;
+       struct bch_extent_ptr *ptr;
+       struct bkey_i *k;
+       int ret;
 
-       if (op->flags & BCH_WRITE_DISCARD_ON_ERROR) {
-               struct bkey_i *src = bch2_keylist_front(&op->insert_keys);
-               struct bkey_i *dst = bch2_keylist_front(&op->insert_keys);
-
-               /*
-                * Our data write just errored, which means we've got a bunch
-                * of keys to insert that point to data that wasn't
-                * successfully written.
-                *
-                * We don't have to insert those keys but we still have to
-                * invalidate that region of the cache - so, if we just strip
-                * off all the pointers from the keys we'll accomplish just
-                * that.
-                */
+       for_each_keylist_key(keys, k) {
+               struct bkey_i *n = bkey_next(k);
+               struct bkey_s_extent e = bkey_i_to_s_extent(k);
 
-               while (src != op->insert_keys.top) {
-                       struct bkey_i *n = bkey_next(src);
+               extent_for_each_ptr_backwards(e, ptr)
+                       if (test_bit(ptr->dev, op->failed.d))
+                               bch2_extent_drop_ptr(e, ptr);
 
-                       set_bkey_val_u64s(&src->k, 0);
-                       src->k.type = KEY_TYPE_DISCARD;
-                       bkey_copy(dst, src);
+               memmove(bkey_next(k), n, (void *) keys->top - (void *) n);
+               keys->top_p -= (u64 *) n - (u64 *) bkey_next(k);
 
-                       dst = bkey_next(dst);
-                       src = n;
+               ret = bch2_extent_nr_ptrs(e.c)
+                       ? bch2_check_mark_super(c, e.c, BCH_DATA_USER)
+                       : -EIO;
+               if (ret) {
+                       keys->top = keys->keys;
+                       op->error = ret;
+                       op->flags |= BCH_WRITE_DONE;
+                       break;
                }
-
-               op->insert_keys.top = dst;
-               op->flags |= BCH_WRITE_DISCARD;
-       } else {
-               /* TODO: We could try to recover from this. */
-               while (!bch2_keylist_empty(&op->insert_keys))
-                       bch2_keylist_pop_front(&op->insert_keys);
-
-               op->error = -EIO;
-               op->flags |= BCH_WRITE_DONE;
        }
 
+       memset(&op->failed, 0, sizeof(op->failed));
+
        bch2_write_index(cl);
+       return;
 }
 
 static void bch2_write_endio(struct bio *bio)
 {
-       struct closure *cl = bio->bi_private;
-       struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
-       struct bch_write_bio *wbio = to_wbio(bio);
-       struct bch_fs *c = wbio->c;
-       struct bio *orig = wbio->orig;
-       struct bch_dev *ca = wbio->ca;
-
-       if (bch2_dev_nonfatal_io_err_on(bio->bi_error, ca,
-                                      "data write")) {
+       struct closure *cl              = bio->bi_private;
+       struct bch_write_op *op         = container_of(cl, struct bch_write_op, cl);
+       struct bch_write_bio *wbio      = to_wbio(bio);
+       struct bch_write_bio *parent    = wbio->split ? wbio->parent : NULL;
+       struct bch_fs *c                = wbio->c;
+       struct bch_dev *ca              = wbio->ca;
+
+       if (bch2_dev_io_err_on(bio->bi_error, ca, "data write")) {
+               set_bit(ca->dev_idx, op->failed.d);
                set_closure_fn(cl, bch2_write_io_error, index_update_wq(op));
        }
 
-       if (ca)
+       if (wbio->have_io_ref)
                percpu_ref_put(&ca->io_ref);
 
-       if (bio->bi_error && orig)
-               orig->bi_error = bio->bi_error;
-
        if (wbio->bounce)
                bch2_bio_free_pages_pool(c, bio);
 
        if (wbio->put_bio)
                bio_put(bio);
 
-       if (orig)
-               bio_endio(orig);
+       if (parent)
+               bio_endio(&parent->bio);
        else
                closure_put(cl);
 }
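
In the rewritten error path above, bch2_write_endio() records the failing device in op->failed and redirects the index update to bch2_write_io_error(), which walks the keylist, drops extent pointers that landed on failed devices, and memmove()s the rest of the packed list down over the space freed inside each shrunken key. The sketch below models only that compaction step on a toy packed key format ([length][payload...] in u64s); fake_keylist, key_next() and key_shrink() are made-up names, not bcachefs APIs.

/*
 * Minimal model (not bcachefs code) of the keylist compaction in
 * bch2_write_io_error(): when a key loses pointers it shrinks in
 * place, and the tail of the packed list is memmove()d down.
 */
#include <stdio.h>
#include <string.h>

typedef unsigned long long u64;

struct fake_keylist {
	u64	buf[32];
	u64	*top;		/* one past the last used u64 */
};

/* each fake key is [nr_payload_u64s][payload...]; return the next key */
static u64 *key_next(u64 *k)
{
	return k + 1 + k[0];
}

/* drop @drop u64s of payload from key @k, compacting the list */
static void key_shrink(struct fake_keylist *l, u64 *k, u64 drop)
{
	u64 *old_next = key_next(k);

	k[0] -= drop;			/* key is now smaller */
	memmove(key_next(k), old_next,
		(l->top - old_next) * sizeof(u64));
	l->top -= drop;
}

int main(void)
{
	struct fake_keylist l;
	u64 *k;

	/* two keys: 3 payload u64s, then 2 payload u64s */
	u64 init[] = { 3, 10, 11, 12, 2, 20, 21 };
	memcpy(l.buf, init, sizeof(init));
	l.top = l.buf + 7;

	key_shrink(&l, l.buf, 1);	/* drop one "pointer" from key 0 */

	for (k = l.buf; k < l.top; k = key_next(k))
		printf("key with %llu payload u64s\n", k[0]);
	return 0;
}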
@@ -386,11 +350,10 @@ static void init_append_extent(struct bch_write_op *op,
        bch2_keylist_push(&op->insert_keys);
 }
 
-static int bch2_write_extent(struct bch_write_op *op,
-                           struct open_bucket *ob,
-                           struct bio *orig)
+static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
 {
        struct bch_fs *c = op->c;
+       struct bio *orig = &op->wbio.bio;
        struct bio *bio;
        struct bch_write_bio *wbio;
        unsigned key_to_write_offset = op->insert_keys.top_p -
@@ -398,15 +361,17 @@ static int bch2_write_extent(struct bch_write_op *op,
        struct bkey_i *key_to_write;
        unsigned csum_type = op->csum_type;
        unsigned compression_type = op->compression_type;
-       int ret;
+       int ret, more;
 
        /* don't refetch csum type/compression type */
        barrier();
 
+       BUG_ON(!bio_sectors(orig));
+
        /* Need to decompress data? */
        if ((op->flags & BCH_WRITE_DATA_COMPRESSED) &&
            (crc_uncompressed_size(NULL, &op->crc) != op->size ||
-            crc_compressed_size(NULL, &op->crc) > ob->sectors_free)) {
+            crc_compressed_size(NULL, &op->crc) > wp->sectors_free)) {
                int ret;
 
                ret = bch2_bio_uncompress_inplace(c, orig, op->size, op->crc);
@@ -424,19 +389,16 @@ static int bch2_write_extent(struct bch_write_op *op,
                                   op->crc.nonce,
                                   op->crc.csum,
                                   op->crc.csum_type,
-                                  ob);
+                                  wp->ob);
 
                bio                     = orig;
-               wbio                    = to_wbio(bio);
-               wbio->orig              = NULL;
-               wbio->bounce            = false;
-               wbio->put_bio           = false;
-               ret                     = 0;
+               wbio                    = wbio_init(bio);
+               more                    = 0;
        } else if (csum_type != BCH_CSUM_NONE ||
                   compression_type != BCH_COMPRESSION_NONE) {
                /* all units here in bytes */
                unsigned total_output = 0, output_available =
-                       min(ob->sectors_free << 9, orig->bi_iter.bi_size);
+                       min(wp->sectors_free << 9, orig->bi_iter.bi_size);
                unsigned crc_nonce = bch2_csum_type_is_encryption(csum_type)
                        ? op->nonce : 0;
                struct bch_csum csum;
@@ -445,19 +407,18 @@ static int bch2_write_extent(struct bch_write_op *op,
                bio = bio_alloc_bioset(GFP_NOIO,
                                       DIV_ROUND_UP(output_available, PAGE_SIZE),
                                       &c->bio_write);
+               wbio                    = wbio_init(bio);
+               wbio->bounce            = true;
+               wbio->put_bio           = true;
+               /* copy WRITE_SYNC flag */
+               wbio->bio.bi_opf        = orig->bi_opf;
+
                /*
                 * XXX: can't use mempool for more than
                 * BCH_COMPRESSED_EXTENT_MAX worth of pages
                 */
                bch2_bio_alloc_pages_pool(c, bio, output_available);
 
-               /* copy WRITE_SYNC flag */
-               bio->bi_opf             = orig->bi_opf;
-               wbio                    = to_wbio(bio);
-               wbio->orig              = NULL;
-               wbio->bounce            = true;
-               wbio->put_bio           = true;
-
                do {
                        unsigned fragment_compression_type = compression_type;
                        size_t dst_len, src_len;
@@ -466,17 +427,12 @@ static int bch2_write_extent(struct bch_write_op *op,
                                         orig, &src_len,
                                         &fragment_compression_type);
 
-                       BUG_ON(!dst_len || dst_len > bio->bi_iter.bi_size);
-                       BUG_ON(!src_len || src_len > orig->bi_iter.bi_size);
-                       BUG_ON(dst_len & (block_bytes(c) - 1));
-                       BUG_ON(src_len & (block_bytes(c) - 1));
-
-                       swap(bio->bi_iter.bi_size, dst_len);
                        nonce = extent_nonce(op->version,
                                             crc_nonce,
                                             src_len >> 9,
-                                            fragment_compression_type),
+                                            fragment_compression_type);
 
+                       swap(bio->bi_iter.bi_size, dst_len);
                        bch2_encrypt_bio(c, csum_type, nonce, bio);
 
                        csum = bch2_checksum_bio(c, csum_type, nonce, bio);
@@ -485,7 +441,7 @@ static int bch2_write_extent(struct bch_write_op *op,
                        init_append_extent(op,
                                           dst_len >> 9, src_len >> 9,
                                           fragment_compression_type,
-                                          crc_nonce, csum, csum_type, ob);
+                                          crc_nonce, csum, csum_type, wp->ob);
 
                        total_output += dst_len;
                        bio_advance(bio, dst_len);
@@ -510,67 +466,50 @@ static int bch2_write_extent(struct bch_write_op *op,
                        mempool_free(bio->bi_io_vec[--bio->bi_vcnt].bv_page,
                                     &c->bio_bounce_pages);
 
-               ret = orig->bi_iter.bi_size != 0;
+               more = orig->bi_iter.bi_size != 0;
        } else {
-               bio = bio_next_split(orig, ob->sectors_free, GFP_NOIO,
+               bio = bio_next_split(orig, wp->sectors_free, GFP_NOIO,
                                     &c->bio_write);
-
-               wbio                    = to_wbio(bio);
-               wbio->orig              = NULL;
-               wbio->bounce            = false;
+               wbio                    = wbio_init(bio);
                wbio->put_bio           = bio != orig;
 
                init_append_extent(op, bio_sectors(bio), bio_sectors(bio),
                                   compression_type, 0,
-                                  (struct bch_csum) { 0 }, csum_type, ob);
+                                  (struct bch_csum) { 0 }, csum_type, wp->ob);
 
-               ret = bio != orig;
+               more = bio != orig;
        }
 
+       /* might have done a realloc... */
+
+       key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
+
+       ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write),
+                                   BCH_DATA_USER);
+       if (ret)
+               return ret;
+
        bio->bi_end_io  = bch2_write_endio;
        bio->bi_private = &op->cl;
        bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
 
        closure_get(bio->bi_private);
 
-       /* might have done a realloc... */
-
-       key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
-
-       bch2_check_mark_super(c, key_to_write, false);
-
-       bch2_submit_wbio_replicas(to_wbio(bio), c, key_to_write);
-       return ret;
+       bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER,
+                                 key_to_write);
+       return more;
 }
 
 static void __bch2_write(struct closure *cl)
 {
        struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
        struct bch_fs *c = op->c;
-       struct bio *bio = &op->bio->bio;
        unsigned open_bucket_nr = 0;
-       struct open_bucket *b;
+       struct write_point *wp;
+       struct open_bucket *ob;
        int ret;
 
-       memset(op->open_buckets, 0, sizeof(op->open_buckets));
-
-       if (op->flags & BCH_WRITE_DISCARD) {
-               op->flags |= BCH_WRITE_DONE;
-               bch2_write_discard(cl);
-               bio_put(bio);
-               continue_at(cl, bch2_write_done, index_update_wq(op));
-       }
-
-       /*
-        * Journal writes are marked REQ_PREFLUSH; if the original write was a
-        * flush, it'll wait on the journal write.
-        */
-       bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA);
-
        do {
-               EBUG_ON(bio->bi_iter.bi_sector != op->pos.offset);
-               EBUG_ON(!bio_sectors(bio));
-
                if (open_bucket_nr == ARRAY_SIZE(op->open_buckets))
                        continue_at(cl, bch2_write_index, index_update_wq(op));
 
@@ -581,16 +520,19 @@ static void __bch2_write(struct closure *cl)
                                        BKEY_EXTENT_U64s_MAX))
                        continue_at(cl, bch2_write_index, index_update_wq(op));
 
-               b = bch2_alloc_sectors_start(c, op->wp,
+               wp = bch2_alloc_sectors_start(c, BCH_DATA_USER,
+                       op->devs,
+                       op->write_point,
                        op->nr_replicas,
                        c->opts.data_replicas_required,
                        op->alloc_reserve,
+                       op->flags,
                        (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
-               EBUG_ON(!b);
+               EBUG_ON(!wp);
 
-               if (unlikely(IS_ERR(b))) {
-                       if (unlikely(PTR_ERR(b) != -EAGAIN)) {
-                               ret = PTR_ERR(b);
+               if (unlikely(IS_ERR(wp))) {
+                       if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
+                               ret = PTR_ERR(wp);
                                goto err;
                        }
 
@@ -623,13 +565,15 @@ static void __bch2_write(struct closure *cl)
                        continue;
                }
 
-               BUG_ON(b - c->open_buckets == 0 ||
-                      b - c->open_buckets > U8_MAX);
-               op->open_buckets[open_bucket_nr++] = b - c->open_buckets;
+               ob = wp->ob;
+
+               BUG_ON(ob - c->open_buckets == 0 ||
+                      ob - c->open_buckets > U8_MAX);
+               op->open_buckets[open_bucket_nr++] = ob - c->open_buckets;
 
-               ret = bch2_write_extent(op, b, bio);
+               ret = bch2_write_extent(op, wp);
 
-               bch2_alloc_sectors_done(c, op->wp, b);
+               bch2_alloc_sectors_done(c, wp);
 
                if (ret < 0)
                        goto err;
@@ -638,27 +582,15 @@ static void __bch2_write(struct closure *cl)
        op->flags |= BCH_WRITE_DONE;
        continue_at(cl, bch2_write_index, index_update_wq(op));
 err:
-       if (op->flags & BCH_WRITE_DISCARD_ON_ERROR) {
-               /*
-                * If we were writing cached data, not doing the write is fine
-                * so long as we discard whatever would have been overwritten -
-                * then it's equivalent to doing the write and immediately
-                * reclaiming it.
-                */
-
-               bch2_write_discard(cl);
-       } else {
-               /*
-                * Right now we can only error here if we went RO - the
-                * allocation failed, but we already checked for -ENOSPC when we
-                * got our reservation.
-                *
-                * XXX capacity might have changed, but we don't check for that
-                * yet:
-                */
-               op->error = ret;
-       }
-
+       /*
+        * Right now we can only error here if we went RO - the
+        * allocation failed, but we already checked for -ENOSPC when we
+        * got our reservation.
+        *
+        * XXX capacity might have changed, but we don't check for that
+        * yet:
+        */
+       op->error = ret;
        op->flags |= BCH_WRITE_DONE;
 
        /*
@@ -708,16 +640,13 @@ void bch2_wake_delayed_writes(unsigned long data)
  * after the data is written it calls bch_journal, and after the keys have been
  * added to the next journal write they're inserted into the btree.
  *
- * It inserts the data in op->bio; bi_sector is used for the key offset, and
- * op->inode is used for the key inode.
- *
  * If op->discard is true, instead of inserting the data it invalidates the
  * region of the cache represented by op->bio and op->inode.
  */
 void bch2_write(struct closure *cl)
 {
        struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
-       struct bio *bio = &op->bio->bio;
+       struct bio *bio = &op->wbio.bio;
        struct bch_fs *c = op->c;
        u64 inode = op->pos.inode;
 
@@ -734,20 +663,19 @@ void bch2_write(struct closure *cl)
                op->version.lo =
                        atomic64_inc_return(&c->key_version) + 1;
 
-       if (!(op->flags & BCH_WRITE_DISCARD))
-               bch2_increment_clock(c, bio_sectors(bio), WRITE);
+       bch2_increment_clock(c, bio_sectors(bio), WRITE);
 
        /* Don't call bch2_next_delay() if rate is >= 1 GB/sec */
 
-       if (c->foreground_write_ratelimit_enabled &&
-           c->foreground_write_pd.rate.rate < (1 << 30) &&
-           !(op->flags & BCH_WRITE_DISCARD) && op->wp->throttle) {
+       if ((op->flags & BCH_WRITE_THROTTLE) &&
+           c->foreground_write_ratelimit_enabled &&
+           c->foreground_write_pd.rate.rate < (1 << 30)) {
                unsigned long flags;
                u64 delay;
 
                spin_lock_irqsave(&c->foreground_write_pd_lock, flags);
                bch2_ratelimit_increment(&c->foreground_write_pd.rate,
-                                       bio->bi_iter.bi_size);
+                                        bio->bi_iter.bi_size);
 
                delay = bch2_ratelimit_delay(&c->foreground_write_pd.rate);
 
@@ -781,27 +709,30 @@ void bch2_write(struct closure *cl)
 }
 
 void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
-                      struct bch_write_bio *bio, struct disk_reservation res,
-                      struct write_point *wp, struct bpos pos,
-                      u64 *journal_seq, unsigned flags)
+                       struct disk_reservation res,
+                       struct bch_devs_mask *devs,
+                       unsigned long write_point,
+                       struct bpos pos,
+                       u64 *journal_seq, unsigned flags)
 {
        EBUG_ON(res.sectors && !res.nr_replicas);
 
        op->c           = c;
        op->io_wq       = index_update_wq(op);
-       op->bio         = bio;
        op->written     = 0;
        op->error       = 0;
        op->flags       = flags;
        op->csum_type   = bch2_data_checksum_type(c);
-       op->compression_type = c->opts.compression;
+       op->compression_type =
+               bch2_compression_opt_to_type(c->opts.compression);
        op->nr_replicas = res.nr_replicas;
        op->alloc_reserve = RESERVE_NONE;
        op->nonce       = 0;
        op->pos         = pos;
        op->version     = ZERO_VERSION;
        op->res         = res;
-       op->wp          = wp;
+       op->devs        = devs;
+       op->write_point = write_point;
 
        if (journal_seq) {
                op->journal_seq_p = journal_seq;
@@ -812,6 +743,9 @@ void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
 
        op->index_update_fn = bch2_write_index_default;
 
+       memset(op->open_buckets, 0, sizeof(op->open_buckets));
+       memset(&op->failed, 0, sizeof(op->failed));
+
        bch2_keylist_init(&op->insert_keys,
                          op->inline_keys,
                          ARRAY_SIZE(op->inline_keys));
@@ -820,53 +754,230 @@ void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
                get_random_bytes(&op->version, sizeof(op->version));
 }
 
-/* Discard */
-
-/* bch_discard - discard a range of keys from start_key to end_key.
- * @c          filesystem
- * @start_key  pointer to start location
- *             NOTE: discard starts at bkey_start_offset(start_key)
- * @end_key    pointer to end location
- *             NOTE: discard ends at KEY_OFFSET(end_key)
- * @version    version of discard (0ULL if none)
- *
- * Returns:
- *      0 on success
- *     <0 on error
- *
- * XXX: this needs to be refactored with inode_truncate, or more
- *     appropriately inode_truncate should call this
- */
-int bch2_discard(struct bch_fs *c, struct bpos start,
-                struct bpos end, struct bversion version,
-                struct disk_reservation *disk_res,
-                struct extent_insert_hook *hook,
-                u64 *journal_seq)
-{
-       return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, start, end, version,
-                                     disk_res, hook, journal_seq);
-}
-
 /* Cache promotion on read */
 
-struct cache_promote_op {
+struct promote_op {
        struct closure          cl;
        struct migrate_write    write;
        struct bio_vec          bi_inline_vecs[0]; /* must be last */
 };
 
+static void promote_done(struct closure *cl)
+{
+       struct promote_op *op =
+               container_of(cl, struct promote_op, cl);
+       struct bch_fs *c = op->write.op.c;
+
+       percpu_ref_put(&c->writes);
+       bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio);
+       kfree(op);
+}
+
+static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
+{
+       struct bch_fs *c = rbio->c;
+       struct closure *cl = &op->cl;
+       struct bio *bio = &op->write.op.wbio.bio;
+
+       BUG_ON(!rbio->split || !rbio->bounce);
+
+       if (!percpu_ref_tryget(&c->writes))
+               return;
+
+       trace_promote(&rbio->bio);
+
+       /* we now own pages: */
+       swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
+       memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
+              sizeof(struct bio_vec) * bio->bi_vcnt);
+       rbio->promote = NULL;
+
+       closure_init(cl, NULL);
+       closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
+       closure_return_with_destructor(cl, promote_done);
+}
+
+/*
+ * XXX: multiple promotes can race with each other, wastefully. Keep a list of
+ * outstanding promotes?
+ */
+static struct promote_op *promote_alloc(struct bch_fs *c,
+                                       struct bvec_iter iter,
+                                       struct bkey_s_c k,
+                                       struct extent_pick_ptr *pick,
+                                       bool read_full)
+{
+       struct promote_op *op;
+       struct bio *bio;
+       /*
+        * biovec needs to be big enough to hold decompressed data, if
+        * bch2_write_extent() has to decompress/recompress it:
+        */
+       unsigned sectors = max_t(unsigned, k.k->size,
+                     crc_uncompressed_size(NULL, &pick->crc));
+       unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
+
+       op = kmalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
+       if (!op)
+               return NULL;
+
+       bio = &op->write.op.wbio.bio;
+       bio_init(bio, bio->bi_inline_vecs, pages);
+
+       bio->bi_iter = iter;
+
+       if (pick->crc.compression_type) {
+               op->write.op.flags     |= BCH_WRITE_DATA_COMPRESSED;
+               op->write.op.crc        = pick->crc;
+               op->write.op.size       = k.k->size;
+       } else if (read_full) {
+               /*
+                * Adjust bio to correspond to _live_ portion of @k -
+                * which might be less than what we're actually reading:
+                */
+               bio->bi_iter.bi_size = sectors << 9;
+               bio_advance(bio, pick->crc.offset << 9);
+               BUG_ON(bio_sectors(bio) < k.k->size);
+               bio->bi_iter.bi_size = k.k->size << 9;
+       } else {
+               /*
+                * Set insert pos to correspond to what we're actually
+                * reading:
+                */
+               op->write.op.pos.offset = iter.bi_sector;
+       }
+       bch2_migrate_write_init(c, &op->write,
+                               c->fastest_devs,
+                               k, NULL,
+                               BCH_WRITE_ALLOC_NOWAIT|
+                               BCH_WRITE_CACHED);
+       op->write.promote = true;
+
+       return op;
+}
+
+/* only promote if we're not reading from the fastest tier: */
+static bool should_promote(struct bch_fs *c,
+                          struct extent_pick_ptr *pick, unsigned flags)
+{
+       if (!(flags & BCH_READ_MAY_PROMOTE))
+               return false;
+
+       if (flags & BCH_READ_IN_RETRY)
+               return false;
+
+       if (percpu_ref_is_dying(&c->writes))
+               return false;
+
+       return c->fastest_tier &&
+               c->fastest_tier < c->tiers + pick->ca->mi.tier;
+}
+
 /* Read */
 
-static int bio_checksum_uncompress(struct bch_fs *c,
-                                  struct bch_read_bio *rbio)
+#define READ_RETRY_AVOID       1
+#define READ_RETRY             2
+#define READ_ERR               3
+
+static inline struct bch_read_bio *
+bch2_rbio_parent(struct bch_read_bio *rbio)
+{
+       return rbio->split ? rbio->parent : rbio;
+}
+
+__always_inline
+static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
+                          struct workqueue_struct *wq)
+{
+       if (!wq || rbio->process_context) {
+               fn(&rbio->work);
+       } else {
+               rbio->work.func         = fn;
+               rbio->process_context   = true;
+               queue_work(wq, &rbio->work);
+       }
+}
+
+static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
 {
+       struct bch_read_bio *parent = rbio->parent;
+
+       BUG_ON(!rbio->split);
+
+       if (rbio->promote)
+               kfree(rbio->promote);
+       if (rbio->bounce)
+               bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
+       bio_put(&rbio->bio);
+
+       return parent;
+}
+
+static void bch2_rbio_done(struct bch_read_bio *rbio)
+{
+       if (rbio->promote)
+               kfree(rbio->promote);
+       rbio->promote = NULL;
+
+       if (rbio->split)
+               rbio = bch2_rbio_free(rbio);
+       bio_endio(&rbio->bio);
+}
+
+static void bch2_rbio_retry(struct work_struct *work)
+{
+       struct bch_read_bio *rbio =
+               container_of(work, struct bch_read_bio, work);
+       struct bch_fs *c                = rbio->c;
+       struct bvec_iter iter           = rbio->bvec_iter;
+       unsigned flags                  = rbio->flags;
+       u64 inode                       = rbio->inode;
+       struct bch_devs_mask avoid;
+
+       trace_read_retry(&rbio->bio);
+
+       memset(&avoid, 0, sizeof(avoid));
+
+       if (rbio->retry == READ_RETRY_AVOID)
+               __set_bit(rbio->pick.ca->dev_idx, avoid.d);
+
+       if (rbio->split)
+               rbio = bch2_rbio_free(rbio);
+       else
+               rbio->bio.bi_error = 0;
+
+       flags |= BCH_READ_MUST_CLONE;
+       flags |= BCH_READ_IN_RETRY;
+
+       __bch2_read(c, rbio, iter, inode, &avoid, flags);
+}
+
+static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error)
+{
+       rbio->retry = retry;
+
+       if (rbio->flags & BCH_READ_IN_RETRY)
+               return;
+
+       if (retry == READ_ERR) {
+               bch2_rbio_parent(rbio)->bio.bi_error = error;
+               bch2_rbio_done(rbio);
+       } else {
+               bch2_rbio_punt(rbio, bch2_rbio_retry, rbio->c->wq);
+       }
+}
+
+static int bch2_rbio_checksum_uncompress(struct bio *dst,
+                                        struct bch_read_bio *rbio)
+{
+       struct bch_fs *c = rbio->c;
        struct bio *src = &rbio->bio;
-       struct bio *dst = &bch2_rbio_parent(rbio)->bio;
-       struct bvec_iter dst_iter = rbio->parent_iter;
+       struct bvec_iter dst_iter = rbio->bvec_iter;
        struct nonce nonce = extent_nonce(rbio->version,
-                               rbio->crc.nonce,
-                               crc_uncompressed_size(NULL, &rbio->crc),
-                               rbio->crc.compression_type);
+                               rbio->pick.crc.nonce,
+                               crc_uncompressed_size(NULL, &rbio->pick.crc),
+                               rbio->pick.crc.compression_type);
        struct bch_csum csum;
        int ret = 0;
 
@@ -877,130 +988,64 @@ static int bio_checksum_uncompress(struct bch_fs *c,
         * in order to promote
         */
        if (rbio->bounce) {
-               src->bi_iter.bi_size    = crc_compressed_size(NULL, &rbio->crc) << 9;
+               src->bi_iter.bi_size    = crc_compressed_size(NULL, &rbio->pick.crc) << 9;
                src->bi_iter.bi_idx     = 0;
                src->bi_iter.bi_bvec_done = 0;
        } else {
-               src->bi_iter = rbio->parent_iter;
+               src->bi_iter = rbio->bvec_iter;
        }
 
-       csum = bch2_checksum_bio(c, rbio->crc.csum_type, nonce, src);
-       if (bch2_dev_nonfatal_io_err_on(bch2_crc_cmp(rbio->crc.csum, csum),
-                                       rbio->ca,
+       csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type, nonce, src);
+       if (bch2_dev_io_err_on(bch2_crc_cmp(rbio->pick.crc.csum, csum),
+                              rbio->pick.ca,
                        "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)",
-                       rbio->inode, (u64) rbio->parent_iter.bi_sector << 9,
-                       rbio->crc.csum.hi, rbio->crc.csum.lo, csum.hi, csum.lo,
-                       rbio->crc.csum_type))
+                       rbio->inode, (u64) rbio->bvec_iter.bi_sector << 9,
+                       rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
+                       csum.hi, csum.lo,
+                       rbio->pick.crc.csum_type))
                ret = -EIO;
 
        /*
         * If there was a checksum error, still copy the data back - unless it
         * was compressed, we don't want to decompress bad data:
         */
-       if (rbio->crc.compression_type != BCH_COMPRESSION_NONE) {
+       if (rbio->pick.crc.compression_type != BCH_COMPRESSION_NONE) {
                if (!ret) {
-                       bch2_encrypt_bio(c, rbio->crc.csum_type, nonce, src);
+                       bch2_encrypt_bio(c, rbio->pick.crc.csum_type, nonce, src);
                        ret = bch2_bio_uncompress(c, src, dst,
-                                                dst_iter, rbio->crc);
+                                                dst_iter, rbio->pick.crc);
                        if (ret)
                                __bcache_io_error(c, "decompression error");
                }
        } else if (rbio->bounce) {
-               bio_advance(src, rbio->crc.offset << 9);
+               bio_advance(src, rbio->pick.crc.offset << 9);
 
                /* don't need to decrypt the entire bio: */
                BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
                src->bi_iter.bi_size = dst_iter.bi_size;
 
-               nonce = nonce_add(nonce, rbio->crc.offset << 9);
+               nonce = nonce_add(nonce, rbio->pick.crc.offset << 9);
 
-               bch2_encrypt_bio(c, rbio->crc.csum_type,
+               bch2_encrypt_bio(c, rbio->pick.crc.csum_type,
                                nonce, src);
 
-               bio_copy_data_iter(dst, dst_iter,
-                                  src, src->bi_iter);
+               bio_copy_data_iter(dst, &dst_iter,
+                                  src, &src->bi_iter);
        } else {
-               bch2_encrypt_bio(c, rbio->crc.csum_type, nonce, src);
+               bch2_encrypt_bio(c, rbio->pick.crc.csum_type, nonce, src);
        }
 
        return ret;
 }
 
-static void bch2_rbio_free(struct bch_read_bio *rbio)
-{
-       struct bch_fs *c = rbio->c;
-       struct bio *bio = &rbio->bio;
-
-       BUG_ON(rbio->ca);
-       BUG_ON(!rbio->split);
-
-       if (rbio->promote)
-               kfree(rbio->promote);
-       if (rbio->bounce)
-               bch2_bio_free_pages_pool(c, bio);
-
-       bio_put(bio);
-}
-
-static void bch2_rbio_done(struct bch_read_bio *rbio)
-{
-       struct bio *orig = &bch2_rbio_parent(rbio)->bio;
-
-       percpu_ref_put(&rbio->ca->io_ref);
-       rbio->ca = NULL;
-
-       if (rbio->split) {
-               if (rbio->bio.bi_error)
-                       orig->bi_error = rbio->bio.bi_error;
-
-               bio_endio(orig);
-               bch2_rbio_free(rbio);
-       } else {
-               if (rbio->promote)
-                       kfree(rbio->promote);
-
-               orig->bi_end_io = rbio->orig_bi_end_io;
-               bio_endio_nodec(orig);
-       }
-}
-
-static void bch2_rbio_error(struct bch_read_bio *rbio, int error)
-{
-       bch2_rbio_parent(rbio)->bio.bi_error = error;
-       bch2_rbio_done(rbio);
-}
-
-static void bch2_rbio_retry(struct bch_fs *c, struct bch_read_bio *rbio)
-{
-       unsigned long flags;
-
-       percpu_ref_put(&rbio->ca->io_ref);
-       rbio->ca = NULL;
-
-       spin_lock_irqsave(&c->read_retry_lock, flags);
-       bio_list_add(&c->read_retry_list, &rbio->bio);
-       spin_unlock_irqrestore(&c->read_retry_lock, flags);
-       queue_work(c->wq, &c->read_retry_work);
-}
-
-static void cache_promote_done(struct closure *cl)
-{
-       struct cache_promote_op *op =
-               container_of(cl, struct cache_promote_op, cl);
-
-       bch2_bio_free_pages_pool(op->write.op.c, &op->write.wbio.bio);
-       kfree(op);
-}
-
 /* Inner part that may run in process context */
 static void __bch2_read_endio(struct work_struct *work)
 {
        struct bch_read_bio *rbio =
                container_of(work, struct bch_read_bio, work);
-       struct bch_fs *c = rbio->c;
        int ret;
 
-       ret = bio_checksum_uncompress(c, rbio);
+       ret = bch2_rbio_checksum_uncompress(&bch2_rbio_parent(rbio)->bio, rbio);
        if (ret) {
                /*
                 * Checksum error: if the bio wasn't bounced, we may have been
@@ -1008,34 +1053,19 @@ static void __bch2_read_endio(struct work_struct *work)
                 * scribble over) - retry the read, bouncing it this time:
                 */
                if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
-                       rbio->flags |= BCH_READ_FORCE_BOUNCE;
-                       bch2_rbio_retry(c, rbio);
+                       rbio->flags |= BCH_READ_MUST_BOUNCE;
+                       bch2_rbio_error(rbio, READ_RETRY, ret);
                } else {
-                       bch2_rbio_error(rbio, -EIO);
+                       bch2_rbio_error(rbio, READ_RETRY_AVOID, ret);
                }
                return;
        }
 
-       if (rbio->promote) {
-               struct cache_promote_op *promote = rbio->promote;
-               struct closure *cl = &promote->cl;
-
-               BUG_ON(!rbio->split || !rbio->bounce);
-
-               trace_promote(&rbio->bio);
-
-               /* we now own pages: */
-               swap(promote->write.wbio.bio.bi_vcnt, rbio->bio.bi_vcnt);
-               rbio->promote = NULL;
-
-               bch2_rbio_done(rbio);
+       if (rbio->promote)
+               promote_start(rbio->promote, rbio);
 
-               closure_init(cl, &c->cl);
-               closure_call(&promote->write.op.cl, bch2_write, c->wq, cl);
-               closure_return_with_destructor(cl, cache_promote_done);
-       } else {
+       if (likely(!(rbio->flags & BCH_READ_IN_RETRY)))
                bch2_rbio_done(rbio);
-       }
 }
 
 static void bch2_read_endio(struct bio *bio)
@@ -1043,90 +1073,55 @@ static void bch2_read_endio(struct bio *bio)
        struct bch_read_bio *rbio =
                container_of(bio, struct bch_read_bio, bio);
        struct bch_fs *c = rbio->c;
+       struct workqueue_struct *wq = NULL;
+
+       percpu_ref_put(&rbio->pick.ca->io_ref);
 
-       if (bch2_dev_nonfatal_io_err_on(bio->bi_error, rbio->ca, "data read")) {
-               /* XXX: retry IO errors when we have another replica */
-               bch2_rbio_error(rbio, bio->bi_error);
+       if (!rbio->split)
+               rbio->bio.bi_end_io = rbio->end_io;
+
+       if (bch2_dev_io_err_on(bio->bi_error, rbio->pick.ca, "data read")) {
+               bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_error);
                return;
        }
 
-       if (rbio->ptr.cached &&
+       if (rbio->pick.ptr.cached &&
            (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
-            ptr_stale(rbio->ca, &rbio->ptr))) {
-               atomic_long_inc(&c->cache_read_races);
+            ptr_stale(rbio->pick.ca, &rbio->pick.ptr))) {
+               atomic_long_inc(&c->read_realloc_races);
 
                if (rbio->flags & BCH_READ_RETRY_IF_STALE)
-                       bch2_rbio_retry(c, rbio);
+                       bch2_rbio_error(rbio, READ_RETRY, -EINTR);
                else
-                       bch2_rbio_error(rbio, -EINTR);
+                       bch2_rbio_error(rbio, READ_ERR, -EINTR);
                return;
        }
 
-       if (rbio->crc.compression_type ||
-           bch2_csum_type_is_encryption(rbio->crc.csum_type))
-               queue_work(system_unbound_wq, &rbio->work);
-       else if (rbio->crc.csum_type)
-               queue_work(system_highpri_wq, &rbio->work);
-       else
-               __bch2_read_endio(&rbio->work);
-}
-
-static bool should_promote(struct bch_fs *c,
-                          struct extent_pick_ptr *pick, unsigned flags)
-{
-       if (!(flags & BCH_READ_PROMOTE))
-               return false;
-
-       if (percpu_ref_is_dying(&c->writes))
-               return false;
+       if (rbio->pick.crc.compression_type ||
+           bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
+               wq = system_unbound_wq;
+       else if (rbio->pick.crc.csum_type)
+               wq = system_highpri_wq;
 
-       return c->fastest_tier &&
-               c->fastest_tier < c->tiers + pick->ca->mi.tier;
+       bch2_rbio_punt(rbio, __bch2_read_endio, wq);
 }
 
-void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig,
-                         struct bvec_iter iter, struct bkey_s_c k,
-                         struct extent_pick_ptr *pick, unsigned flags)
+int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
+                      struct bvec_iter iter, struct bkey_s_c k,
+                      struct extent_pick_ptr *pick, unsigned flags)
 {
        struct bch_read_bio *rbio;
-       struct cache_promote_op *promote_op = NULL;
+       struct promote_op *promote_op = NULL;
        unsigned skip = iter.bi_sector - bkey_start_offset(k.k);
        bool bounce = false, split, read_full = false;
+       int ret = 0;
 
        bch2_increment_clock(c, bio_sectors(&orig->bio), READ);
+       PTR_BUCKET(pick->ca, &pick->ptr)->prio[READ] = c->prio_clock[READ].hand;
 
        EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector ||
                k.k->p.offset < bvec_iter_end_sector(iter));
 
-       /* only promote if we're not reading from the fastest tier: */
-
-       /*
-        * XXX: multiple promotes can race with each other, wastefully. Keep a
-        * list of outstanding promotes?
-        */
-       if (should_promote(c, pick, flags)) {
-               /*
-                * biovec needs to be big enough to hold decompressed data, if
-                * the bch2_write_extent() has to decompress/recompress it:
-                */
-               unsigned sectors =
-                       max_t(unsigned, k.k->size,
-                             crc_uncompressed_size(NULL, &pick->crc));
-               unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
-
-               promote_op = kmalloc(sizeof(*promote_op) +
-                               sizeof(struct bio_vec) * pages, GFP_NOIO);
-               if (promote_op) {
-                       struct bio *promote_bio = &promote_op->write.wbio.bio;
-
-                       bio_init(promote_bio);
-                       promote_bio->bi_max_vecs = pages;
-                       promote_bio->bi_io_vec  = promote_bio->bi_inline_vecs;
-                       bounce = true;
-                       /* could also set read_full */
-               }
-       }
-
        /*
         * note: if compression_type and crc_type both == none, then
         * compressed/uncompressed size is zero
@@ -1136,25 +1131,30 @@ void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig,
             (bvec_iter_sectors(iter) != crc_uncompressed_size(NULL, &pick->crc) ||
              (bch2_csum_type_is_encryption(pick->crc.csum_type) &&
               (flags & BCH_READ_USER_MAPPED)) ||
-             (flags & BCH_READ_FORCE_BOUNCE)))) {
+             (flags & BCH_READ_MUST_BOUNCE)))) {
                read_full = true;
                bounce = true;
        }
 
+       if (should_promote(c, pick, flags))
+               promote_op = promote_alloc(c, iter, k, pick, read_full);
+
+       /* could also set read_full */
+       if (promote_op)
+               bounce = true;
+
        if (bounce) {
                unsigned sectors = read_full
                        ? (crc_compressed_size(NULL, &pick->crc) ?: k.k->size)
                        : bvec_iter_sectors(iter);
 
-               rbio = container_of(bio_alloc_bioset(GFP_NOIO,
+               rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
                                        DIV_ROUND_UP(sectors, PAGE_SECTORS),
-                                       &c->bio_read_split),
-                                   struct bch_read_bio, bio);
+                                       &c->bio_read_split));
 
                bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
                split = true;
-       } else if (!(flags & BCH_READ_MAY_REUSE_BIO) ||
-                  !(flags & BCH_READ_IS_LAST)) {
+       } else if (flags & BCH_READ_MUST_CLONE) {
                /*
                 * Have to clone if there were any splits, due to error
                 * reporting issues (if a split errored, and retrying didn't
@@ -1163,9 +1163,8 @@ void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig,
                 * from the whole bio, in which case we don't want to retry and
                 * lose the error)
                 */
-               rbio = container_of(bio_clone_fast(&orig->bio,
-                                       GFP_NOIO, &c->bio_read_split),
-                                   struct bch_read_bio, bio);
+               rbio = rbio_init(bio_clone_fast(&orig->bio,
+                                             GFP_NOIO, &c->bio_read_split));
                rbio->bio.bi_iter = iter;
                split = true;
        } else {
@@ -1175,80 +1174,39 @@ void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig,
                BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
        }
 
-       if (!(flags & BCH_READ_IS_LAST))
-               __bio_inc_remaining(&orig->bio);
+       rbio->c                 = c;
 
        if (split)
                rbio->parent    = orig;
        else
-               rbio->orig_bi_end_io = orig->bio.bi_end_io;
-       rbio->parent_iter       = iter;
+               rbio->end_io    = orig->bio.bi_end_io;
 
+       rbio->bvec_iter         = iter;
        rbio->flags             = flags;
        rbio->bounce            = bounce;
        rbio->split             = split;
-       rbio->c                 = c;
-       rbio->ca                = pick->ca;
-       rbio->ptr               = pick->ptr;
-       rbio->crc               = pick->crc;
+       rbio->process_context   = false;
+       rbio->retry             = 0;
+       rbio->pick              = *pick;
        /*
         * crc.compressed_size will be 0 if there wasn't any checksum
         * information, also we need to stash the original size of the bio if we
         * bounced (which isn't necessarily the original key size, if we bounced
         * only for promoting)
         */
-       rbio->crc._compressed_size = bio_sectors(&rbio->bio) - 1;
+       rbio->pick.crc._compressed_size = bio_sectors(&rbio->bio) - 1;
        rbio->version           = k.k->version;
        rbio->promote           = promote_op;
        rbio->inode             = k.k->p.inode;
-       INIT_WORK(&rbio->work, __bch2_read_endio);
+       INIT_WORK(&rbio->work, NULL);
 
        rbio->bio.bi_bdev       = pick->ca->disk_sb.bdev;
        rbio->bio.bi_opf        = orig->bio.bi_opf;
        rbio->bio.bi_iter.bi_sector = pick->ptr.offset;
        rbio->bio.bi_end_io     = bch2_read_endio;
 
-       if (promote_op) {
-               struct bio *promote_bio = &promote_op->write.wbio.bio;
-
-               promote_bio->bi_iter = rbio->bio.bi_iter;
-               memcpy(promote_bio->bi_io_vec, rbio->bio.bi_io_vec,
-                      sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
-
-               bch2_migrate_write_init(c, &promote_op->write,
-                                      &c->promote_write_point,
-                                      k, NULL,
-                                      BCH_WRITE_ALLOC_NOWAIT|
-                                      BCH_WRITE_CACHED);
-               promote_op->write.promote = true;
-
-               if (rbio->crc.compression_type) {
-                       promote_op->write.op.flags |= BCH_WRITE_DATA_COMPRESSED;
-                       promote_op->write.op.crc = rbio->crc;
-                       promote_op->write.op.size = k.k->size;
-               } else if (read_full) {
-                       /*
-                        * Adjust bio to correspond to _live_ portion of @k -
-                        * which might be less than what we're actually reading:
-                        */
-                       bio_advance(promote_bio, rbio->crc.offset << 9);
-                       BUG_ON(bio_sectors(promote_bio) < k.k->size);
-                       promote_bio->bi_iter.bi_size = k.k->size << 9;
-               } else {
-                       /*
-                        * Set insert pos to correspond to what we're actually
-                        * reading:
-                        */
-                       promote_op->write.op.pos.offset = iter.bi_sector;
-               }
-
-               promote_bio->bi_iter.bi_sector =
-                       promote_op->write.op.pos.offset;
-       }
-
-       /* _after_ promete stuff has looked at rbio->crc.offset */
        if (read_full)
-               rbio->crc.offset += skip;
+               rbio->pick.crc.offset += skip;
        else
                rbio->bio.bi_iter.bi_sector += skip;
 
@@ -1257,27 +1215,39 @@ void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig,
        if (bounce)
                trace_read_bounce(&rbio->bio);
 
-       if (!(flags & BCH_READ_IS_LAST))
-               trace_read_split(&rbio->bio);
+       this_cpu_add(pick->ca->io_done->sectors[READ][BCH_DATA_USER],
+                    bio_sectors(&rbio->bio));
+
+       if (likely(!(flags & BCH_READ_IN_RETRY))) {
+               submit_bio(&rbio->bio);
+       } else {
+               submit_bio_wait(&rbio->bio);
+
+               rbio->process_context = true;
+               bch2_read_endio(&rbio->bio);
 
-       generic_make_request(&rbio->bio);
+               ret = rbio->retry;
+               if (!ret)
+                       bch2_rbio_done(rbio);
+       }
+
+       return ret;
 }
 
-static void bch2_read_iter(struct bch_fs *c, struct bch_read_bio *rbio,
-                         struct bvec_iter bvec_iter, u64 inode,
-                         unsigned flags)
+void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
+                struct bvec_iter bvec_iter, u64 inode,
+                struct bch_devs_mask *avoid, unsigned flags)
 {
-       struct bio *bio = &rbio->bio;
        struct btree_iter iter;
        struct bkey_s_c k;
        int ret;
-
-       for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS,
-                                     POS(inode, bvec_iter.bi_sector), k) {
+retry:
+       for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+                          POS(inode, bvec_iter.bi_sector),
+                          BTREE_ITER_WITH_HOLES, k) {
                BKEY_PADDED(k) tmp;
                struct extent_pick_ptr pick;
-               unsigned bytes, sectors;
-               bool is_last;
+               struct bvec_iter fragment;
 
                /*
                 * Unlock the iterator while the btree node's lock is still in
@@ -1287,43 +1257,47 @@ static void bch2_read_iter(struct bch_fs *c, struct bch_read_bio *rbio,
                k = bkey_i_to_s_c(&tmp.k);
                bch2_btree_iter_unlock(&iter);
 
-               bch2_extent_pick_ptr(c, k, &pick);
+               bch2_extent_pick_ptr(c, k, avoid, &pick);
                if (IS_ERR(pick.ca)) {
-                       bcache_io_error(c, bio, "no device to read from");
-                       bio_endio(bio);
+                       bcache_io_error(c, &rbio->bio, "no device to read from");
+                       bio_endio(&rbio->bio);
                        return;
                }
 
-               sectors = min_t(u64, k.k->p.offset,
-                               bvec_iter_end_sector(bvec_iter)) -
-                       bvec_iter.bi_sector;
-               bytes = sectors << 9;
-               is_last = bytes == bvec_iter.bi_size;
-               swap(bvec_iter.bi_size, bytes);
-
-               if (is_last)
-                       flags |= BCH_READ_IS_LAST;
+               fragment = bvec_iter;
+               fragment.bi_size = (min_t(u64, k.k->p.offset,
+                                         bvec_iter_end_sector(bvec_iter)) -
+                                   bvec_iter.bi_sector) << 9;
 
                if (pick.ca) {
-                       PTR_BUCKET(pick.ca, &pick.ptr)->read_prio =
-                               c->prio_clock[READ].hand;
-
-                       bch2_read_extent_iter(c, rbio, bvec_iter,
-                                            k, &pick, flags);
+                       if (fragment.bi_size != bvec_iter.bi_size) {
+                               bio_inc_remaining(&rbio->bio);
+                               flags |= BCH_READ_MUST_CLONE;
+                               trace_read_split(&rbio->bio);
+                       }
 
-                       flags &= ~BCH_READ_MAY_REUSE_BIO;
+                       ret = __bch2_read_extent(c, rbio, fragment,
+                                                k, &pick, flags);
+                       switch (ret) {
+                       case READ_RETRY_AVOID:
+                               __set_bit(pick.ca->dev_idx, avoid->d);
+                       case READ_RETRY:
+                               goto retry;
+                       case READ_ERR:
+                               bio_endio(&rbio->bio);
+                               return;
+                       };
                } else {
-                       zero_fill_bio_iter(bio, bvec_iter);
+                       zero_fill_bio_iter(&rbio->bio, fragment);
 
-                       if (is_last)
-                               bio_endio(bio);
+                       if (fragment.bi_size == bvec_iter.bi_size)
+                               bio_endio(&rbio->bio);
                }
 
-               if (is_last)
+               if (fragment.bi_size == bvec_iter.bi_size)
                        return;
 
-               swap(bvec_iter.bi_size, bytes);
-               bio_advance_iter(bio, &bvec_iter, bytes);
+               bio_advance_iter(&rbio->bio, &bvec_iter, fragment.bi_size);
        }
 
        /*
@@ -1332,56 +1306,6 @@ static void bch2_read_iter(struct bch_fs *c, struct bch_read_bio *rbio,
         */
        ret = bch2_btree_iter_unlock(&iter);
        BUG_ON(!ret);
-       bcache_io_error(c, bio, "btree IO error %i", ret);
-       bio_endio(bio);
-}
-
-void bch2_read(struct bch_fs *c, struct bch_read_bio *bio, u64 inode)
-{
-       bch2_read_iter(c, bio, bio->bio.bi_iter, inode,
-                     BCH_READ_RETRY_IF_STALE|
-                     BCH_READ_PROMOTE|
-                     BCH_READ_MAY_REUSE_BIO|
-                     BCH_READ_USER_MAPPED);
-}
-
-/**
- * bch_read_retry - re-submit a bio originally from bch2_read()
- */
-static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio)
-{
-       struct bch_read_bio *parent = bch2_rbio_parent(rbio);
-       struct bvec_iter iter = rbio->parent_iter;
-       unsigned flags = rbio->flags;
-       u64 inode = rbio->inode;
-
-       trace_read_retry(&rbio->bio);
-
-       if (rbio->split)
-               bch2_rbio_free(rbio);
-       else
-               rbio->bio.bi_end_io = rbio->orig_bi_end_io;
-
-       bch2_read_iter(c, parent, iter, inode, flags);
-}
-
-void bch2_read_retry_work(struct work_struct *work)
-{
-       struct bch_fs *c = container_of(work, struct bch_fs,
-                                          read_retry_work);
-       struct bch_read_bio *rbio;
-       struct bio *bio;
-       unsigned long flags;
-
-       while (1) {
-               spin_lock_irqsave(&c->read_retry_lock, flags);
-               bio = bio_list_pop(&c->read_retry_list);
-               spin_unlock_irqrestore(&c->read_retry_lock, flags);
-
-               if (!bio)
-                       break;
-
-               rbio = container_of(bio, struct bch_read_bio, bio);
-               bch2_read_retry(c, rbio);
-       }
+       bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
+       bio_endio(&rbio->bio);
 }
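
The retry machinery introduced above classifies each failed read as READ_RETRY_AVOID (the device reported an error: retry, but skip that device), READ_RETRY (e.g. a stale cached pointer: retry anywhere) or READ_ERR (give up), and __bch2_read() feeds the accumulated avoid mask back into bch2_extent_pick_ptr() on the next pass. A minimal userspace model of that policy, with invented helpers (read_from_dev(), pick_replica()) standing in for the real I/O and pointer-picking code:

/*
 * Toy model of the read-retry policy, not bcachefs code: a failed read
 * either marks its device in the avoid mask and retries, retries
 * as-is, or fails hard.
 */
#include <stdbool.h>
#include <stdio.h>

#define READ_RETRY_AVOID	1
#define READ_RETRY		2
#define READ_ERR		3
#define NR_DEVS			4

struct avoid_mask { bool d[NR_DEVS]; };

/* pretend device 1 throws an I/O error; everything else succeeds */
static int read_from_dev(unsigned dev)
{
	return dev == 1 ? READ_RETRY_AVOID : 0;
}

/* pick the first replica not in the avoid mask, or -1 */
static int pick_replica(const unsigned *replicas, unsigned nr,
			const struct avoid_mask *avoid)
{
	unsigned i;

	for (i = 0; i < nr; i++)
		if (!avoid->d[replicas[i]])
			return replicas[i];
	return -1;
}

int main(void)
{
	unsigned replicas[] = { 1, 3 };	/* extent has copies on devs 1 and 3 */
	struct avoid_mask avoid = { { false } };
	int dev, ret;

retry:
	dev = pick_replica(replicas, 2, &avoid);
	if (dev < 0) {
		printf("no device to read from\n");
		return 1;
	}

	ret = read_from_dev(dev);
	switch (ret) {
	case READ_RETRY_AVOID:
		avoid.d[dev] = true;
		/* fall through */
	case READ_RETRY:
		goto retry;
	case READ_ERR:
		printf("read error\n");
		return 1;
	}

	printf("read succeeded from dev %d\n", dev);
	return 0;
}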