X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fio.c;h=9eed97bac5449fa9845a29fc149326be038493f9;hb=e7c2bb91bce30a987c8c4e2875f2c63e887d3aa5;hp=78cdaa32c0a19272920ff2afb64117769830a6cd;hpb=565b4a74d6c25c78b0d2b82d9529595fc6269308;p=bcachefs-tools-debian diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 78cdaa3..9eed97b 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -27,13 +27,6 @@ #include -static inline void __bio_inc_remaining(struct bio *bio) -{ - bio_set_flag(bio, BIO_CHAIN); - smp_mb__before_atomic(); - atomic_inc(&bio->__bi_remaining); -} - /* Allocate, free from mempool: */ void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) @@ -97,6 +90,9 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, BUG_ON(c->opts.nochanges); extent_for_each_ptr(e, ptr) { + BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || + !c->devs[ptr->dev]); + ca = c->devs[ptr->dev]; if (ptr + 1 < &extent_entry_last(e)->ptr) { @@ -110,7 +106,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, n->bounce = false; n->put_bio = true; n->bio.bi_opf = wbio->bio.bi_opf; - __bio_inc_remaining(&wbio->bio); + bio_inc_remaining(&wbio->bio); } else { n = wbio; n->split = false; @@ -128,7 +124,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, if (likely(percpu_ref_tryget(&ca->io_ref))) { n->have_io_ref = true; n->bio.bi_bdev = ca->disk_sb.bdev; - generic_make_request(&n->bio); + submit_bio(&n->bio); } else { n->have_io_ref = false; bcache_io_error(c, &n->bio, "device has been removed"); @@ -241,68 +237,41 @@ static void bch2_write_index(struct closure *cl) } } -/** - * bch_write_discard - discard range of keys - * - * Used to implement discard, and to handle when writethrough write hits - * a write error on the cache device. - */ -static void bch2_write_discard(struct closure *cl) -{ - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bpos end = op->pos; - - end.offset += bio_sectors(&op->wbio.bio); - - op->error = bch2_discard(op->c, op->pos, end, op->version, - &op->res, NULL, NULL); -} - -/* - * Convert extents to be inserted to discards after an error: - */ static void bch2_write_io_error(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct keylist *keys = &op->insert_keys; + struct bch_fs *c = op->c; + struct bch_extent_ptr *ptr; + struct bkey_i *k; + int ret; - if (op->flags & BCH_WRITE_DISCARD_ON_ERROR) { - struct bkey_i *src = bch2_keylist_front(&op->insert_keys); - struct bkey_i *dst = bch2_keylist_front(&op->insert_keys); - - /* - * Our data write just errored, which means we've got a bunch - * of keys to insert that point to data that wasn't - * successfully written. - * - * We don't have to insert those keys but we still have to - * invalidate that region of the cache - so, if we just strip - * off all the pointers from the keys we'll accomplish just - * that. 
- */ + for_each_keylist_key(keys, k) { + struct bkey_i *n = bkey_next(k); + struct bkey_s_extent e = bkey_i_to_s_extent(k); - while (src != op->insert_keys.top) { - struct bkey_i *n = bkey_next(src); + extent_for_each_ptr_backwards(e, ptr) + if (test_bit(ptr->dev, op->failed.d)) + bch2_extent_drop_ptr(e, ptr); - set_bkey_val_u64s(&src->k, 0); - src->k.type = KEY_TYPE_DISCARD; - bkey_copy(dst, src); + memmove(bkey_next(k), n, (void *) keys->top - (void *) n); + keys->top_p -= (u64 *) n - (u64 *) bkey_next(k); - dst = bkey_next(dst); - src = n; + ret = bch2_extent_nr_ptrs(e.c) + ? bch2_check_mark_super(c, e.c, BCH_DATA_USER) + : -EIO; + if (ret) { + keys->top = keys->keys; + op->error = ret; + op->flags |= BCH_WRITE_DONE; + break; } - - op->insert_keys.top = dst; - op->flags |= BCH_WRITE_DISCARD; - } else { - /* TODO: We could try to recover from this. */ - while (!bch2_keylist_empty(&op->insert_keys)) - bch2_keylist_pop_front(&op->insert_keys); - - op->error = -EIO; - op->flags |= BCH_WRITE_DONE; } + memset(&op->failed, 0, sizeof(op->failed)); + bch2_write_index(cl); + return; } static void bch2_write_endio(struct bio *bio) @@ -314,9 +283,10 @@ static void bch2_write_endio(struct bio *bio) struct bch_fs *c = wbio->c; struct bch_dev *ca = wbio->ca; - if (bch2_dev_nonfatal_io_err_on(bio->bi_error, ca, - "data write")) + if (bch2_dev_io_err_on(bio->bi_error, ca, "data write")) { + set_bit(ca->dev_idx, op->failed.d); set_closure_fn(cl, bch2_write_io_error, index_update_wq(op)); + } if (wbio->have_io_ref) percpu_ref_put(&ca->io_ref); @@ -538,14 +508,6 @@ static void __bch2_write(struct closure *cl) struct open_bucket *b; int ret; - memset(op->open_buckets, 0, sizeof(op->open_buckets)); - - if (op->flags & BCH_WRITE_DISCARD) { - bch2_write_discard(cl); - op->flags |= BCH_WRITE_DONE; - continue_at(cl, bch2_write_done, index_update_wq(op)); - } - do { if (open_bucket_nr == ARRAY_SIZE(op->open_buckets)) continue_at(cl, bch2_write_index, index_update_wq(op)); @@ -614,27 +576,15 @@ static void __bch2_write(struct closure *cl) op->flags |= BCH_WRITE_DONE; continue_at(cl, bch2_write_index, index_update_wq(op)); err: - if (op->flags & BCH_WRITE_DISCARD_ON_ERROR) { - /* - * If we were writing cached data, not doing the write is fine - * so long as we discard whatever would have been overwritten - - * then it's equivalent to doing the write and immediately - * reclaiming it. - */ - - bch2_write_discard(cl); - } else { - /* - * Right now we can only error here if we went RO - the - * allocation failed, but we already checked for -ENOSPC when we - * got our reservation. - * - * XXX capacity might have changed, but we don't check for that - * yet: - */ - op->error = ret; - } - + /* + * Right now we can only error here if we went RO - the + * allocation failed, but we already checked for -ENOSPC when we + * got our reservation. 
+ * + * XXX capacity might have changed, but we don't check for that + * yet: + */ + op->error = ret; op->flags |= BCH_WRITE_DONE; /* @@ -707,14 +657,13 @@ void bch2_write(struct closure *cl) op->version.lo = atomic64_inc_return(&c->key_version) + 1; - if (!(op->flags & BCH_WRITE_DISCARD)) - bch2_increment_clock(c, bio_sectors(bio), WRITE); + bch2_increment_clock(c, bio_sectors(bio), WRITE); /* Don't call bch2_next_delay() if rate is >= 1 GB/sec */ if (c->foreground_write_ratelimit_enabled && c->foreground_write_pd.rate.rate < (1 << 30) && - !(op->flags & BCH_WRITE_DISCARD) && op->wp->throttle) { + op->wp->throttle) { unsigned long flags; u64 delay; @@ -784,6 +733,9 @@ void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->index_update_fn = bch2_write_index_default; + memset(op->open_buckets, 0, sizeof(op->open_buckets)); + memset(&op->failed, 0, sizeof(op->failed)); + bch2_keylist_init(&op->insert_keys, op->inline_keys, ARRAY_SIZE(op->inline_keys)); @@ -792,53 +744,228 @@ void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, get_random_bytes(&op->version, sizeof(op->version)); } -/* Discard */ - -/* bch_discard - discard a range of keys from start_key to end_key. - * @c filesystem - * @start_key pointer to start location - * NOTE: discard starts at bkey_start_offset(start_key) - * @end_key pointer to end location - * NOTE: discard ends at KEY_OFFSET(end_key) - * @version version of discard (0ULL if none) - * - * Returns: - * 0 on success - * <0 on error - * - * XXX: this needs to be refactored with inode_truncate, or more - * appropriately inode_truncate should call this - */ -int bch2_discard(struct bch_fs *c, struct bpos start, - struct bpos end, struct bversion version, - struct disk_reservation *disk_res, - struct extent_insert_hook *hook, - u64 *journal_seq) -{ - return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, start, end, version, - disk_res, hook, journal_seq); -} - /* Cache promotion on read */ -struct cache_promote_op { +struct promote_op { struct closure cl; struct migrate_write write; struct bio_vec bi_inline_vecs[0]; /* must be last */ }; +static void promote_done(struct closure *cl) +{ + struct promote_op *op = + container_of(cl, struct promote_op, cl); + struct bch_fs *c = op->write.op.c; + + percpu_ref_put(&c->writes); + bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio); + kfree(op); +} + +static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) +{ + struct bch_fs *c = rbio->c; + struct closure *cl = &op->cl; + struct bio *bio = &op->write.op.wbio.bio; + + BUG_ON(!rbio->split || !rbio->bounce); + + if (!percpu_ref_tryget(&c->writes)) + return; + + trace_promote(&rbio->bio); + + /* we now own pages: */ + swap(bio->bi_vcnt, rbio->bio.bi_vcnt); + memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, + sizeof(struct bio_vec) * bio->bi_vcnt); + rbio->promote = NULL; + + closure_init(cl, NULL); + closure_call(&op->write.op.cl, bch2_write, c->wq, cl); + closure_return_with_destructor(cl, promote_done); +} + +/* + * XXX: multiple promotes can race with each other, wastefully. Keep a list of + * outstanding promotes? 
+ */ +static struct promote_op *promote_alloc(struct bch_fs *c, + struct bvec_iter iter, + struct bkey_s_c k, + struct extent_pick_ptr *pick, + bool read_full) +{ + struct promote_op *op; + struct bio *bio; + /* + * biovec needs to be big enough to hold decompressed data, if + * bch2_write_extent() has to decompress/recompress it: + */ + unsigned sectors = max_t(unsigned, k.k->size, + crc_uncompressed_size(NULL, &pick->crc)); + unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); + + op = kmalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO); + if (!op) + return NULL; + + bio = &op->write.op.wbio.bio; + bio_init(bio, bio->bi_inline_vecs, pages); + + bio->bi_iter = iter; + + if (pick->crc.compression_type) { + op->write.op.flags |= BCH_WRITE_DATA_COMPRESSED; + op->write.op.crc = pick->crc; + op->write.op.size = k.k->size; + } else if (read_full) { + /* + * Adjust bio to correspond to _live_ portion of @k - + * which might be less than what we're actually reading: + */ + bio_advance(bio, pick->crc.offset << 9); + BUG_ON(bio_sectors(bio) < k.k->size); + bio->bi_iter.bi_size = k.k->size << 9; + } else { + /* + * Set insert pos to correspond to what we're actually + * reading: + */ + op->write.op.pos.offset = iter.bi_sector; + } + bch2_migrate_write_init(c, &op->write, &c->promote_write_point, + k, NULL, + BCH_WRITE_ALLOC_NOWAIT| + BCH_WRITE_CACHED); + op->write.promote = true; + + return op; +} + +/* only promote if we're not reading from the fastest tier: */ +static bool should_promote(struct bch_fs *c, + struct extent_pick_ptr *pick, unsigned flags) +{ + if (!(flags & BCH_READ_MAY_PROMOTE)) + return false; + + if (flags & BCH_READ_IN_RETRY) + return false; + + if (percpu_ref_is_dying(&c->writes)) + return false; + + return c->fastest_tier && + c->fastest_tier < c->tiers + pick->ca->mi.tier; +} + /* Read */ -static int bio_checksum_uncompress(struct bch_fs *c, - struct bch_read_bio *rbio) +#define READ_RETRY_AVOID 1 +#define READ_RETRY 2 +#define READ_ERR 3 + +static inline struct bch_read_bio * +bch2_rbio_parent(struct bch_read_bio *rbio) { + return rbio->split ? 
rbio->parent : rbio; +} + +__always_inline +static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, + struct workqueue_struct *wq) +{ + + if (!wq || rbio->process_context) { + fn(&rbio->work); + } else { + rbio->work.func = fn; + rbio->process_context = true; + queue_work(wq, &rbio->work); + } +} + +static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) +{ + struct bch_read_bio *parent = rbio->parent; + + BUG_ON(!rbio->split); + + if (rbio->promote) + kfree(rbio->promote); + if (rbio->bounce) + bch2_bio_free_pages_pool(rbio->c, &rbio->bio); + bio_put(&rbio->bio); + + return parent; +} + +static void bch2_rbio_done(struct bch_read_bio *rbio) +{ + if (rbio->promote) + kfree(rbio->promote); + rbio->promote = NULL; + + if (rbio->split) + rbio = bch2_rbio_free(rbio); + bio_endio(&rbio->bio); +} + +static void bch2_rbio_retry(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct bvec_iter iter = rbio->bvec_iter; + unsigned flags = rbio->flags; + u64 inode = rbio->inode; + struct bch_devs_mask avoid; + + trace_read_retry(&rbio->bio); + + memset(&avoid, 0, sizeof(avoid)); + + if (rbio->retry == READ_RETRY_AVOID) + __set_bit(rbio->pick.ca->dev_idx, avoid.d); + + if (rbio->split) + rbio = bch2_rbio_free(rbio); + else + rbio->bio.bi_error = 0; + + flags |= BCH_READ_MUST_CLONE; + flags |= BCH_READ_IN_RETRY; + + __bch2_read(c, rbio, iter, inode, &avoid, flags); +} + +static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error) +{ + rbio->retry = retry; + + if (rbio->flags & BCH_READ_IN_RETRY) + return; + + if (retry == READ_ERR) { + bch2_rbio_parent(rbio)->bio.bi_error = error; + bch2_rbio_done(rbio); + } else { + bch2_rbio_punt(rbio, bch2_rbio_retry, rbio->c->wq); + } +} + +static int bch2_rbio_checksum_uncompress(struct bio *dst, + struct bch_read_bio *rbio) +{ + struct bch_fs *c = rbio->c; struct bio *src = &rbio->bio; - struct bio *dst = &bch2_rbio_parent(rbio)->bio; - struct bvec_iter dst_iter = rbio->parent_iter; + struct bvec_iter dst_iter = rbio->bvec_iter; struct nonce nonce = extent_nonce(rbio->version, - rbio->crc.nonce, - crc_uncompressed_size(NULL, &rbio->crc), - rbio->crc.compression_type); + rbio->pick.crc.nonce, + crc_uncompressed_size(NULL, &rbio->pick.crc), + rbio->pick.crc.compression_type); struct bch_csum csum; int ret = 0; @@ -849,130 +976,64 @@ static int bio_checksum_uncompress(struct bch_fs *c, * in order to promote */ if (rbio->bounce) { - src->bi_iter.bi_size = crc_compressed_size(NULL, &rbio->crc) << 9; + src->bi_iter.bi_size = crc_compressed_size(NULL, &rbio->pick.crc) << 9; src->bi_iter.bi_idx = 0; src->bi_iter.bi_bvec_done = 0; } else { - src->bi_iter = rbio->parent_iter; + src->bi_iter = rbio->bvec_iter; } - csum = bch2_checksum_bio(c, rbio->crc.csum_type, nonce, src); - if (bch2_dev_nonfatal_io_err_on(bch2_crc_cmp(rbio->crc.csum, csum), - rbio->ca, + csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type, nonce, src); + if (bch2_dev_io_err_on(bch2_crc_cmp(rbio->pick.crc.csum, csum), + rbio->pick.ca, "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)", - rbio->inode, (u64) rbio->parent_iter.bi_sector << 9, - rbio->crc.csum.hi, rbio->crc.csum.lo, csum.hi, csum.lo, - rbio->crc.csum_type)) + rbio->inode, (u64) rbio->bvec_iter.bi_sector << 9, + rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, + csum.hi, csum.lo, + rbio->pick.crc.csum_type)) ret = -EIO; /* * If there was a checksum error, still 
copy the data back - unless it * was compressed, we don't want to decompress bad data: */ - if (rbio->crc.compression_type != BCH_COMPRESSION_NONE) { + if (rbio->pick.crc.compression_type != BCH_COMPRESSION_NONE) { if (!ret) { - bch2_encrypt_bio(c, rbio->crc.csum_type, nonce, src); + bch2_encrypt_bio(c, rbio->pick.crc.csum_type, nonce, src); ret = bch2_bio_uncompress(c, src, dst, - dst_iter, rbio->crc); + dst_iter, rbio->pick.crc); if (ret) __bcache_io_error(c, "decompression error"); } } else if (rbio->bounce) { - bio_advance(src, rbio->crc.offset << 9); + bio_advance(src, rbio->pick.crc.offset << 9); /* don't need to decrypt the entire bio: */ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); src->bi_iter.bi_size = dst_iter.bi_size; - nonce = nonce_add(nonce, rbio->crc.offset << 9); + nonce = nonce_add(nonce, rbio->pick.crc.offset << 9); - bch2_encrypt_bio(c, rbio->crc.csum_type, + bch2_encrypt_bio(c, rbio->pick.crc.csum_type, nonce, src); bio_copy_data_iter(dst, &dst_iter, src, &src->bi_iter); } else { - bch2_encrypt_bio(c, rbio->crc.csum_type, nonce, src); + bch2_encrypt_bio(c, rbio->pick.crc.csum_type, nonce, src); } return ret; } -static void bch2_rbio_free(struct bch_read_bio *rbio) -{ - struct bch_fs *c = rbio->c; - struct bio *bio = &rbio->bio; - - BUG_ON(rbio->ca); - BUG_ON(!rbio->split); - - if (rbio->promote) - kfree(rbio->promote); - if (rbio->bounce) - bch2_bio_free_pages_pool(c, bio); - - bio_put(bio); -} - -static void bch2_rbio_done(struct bch_read_bio *rbio) -{ - struct bio *orig = &bch2_rbio_parent(rbio)->bio; - - percpu_ref_put(&rbio->ca->io_ref); - rbio->ca = NULL; - - if (rbio->split) { - if (rbio->bio.bi_error) - orig->bi_error = rbio->bio.bi_error; - - bio_endio(orig); - bch2_rbio_free(rbio); - } else { - if (rbio->promote) - kfree(rbio->promote); - - orig->bi_end_io = rbio->orig_bi_end_io; - bio_endio_nodec(orig); - } -} - -static void bch2_rbio_error(struct bch_read_bio *rbio, int error) -{ - bch2_rbio_parent(rbio)->bio.bi_error = error; - bch2_rbio_done(rbio); -} - -static void bch2_rbio_retry(struct bch_fs *c, struct bch_read_bio *rbio) -{ - unsigned long flags; - - percpu_ref_put(&rbio->ca->io_ref); - rbio->ca = NULL; - - spin_lock_irqsave(&c->read_retry_lock, flags); - bio_list_add(&c->read_retry_list, &rbio->bio); - spin_unlock_irqrestore(&c->read_retry_lock, flags); - queue_work(c->wq, &c->read_retry_work); -} - -static void cache_promote_done(struct closure *cl) -{ - struct cache_promote_op *op = - container_of(cl, struct cache_promote_op, cl); - - bch2_bio_free_pages_pool(op->write.op.c, &op->write.op.wbio.bio); - kfree(op); -} - /* Inner part that may run in process context */ static void __bch2_read_endio(struct work_struct *work) { struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; int ret; - ret = bio_checksum_uncompress(c, rbio); + ret = bch2_rbio_checksum_uncompress(&bch2_rbio_parent(rbio)->bio, rbio); if (ret) { /* * Checksum error: if the bio wasn't bounced, we may have been @@ -980,34 +1041,19 @@ static void __bch2_read_endio(struct work_struct *work) * scribble over) - retry the read, bouncing it this time: */ if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { - rbio->flags |= BCH_READ_FORCE_BOUNCE; - bch2_rbio_retry(c, rbio); + rbio->flags |= BCH_READ_MUST_BOUNCE; + bch2_rbio_error(rbio, READ_RETRY, ret); } else { - bch2_rbio_error(rbio, -EIO); + bch2_rbio_error(rbio, READ_RETRY_AVOID, ret); } return; } - if (rbio->promote) { - struct cache_promote_op *promote = rbio->promote; - 
struct closure *cl = &promote->cl; - - BUG_ON(!rbio->split || !rbio->bounce); - - trace_promote(&rbio->bio); - - /* we now own pages: */ - swap(promote->write.op.wbio.bio.bi_vcnt, rbio->bio.bi_vcnt); - rbio->promote = NULL; - - bch2_rbio_done(rbio); + if (rbio->promote) + promote_start(rbio->promote, rbio); - closure_init(cl, &c->cl); - closure_call(&promote->write.op.cl, bch2_write, c->wq, cl); - closure_return_with_destructor(cl, cache_promote_done); - } else { + if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) bch2_rbio_done(rbio); - } } static void bch2_read_endio(struct bio *bio) @@ -1015,90 +1061,55 @@ static void bch2_read_endio(struct bio *bio) struct bch_read_bio *rbio = container_of(bio, struct bch_read_bio, bio); struct bch_fs *c = rbio->c; + struct workqueue_struct *wq = NULL; + + percpu_ref_put(&rbio->pick.ca->io_ref); - if (bch2_dev_nonfatal_io_err_on(bio->bi_error, rbio->ca, "data read")) { - /* XXX: retry IO errors when we have another replica */ - bch2_rbio_error(rbio, bio->bi_error); + if (!rbio->split) + rbio->bio.bi_end_io = rbio->end_io; + + if (bch2_dev_io_err_on(bio->bi_error, rbio->pick.ca, "data read")) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_error); return; } - if (rbio->ptr.cached && + if (rbio->pick.ptr.cached && (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || - ptr_stale(rbio->ca, &rbio->ptr))) { + ptr_stale(rbio->pick.ca, &rbio->pick.ptr))) { atomic_long_inc(&c->read_realloc_races); if (rbio->flags & BCH_READ_RETRY_IF_STALE) - bch2_rbio_retry(c, rbio); + bch2_rbio_error(rbio, READ_RETRY, -EINTR); else - bch2_rbio_error(rbio, -EINTR); + bch2_rbio_error(rbio, READ_ERR, -EINTR); return; } - if (rbio->crc.compression_type || - bch2_csum_type_is_encryption(rbio->crc.csum_type)) - queue_work(system_unbound_wq, &rbio->work); - else if (rbio->crc.csum_type) - queue_work(system_highpri_wq, &rbio->work); - else - __bch2_read_endio(&rbio->work); -} - -static bool should_promote(struct bch_fs *c, - struct extent_pick_ptr *pick, unsigned flags) -{ - if (!(flags & BCH_READ_PROMOTE)) - return false; - - if (percpu_ref_is_dying(&c->writes)) - return false; + if (rbio->pick.crc.compression_type || + bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) + wq = system_unbound_wq; + else if (rbio->pick.crc.csum_type) + wq = system_highpri_wq; - return c->fastest_tier && - c->fastest_tier < c->tiers + pick->ca->mi.tier; + bch2_rbio_punt(rbio, __bch2_read_endio, wq); } -void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig, - struct bvec_iter iter, struct bkey_s_c k, - struct extent_pick_ptr *pick, unsigned flags) +int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, + struct bvec_iter iter, struct bkey_s_c k, + struct extent_pick_ptr *pick, unsigned flags) { struct bch_read_bio *rbio; - struct cache_promote_op *promote_op = NULL; + struct promote_op *promote_op = NULL; unsigned skip = iter.bi_sector - bkey_start_offset(k.k); bool bounce = false, split, read_full = false; + int ret = 0; bch2_increment_clock(c, bio_sectors(&orig->bio), READ); + PTR_BUCKET(pick->ca, &pick->ptr)->prio[READ] = c->prio_clock[READ].hand; EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector || k.k->p.offset < bvec_iter_end_sector(iter)); - /* only promote if we're not reading from the fastest tier: */ - - /* - * XXX: multiple promotes can race with each other, wastefully. Keep a - * list of outstanding promotes? 
- */ - if (should_promote(c, pick, flags)) { - /* - * biovec needs to be big enough to hold decompressed data, if - * the bch2_write_extent() has to decompress/recompress it: - */ - unsigned sectors = - max_t(unsigned, k.k->size, - crc_uncompressed_size(NULL, &pick->crc)); - unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); - - promote_op = kmalloc(sizeof(*promote_op) + - sizeof(struct bio_vec) * pages, GFP_NOIO); - if (promote_op) { - struct bio *promote_bio = &promote_op->write.op.wbio.bio; - - bio_init(promote_bio, - promote_bio->bi_inline_vecs, - pages); - bounce = true; - /* could also set read_full */ - } - } - /* * note: if compression_type and crc_type both == none, then * compressed/uncompressed size is zero @@ -1108,25 +1119,30 @@ void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig, (bvec_iter_sectors(iter) != crc_uncompressed_size(NULL, &pick->crc) || (bch2_csum_type_is_encryption(pick->crc.csum_type) && (flags & BCH_READ_USER_MAPPED)) || - (flags & BCH_READ_FORCE_BOUNCE)))) { + (flags & BCH_READ_MUST_BOUNCE)))) { read_full = true; bounce = true; } + if (should_promote(c, pick, flags)) + promote_op = promote_alloc(c, iter, k, pick, read_full); + + /* could also set read_full */ + if (promote_op) + bounce = true; + if (bounce) { unsigned sectors = read_full ? (crc_compressed_size(NULL, &pick->crc) ?: k.k->size) : bvec_iter_sectors(iter); - rbio = container_of(bio_alloc_bioset(GFP_NOIO, + rbio = rbio_init(bio_alloc_bioset(GFP_NOIO, DIV_ROUND_UP(sectors, PAGE_SECTORS), - &c->bio_read_split), - struct bch_read_bio, bio); + &c->bio_read_split)); bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); split = true; - } else if (!(flags & BCH_READ_MAY_REUSE_BIO) || - !(flags & BCH_READ_IS_LAST)) { + } else if (flags & BCH_READ_MUST_CLONE) { /* * Have to clone if there were any splits, due to error * reporting issues (if a split errored, and retrying didn't @@ -1135,9 +1151,8 @@ void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig, * from the whole bio, in which case we don't want to retry and * lose the error) */ - rbio = container_of(bio_clone_fast(&orig->bio, - GFP_NOIO, &c->bio_read_split), - struct bch_read_bio, bio); + rbio = rbio_init(bio_clone_fast(&orig->bio, + GFP_NOIO, &c->bio_read_split)); rbio->bio.bi_iter = iter; split = true; } else { @@ -1147,80 +1162,39 @@ void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig, BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); } - if (!(flags & BCH_READ_IS_LAST)) - __bio_inc_remaining(&orig->bio); + rbio->c = c; if (split) rbio->parent = orig; else - rbio->orig_bi_end_io = orig->bio.bi_end_io; - rbio->parent_iter = iter; + rbio->end_io = orig->bio.bi_end_io; + rbio->bvec_iter = iter; rbio->flags = flags; rbio->bounce = bounce; rbio->split = split; - rbio->c = c; - rbio->ca = pick->ca; - rbio->ptr = pick->ptr; - rbio->crc = pick->crc; + rbio->process_context = false; + rbio->retry = 0; + rbio->pick = *pick; /* * crc.compressed_size will be 0 if there wasn't any checksum * information, also we need to stash the original size of the bio if we * bounced (which isn't necessarily the original key size, if we bounced * only for promoting) */ - rbio->crc._compressed_size = bio_sectors(&rbio->bio) - 1; + rbio->pick.crc._compressed_size = bio_sectors(&rbio->bio) - 1; rbio->version = k.k->version; rbio->promote = promote_op; rbio->inode = k.k->p.inode; - INIT_WORK(&rbio->work, __bch2_read_endio); + INIT_WORK(&rbio->work, NULL); rbio->bio.bi_bdev = pick->ca->disk_sb.bdev; rbio->bio.bi_opf = 
orig->bio.bi_opf; rbio->bio.bi_iter.bi_sector = pick->ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; - if (promote_op) { - struct bio *promote_bio = &promote_op->write.op.wbio.bio; - - promote_bio->bi_iter = rbio->bio.bi_iter; - memcpy(promote_bio->bi_io_vec, rbio->bio.bi_io_vec, - sizeof(struct bio_vec) * rbio->bio.bi_vcnt); - - bch2_migrate_write_init(c, &promote_op->write, - &c->promote_write_point, - k, NULL, - BCH_WRITE_ALLOC_NOWAIT| - BCH_WRITE_CACHED); - promote_op->write.promote = true; - - if (rbio->crc.compression_type) { - promote_op->write.op.flags |= BCH_WRITE_DATA_COMPRESSED; - promote_op->write.op.crc = rbio->crc; - promote_op->write.op.size = k.k->size; - } else if (read_full) { - /* - * Adjust bio to correspond to _live_ portion of @k - - * which might be less than what we're actually reading: - */ - bio_advance(promote_bio, rbio->crc.offset << 9); - BUG_ON(bio_sectors(promote_bio) < k.k->size); - promote_bio->bi_iter.bi_size = k.k->size << 9; - } else { - /* - * Set insert pos to correspond to what we're actually - * reading: - */ - promote_op->write.op.pos.offset = iter.bi_sector; - } - - promote_bio->bi_iter.bi_sector = - promote_op->write.op.pos.offset; - } - - /* _after_ promete stuff has looked at rbio->crc.offset */ if (read_full) - rbio->crc.offset += skip; + rbio->pick.crc.offset += skip; else rbio->bio.bi_iter.bi_sector += skip; @@ -1229,28 +1203,36 @@ void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig, if (bounce) trace_read_bounce(&rbio->bio); - if (!(flags & BCH_READ_IS_LAST)) - trace_read_split(&rbio->bio); + if (likely(!(flags & BCH_READ_IN_RETRY))) { + submit_bio(&rbio->bio); + } else { + submit_bio_wait(&rbio->bio); + + rbio->process_context = true; + bch2_read_endio(&rbio->bio); - generic_make_request(&rbio->bio); + ret = rbio->retry; + if (!ret) + bch2_rbio_done(rbio); + } + + return ret; } -static void bch2_read_iter(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, - unsigned flags) +void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, u64 inode, + struct bch_devs_mask *avoid, unsigned flags) { - struct bio *bio = &rbio->bio; struct btree_iter iter; struct bkey_s_c k; int ret; - +retry: for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(inode, bvec_iter.bi_sector), BTREE_ITER_WITH_HOLES, k) { BKEY_PADDED(k) tmp; struct extent_pick_ptr pick; - unsigned bytes, sectors; - bool is_last; + struct bvec_iter fragment; /* * Unlock the iterator while the btree node's lock is still in @@ -1260,43 +1242,47 @@ static void bch2_read_iter(struct bch_fs *c, struct bch_read_bio *rbio, k = bkey_i_to_s_c(&tmp.k); bch2_btree_iter_unlock(&iter); - bch2_extent_pick_ptr(c, k, &pick); + bch2_extent_pick_ptr(c, k, avoid, &pick); if (IS_ERR(pick.ca)) { - bcache_io_error(c, bio, "no device to read from"); - bio_endio(bio); + bcache_io_error(c, &rbio->bio, "no device to read from"); + bio_endio(&rbio->bio); return; } - sectors = min_t(u64, k.k->p.offset, - bvec_iter_end_sector(bvec_iter)) - - bvec_iter.bi_sector; - bytes = sectors << 9; - is_last = bytes == bvec_iter.bi_size; - swap(bvec_iter.bi_size, bytes); - - if (is_last) - flags |= BCH_READ_IS_LAST; + fragment = bvec_iter; + fragment.bi_size = (min_t(u64, k.k->p.offset, + bvec_iter_end_sector(bvec_iter)) - + bvec_iter.bi_sector) << 9; if (pick.ca) { - PTR_BUCKET(pick.ca, &pick.ptr)->prio[READ] = - c->prio_clock[READ].hand; - - bch2_read_extent_iter(c, rbio, bvec_iter, - k, &pick, flags); + if (fragment.bi_size != 
bvec_iter.bi_size) { + bio_inc_remaining(&rbio->bio); + flags |= BCH_READ_MUST_CLONE; + trace_read_split(&rbio->bio); + } - flags &= ~BCH_READ_MAY_REUSE_BIO; + ret = __bch2_read_extent(c, rbio, fragment, + k, &pick, flags); + switch (ret) { + case READ_RETRY_AVOID: + __set_bit(pick.ca->dev_idx, avoid->d); + case READ_RETRY: + goto retry; + case READ_ERR: + bio_endio(&rbio->bio); + return; + }; } else { - zero_fill_bio_iter(bio, bvec_iter); + zero_fill_bio_iter(&rbio->bio, fragment); - if (is_last) - bio_endio(bio); + if (fragment.bi_size == bvec_iter.bi_size) + bio_endio(&rbio->bio); } - if (is_last) + if (fragment.bi_size == bvec_iter.bi_size) return; - swap(bvec_iter.bi_size, bytes); - bio_advance_iter(bio, &bvec_iter, bytes); + bio_advance_iter(&rbio->bio, &bvec_iter, fragment.bi_size); } /* @@ -1305,55 +1291,6 @@ static void bch2_read_iter(struct bch_fs *c, struct bch_read_bio *rbio, */ ret = bch2_btree_iter_unlock(&iter); BUG_ON(!ret); - bcache_io_error(c, bio, "btree IO error %i", ret); - bio_endio(bio); -} - -void bch2_read(struct bch_fs *c, struct bch_read_bio *bio, u64 inode) -{ - bch2_read_iter(c, bio, bio->bio.bi_iter, inode, - BCH_READ_RETRY_IF_STALE| - BCH_READ_PROMOTE| - BCH_READ_MAY_REUSE_BIO| - BCH_READ_USER_MAPPED); -} - -/** - * bch_read_retry - re-submit a bio originally from bch2_read() - */ -static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio) -{ - struct bch_read_bio *parent = bch2_rbio_parent(rbio); - struct bvec_iter iter = rbio->parent_iter; - unsigned flags = rbio->flags; - u64 inode = rbio->inode; - - trace_read_retry(&rbio->bio); - - if (rbio->split) - bch2_rbio_free(rbio); - else - rbio->bio.bi_end_io = rbio->orig_bi_end_io; - - bch2_read_iter(c, parent, iter, inode, flags); -} - -void bch2_read_retry_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, struct bch_fs, - read_retry_work); - struct bch_read_bio *rbio; - struct bio *bio; - - while (1) { - spin_lock_irq(&c->read_retry_lock); - bio = bio_list_pop(&c->read_retry_list); - spin_unlock_irq(&c->read_retry_lock); - - if (!bio) - break; - - rbio = container_of(bio, struct bch_read_bio, bio); - bch2_read_retry(c, rbio); - } + bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); + bio_endio(&rbio->bio); }
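
The reworked bch2_write_io_error() in this patch no longer turns a failed data write into a discard: it walks op->insert_keys, drops every pointer that landed on a device recorded in the op->failed bitmap, and reports -EIO only when a key is left with no pointers at all. The following is a minimal standalone sketch of that policy, assuming invented stand-in types (struct fake_extent, extent_drop_failed_ptrs) rather than the real bkey/extent machinery:

/*
 * Illustrative sketch only, not bcachefs code: keep the keys we were going
 * to insert, drop the replica pointers whose devices failed the write, and
 * error out only if a key ends up with no replicas at all.
 */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_PTRS	4
#define MAX_DEVS	64
#define BITS_PER_LONG	(8 * sizeof(unsigned long))

struct fake_extent {
	unsigned	nr_ptrs;
	unsigned	dev[MAX_PTRS];	/* device index of each replica pointer */
};

static bool dev_failed(const unsigned long *failed, unsigned dev)
{
	return failed[dev / BITS_PER_LONG] & (1UL << (dev % BITS_PER_LONG));
}

/* drop pointers to devices that failed the write; -EIO if nothing survives */
static int extent_drop_failed_ptrs(struct fake_extent *e,
				   const unsigned long *failed)
{
	unsigned i, j = 0;

	for (i = 0; i < e->nr_ptrs; i++)
		if (!dev_failed(failed, e->dev[i]))
			e->dev[j++] = e->dev[i];

	e->nr_ptrs = j;
	return j ? 0 : -EIO;
}

int main(void)
{
	struct fake_extent e = { .nr_ptrs = 3, .dev = { 0, 1, 2 } };
	unsigned long failed[(MAX_DEVS + BITS_PER_LONG - 1) / BITS_PER_LONG] = { 0 };

	failed[0] |= 1UL << 1;	/* pretend the write to device 1 failed */

	if (!extent_drop_failed_ptrs(&e, failed))
		printf("inserting key with %u of 3 replicas\n", e.nr_ptrs);
	else
		printf("no replicas written, failing the write with EIO\n");
	return 0;
}

In the actual patch the surviving replicas are also passed to bch2_check_mark_super() so the superblock's replica bookkeeping stays consistent; that step is omitted from the sketch.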
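
On the read side, the patch replaces the old read_retry_list/read_retry_work machinery with per-bio retry state: the endio path classifies a failure as READ_RETRY_AVOID (device I/O error: retry while masking that device out via struct bch_devs_mask), READ_RETRY (e.g. a stale cached pointer: retry anywhere), or READ_ERR (propagate the error), and bch2_rbio_retry() re-issues the read through __bch2_read() with the avoid mask. The sketch below illustrates just that retry loop; devs_mask, pick_replica and classify_read are hypothetical stand-ins, not bcachefs interfaces:

/*
 * Illustrative sketch only: retry a read, avoiding devices that returned
 * I/O errors, until it succeeds or no readable replica is left.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_DEVS 64

enum retry_class { RETRY_NONE, RETRY_AVOID, RETRY_ANY, RETRY_ERR };

struct devs_mask { uint64_t d; };	/* one bit per member device */

static void devs_mask_set(struct devs_mask *m, unsigned dev)
{
	m->d |= UINT64_C(1) << dev;
}

static bool devs_mask_test(const struct devs_mask *m, unsigned dev)
{
	return m->d & (UINT64_C(1) << dev);
}

/* pick a replica to read from, skipping devices in @avoid */
static int pick_replica(const bool *dev_has_data, const struct devs_mask *avoid)
{
	unsigned i;

	for (i = 0; i < MAX_DEVS; i++)
		if (dev_has_data[i] && !devs_mask_test(avoid, i))
			return i;
	return -1;
}

/* simulated submission: how the endio path would classify the result */
static enum retry_class classify_read(unsigned dev, bool io_error, bool stale)
{
	if (io_error)
		return RETRY_AVOID;	/* don't try this device again */
	if (stale)
		return RETRY_ANY;	/* pointer went stale, re-walk the btree */
	return RETRY_NONE;
}

int main(void)
{
	bool dev_has_data[MAX_DEVS] = { [0] = true, [2] = true };
	struct devs_mask avoid = { 0 };

	for (;;) {
		int dev = pick_replica(dev_has_data, &avoid);

		if (dev < 0) {
			printf("no device to read from\n");
			return 1;
		}

		/* pretend device 0 throws an I/O error, device 2 succeeds */
		switch (classify_read(dev, dev == 0, false)) {
		case RETRY_AVOID:
			devs_mask_set(&avoid, dev);
			continue;
		case RETRY_ANY:
			continue;
		case RETRY_ERR:
			return 1;
		case RETRY_NONE:
			printf("read ok from device %d\n", dev);
			return 0;
		}
	}
}

Note that the real retry path re-walks the extent btree on each pass (the retry: label and goto retry in __bch2_read()), so the replica list is re-derived every time rather than fixed up front as in this toy loop.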