#include <trace/events/bcachefs.h>
-static inline void __bio_inc_remaining(struct bio *bio)
-{
- bio_set_flag(bio, BIO_CHAIN);
- smp_mb__before_atomic();
- atomic_inc(&bio->__bi_remaining);
-}
-
/* Allocate, free from mempool: */
void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio)
/* Bios with headers */
-static void bch2_submit_wbio(struct bch_fs *c, struct bch_write_bio *wbio,
- struct bch_dev *ca, const struct bch_extent_ptr *ptr)
-{
- wbio->ca = ca;
- wbio->submit_time_us = local_clock_us();
- wbio->bio.bi_iter.bi_sector = ptr->offset;
- wbio->bio.bi_bdev = ca ? ca->disk_sb.bdev : NULL;
-
- if (!ca)
- bcache_io_error(c, &wbio->bio, "device has been removed");
- else
- generic_make_request(&wbio->bio);
-}
-
void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
+ enum bch_data_type type,
const struct bkey_i *k)
{
struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k);
const struct bch_extent_ptr *ptr;
struct bch_write_bio *n;
struct bch_dev *ca;
+ unsigned ptr_idx = 0;
BUG_ON(c->opts.nochanges);
- wbio->split = false;
- wbio->c = c;
-
extent_for_each_ptr(e, ptr) {
+ BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
+ !c->devs[ptr->dev]);
+
ca = c->devs[ptr->dev];
- if (!percpu_ref_tryget(&ca->io_ref)) {
- bch2_submit_wbio(c, wbio, NULL, ptr);
- break;
- }
if (ptr + 1 < &extent_entry_last(e)->ptr) {
n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO,
n->bio.bi_end_io = wbio->bio.bi_end_io;
n->bio.bi_private = wbio->bio.bi_private;
- n->c = c;
- n->orig = &wbio->bio;
- n->bounce = false;
+ n->parent = wbio;
n->split = true;
+ n->bounce = false;
n->put_bio = true;
n->bio.bi_opf = wbio->bio.bi_opf;
- __bio_inc_remaining(n->orig);
+ bio_inc_remaining(&wbio->bio);
} else {
n = wbio;
+ n->split = false;
}
+ n->c = c;
+ n->ca = ca;
+ n->ptr_idx = ptr_idx++;
+ n->submit_time_us = local_clock_us();
+ n->bio.bi_iter.bi_sector = ptr->offset;
+
if (!journal_flushes_device(ca))
n->bio.bi_opf |= REQ_FUA;
- bch2_submit_wbio(c, n, ca, ptr);
+ if (likely(percpu_ref_tryget(&ca->io_ref))) {
+ this_cpu_add(ca->io_done->sectors[WRITE][type],
+ bio_sectors(&n->bio));
+
+ n->have_io_ref = true;
+ n->bio.bi_bdev = ca->disk_sb.bdev;
+ submit_bio(&n->bio);
+ } else {
+ n->have_io_ref = false;
+ bcache_io_error(c, &n->bio, "device has been removed");
+ bio_endio(&n->bio);
+ }
}
}
struct btree_iter iter;
int ret;
- bch2_btree_iter_init_intent(&iter, op->c, BTREE_ID_EXTENTS,
- bkey_start_pos(&bch2_keylist_front(keys)->k));
+ bch2_btree_iter_init(&iter, op->c, BTREE_ID_EXTENTS,
+ bkey_start_pos(&bch2_keylist_front(keys)->k),
+ BTREE_ITER_INTENT);
ret = bch2_btree_insert_list_at(&iter, keys, &op->res,
NULL, op_journal_seq(op),
}
}
-/**
- * bch_write_discard - discard range of keys
- *
- * Used to implement discard, and to handle when writethrough write hits
- * a write error on the cache device.
- */
-static void bch2_write_discard(struct closure *cl)
-{
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bio *bio = &op->bio->bio;
- struct bpos end = op->pos;
-
- end.offset += bio_sectors(bio);
-
- op->error = bch2_discard(op->c, op->pos, end, op->version,
- &op->res, NULL, NULL);
-}
-
-/*
- * Convert extents to be inserted to discards after an error:
- */
static void bch2_write_io_error(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct keylist *keys = &op->insert_keys;
+ struct bch_fs *c = op->c;
+ struct bch_extent_ptr *ptr;
+ struct bkey_i *k;
+ int ret;
- if (op->flags & BCH_WRITE_DISCARD_ON_ERROR) {
- struct bkey_i *src = bch2_keylist_front(&op->insert_keys);
- struct bkey_i *dst = bch2_keylist_front(&op->insert_keys);
-
- /*
- * Our data write just errored, which means we've got a bunch
- * of keys to insert that point to data that wasn't
- * successfully written.
- *
- * We don't have to insert those keys but we still have to
- * invalidate that region of the cache - so, if we just strip
- * off all the pointers from the keys we'll accomplish just
- * that.
- */
+ for_each_keylist_key(keys, k) {
+ struct bkey_i *n = bkey_next(k);
+ struct bkey_s_extent e = bkey_i_to_s_extent(k);
- while (src != op->insert_keys.top) {
- struct bkey_i *n = bkey_next(src);
+ extent_for_each_ptr_backwards(e, ptr)
+ if (test_bit(ptr->dev, op->failed.d))
+ bch2_extent_drop_ptr(e, ptr);
- set_bkey_val_u64s(&src->k, 0);
- src->k.type = KEY_TYPE_DISCARD;
- bkey_copy(dst, src);
+ memmove(bkey_next(k), n, (void *) keys->top - (void *) n);
+ keys->top_p -= (u64 *) n - (u64 *) bkey_next(k);
- dst = bkey_next(dst);
- src = n;
+ ret = bch2_extent_nr_ptrs(e.c)
+ ? bch2_check_mark_super(c, e.c, BCH_DATA_USER)
+ : -EIO;
+ if (ret) {
+ keys->top = keys->keys;
+ op->error = ret;
+ op->flags |= BCH_WRITE_DONE;
+ break;
}
-
- op->insert_keys.top = dst;
- op->flags |= BCH_WRITE_DISCARD;
- } else {
- /* TODO: We could try to recover from this. */
- while (!bch2_keylist_empty(&op->insert_keys))
- bch2_keylist_pop_front(&op->insert_keys);
-
- op->error = -EIO;
- op->flags |= BCH_WRITE_DONE;
}
+ memset(&op->failed, 0, sizeof(op->failed));
+
bch2_write_index(cl);
+ return;
}
static void bch2_write_endio(struct bio *bio)
{
- struct closure *cl = bio->bi_private;
- struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bch_write_bio *wbio = to_wbio(bio);
- struct bch_fs *c = wbio->c;
- struct bio *orig = wbio->orig;
- struct bch_dev *ca = wbio->ca;
-
- if (bch2_dev_nonfatal_io_err_on(bio->bi_error, ca,
- "data write")) {
+ struct closure *cl = bio->bi_private;
+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
+ struct bch_write_bio *wbio = to_wbio(bio);
+ struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
+ struct bch_fs *c = wbio->c;
+ struct bch_dev *ca = wbio->ca;
+
+ if (bch2_dev_io_err_on(bio->bi_error, ca, "data write")) {
+ set_bit(ca->dev_idx, op->failed.d);
set_closure_fn(cl, bch2_write_io_error, index_update_wq(op));
}
- if (ca)
+ if (wbio->have_io_ref)
percpu_ref_put(&ca->io_ref);
- if (bio->bi_error && orig)
- orig->bi_error = bio->bi_error;
-
if (wbio->bounce)
bch2_bio_free_pages_pool(c, bio);
if (wbio->put_bio)
bio_put(bio);
- if (orig)
- bio_endio(orig);
+ if (parent)
+ bio_endio(&parent->bio);
else
closure_put(cl);
}
bch2_keylist_push(&op->insert_keys);
}
-static int bch2_write_extent(struct bch_write_op *op,
- struct open_bucket *ob,
- struct bio *orig)
+static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp)
{
struct bch_fs *c = op->c;
+ struct bio *orig = &op->wbio.bio;
struct bio *bio;
struct bch_write_bio *wbio;
unsigned key_to_write_offset = op->insert_keys.top_p -
struct bkey_i *key_to_write;
unsigned csum_type = op->csum_type;
unsigned compression_type = op->compression_type;
- int ret;
+ int ret, more;
/* don't refetch csum type/compression type */
barrier();
+ BUG_ON(!bio_sectors(orig));
+
/* Need to decompress data? */
if ((op->flags & BCH_WRITE_DATA_COMPRESSED) &&
(crc_uncompressed_size(NULL, &op->crc) != op->size ||
- crc_compressed_size(NULL, &op->crc) > ob->sectors_free)) {
+ crc_compressed_size(NULL, &op->crc) > wp->sectors_free)) {
int ret;
ret = bch2_bio_uncompress_inplace(c, orig, op->size, op->crc);
op->crc.nonce,
op->crc.csum,
op->crc.csum_type,
- ob);
+ wp->ob);
bio = orig;
- wbio = to_wbio(bio);
- wbio->orig = NULL;
- wbio->bounce = false;
- wbio->put_bio = false;
- ret = 0;
+ wbio = wbio_init(bio);
+ more = 0;
} else if (csum_type != BCH_CSUM_NONE ||
compression_type != BCH_COMPRESSION_NONE) {
/* all units here in bytes */
unsigned total_output = 0, output_available =
- min(ob->sectors_free << 9, orig->bi_iter.bi_size);
+ min(wp->sectors_free << 9, orig->bi_iter.bi_size);
unsigned crc_nonce = bch2_csum_type_is_encryption(csum_type)
? op->nonce : 0;
struct bch_csum csum;
bio = bio_alloc_bioset(GFP_NOIO,
DIV_ROUND_UP(output_available, PAGE_SIZE),
&c->bio_write);
+ wbio = wbio_init(bio);
+ wbio->bounce = true;
+ wbio->put_bio = true;
+ /* copy WRITE_SYNC flag */
+ wbio->bio.bi_opf = orig->bi_opf;
+
/*
* XXX: can't use mempool for more than
* BCH_COMPRESSED_EXTENT_MAX worth of pages
*/
bch2_bio_alloc_pages_pool(c, bio, output_available);
- /* copy WRITE_SYNC flag */
- bio->bi_opf = orig->bi_opf;
- wbio = to_wbio(bio);
- wbio->orig = NULL;
- wbio->bounce = true;
- wbio->put_bio = true;
-
do {
unsigned fragment_compression_type = compression_type;
size_t dst_len, src_len;
orig, &src_len,
&fragment_compression_type);
- BUG_ON(!dst_len || dst_len > bio->bi_iter.bi_size);
- BUG_ON(!src_len || src_len > orig->bi_iter.bi_size);
- BUG_ON(dst_len & (block_bytes(c) - 1));
- BUG_ON(src_len & (block_bytes(c) - 1));
-
- swap(bio->bi_iter.bi_size, dst_len);
nonce = extent_nonce(op->version,
crc_nonce,
src_len >> 9,
- fragment_compression_type),
+ fragment_compression_type);
+ swap(bio->bi_iter.bi_size, dst_len);
bch2_encrypt_bio(c, csum_type, nonce, bio);
csum = bch2_checksum_bio(c, csum_type, nonce, bio);
init_append_extent(op,
dst_len >> 9, src_len >> 9,
fragment_compression_type,
- crc_nonce, csum, csum_type, ob);
+ crc_nonce, csum, csum_type, wp->ob);
total_output += dst_len;
bio_advance(bio, dst_len);
mempool_free(bio->bi_io_vec[--bio->bi_vcnt].bv_page,
&c->bio_bounce_pages);
- ret = orig->bi_iter.bi_size != 0;
+ more = orig->bi_iter.bi_size != 0;
} else {
- bio = bio_next_split(orig, ob->sectors_free, GFP_NOIO,
+ bio = bio_next_split(orig, wp->sectors_free, GFP_NOIO,
&c->bio_write);
-
- wbio = to_wbio(bio);
- wbio->orig = NULL;
- wbio->bounce = false;
+ wbio = wbio_init(bio);
wbio->put_bio = bio != orig;
init_append_extent(op, bio_sectors(bio), bio_sectors(bio),
compression_type, 0,
- (struct bch_csum) { 0 }, csum_type, ob);
+ (struct bch_csum) { 0 }, csum_type, wp->ob);
- ret = bio != orig;
+ more = bio != orig;
}
+ /* might have done a realloc... */
+
+ key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
+
+ ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write),
+ BCH_DATA_USER);
+ if (ret)
+ return ret;
+
bio->bi_end_io = bch2_write_endio;
bio->bi_private = &op->cl;
bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
closure_get(bio->bi_private);
- /* might have done a realloc... */
-
- key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset);
-
- bch2_check_mark_super(c, key_to_write, false);
-
- bch2_submit_wbio_replicas(to_wbio(bio), c, key_to_write);
- return ret;
+ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER,
+ key_to_write);
+ return more;
}
static void __bch2_write(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
struct bch_fs *c = op->c;
- struct bio *bio = &op->bio->bio;
unsigned open_bucket_nr = 0;
- struct open_bucket *b;
+ struct write_point *wp;
+ struct open_bucket *ob;
int ret;
- memset(op->open_buckets, 0, sizeof(op->open_buckets));
-
- if (op->flags & BCH_WRITE_DISCARD) {
- op->flags |= BCH_WRITE_DONE;
- bch2_write_discard(cl);
- bio_put(bio);
- continue_at(cl, bch2_write_done, index_update_wq(op));
- }
-
- /*
- * Journal writes are marked REQ_PREFLUSH; if the original write was a
- * flush, it'll wait on the journal write.
- */
- bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA);
-
do {
- EBUG_ON(bio->bi_iter.bi_sector != op->pos.offset);
- EBUG_ON(!bio_sectors(bio));
-
if (open_bucket_nr == ARRAY_SIZE(op->open_buckets))
continue_at(cl, bch2_write_index, index_update_wq(op));
BKEY_EXTENT_U64s_MAX))
continue_at(cl, bch2_write_index, index_update_wq(op));
- b = bch2_alloc_sectors_start(c, op->wp,
+ wp = bch2_alloc_sectors_start(c, BCH_DATA_USER,
+ op->devs,
+ op->write_point,
op->nr_replicas,
c->opts.data_replicas_required,
op->alloc_reserve,
+ op->flags,
(op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
- EBUG_ON(!b);
+ EBUG_ON(!wp);
- if (unlikely(IS_ERR(b))) {
- if (unlikely(PTR_ERR(b) != -EAGAIN)) {
- ret = PTR_ERR(b);
+ if (unlikely(IS_ERR(wp))) {
+ if (unlikely(PTR_ERR(wp) != -EAGAIN)) {
+ ret = PTR_ERR(wp);
goto err;
}
continue;
}
- BUG_ON(b - c->open_buckets == 0 ||
- b - c->open_buckets > U8_MAX);
- op->open_buckets[open_bucket_nr++] = b - c->open_buckets;
+ ob = wp->ob;
+
+ BUG_ON(ob - c->open_buckets == 0 ||
+ ob - c->open_buckets > U8_MAX);
+ op->open_buckets[open_bucket_nr++] = ob - c->open_buckets;
- ret = bch2_write_extent(op, b, bio);
+ ret = bch2_write_extent(op, wp);
- bch2_alloc_sectors_done(c, op->wp, b);
+ bch2_alloc_sectors_done(c, wp);
if (ret < 0)
goto err;
op->flags |= BCH_WRITE_DONE;
continue_at(cl, bch2_write_index, index_update_wq(op));
err:
- if (op->flags & BCH_WRITE_DISCARD_ON_ERROR) {
- /*
- * If we were writing cached data, not doing the write is fine
- * so long as we discard whatever would have been overwritten -
- * then it's equivalent to doing the write and immediately
- * reclaiming it.
- */
-
- bch2_write_discard(cl);
- } else {
- /*
- * Right now we can only error here if we went RO - the
- * allocation failed, but we already checked for -ENOSPC when we
- * got our reservation.
- *
- * XXX capacity might have changed, but we don't check for that
- * yet:
- */
- op->error = ret;
- }
-
+ /*
+ * Right now we can only error here if we went RO - the
+ * allocation failed, but we already checked for -ENOSPC when we
+ * got our reservation.
+ *
+ * XXX capacity might have changed, but we don't check for that
+ * yet:
+ */
+ op->error = ret;
op->flags |= BCH_WRITE_DONE;
/*
* after the data is written it calls bch_journal, and after the keys have been
* added to the next journal write they're inserted into the btree.
*
- * It inserts the data in op->bio; bi_sector is used for the key offset, and
- * op->inode is used for the key inode.
- *
* If op->discard is true, instead of inserting the data it invalidates the
* region of the cache represented by op->bio and op->inode.
*/
void bch2_write(struct closure *cl)
{
struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
- struct bio *bio = &op->bio->bio;
+ struct bio *bio = &op->wbio.bio;
struct bch_fs *c = op->c;
u64 inode = op->pos.inode;
op->version.lo =
atomic64_inc_return(&c->key_version) + 1;
- if (!(op->flags & BCH_WRITE_DISCARD))
- bch2_increment_clock(c, bio_sectors(bio), WRITE);
+ bch2_increment_clock(c, bio_sectors(bio), WRITE);
/* Don't call bch2_next_delay() if rate is >= 1 GB/sec */
- if (c->foreground_write_ratelimit_enabled &&
- c->foreground_write_pd.rate.rate < (1 << 30) &&
- !(op->flags & BCH_WRITE_DISCARD) && op->wp->throttle) {
+ if ((op->flags & BCH_WRITE_THROTTLE) &&
+ c->foreground_write_ratelimit_enabled &&
+ c->foreground_write_pd.rate.rate < (1 << 30)) {
unsigned long flags;
u64 delay;
spin_lock_irqsave(&c->foreground_write_pd_lock, flags);
bch2_ratelimit_increment(&c->foreground_write_pd.rate,
- bio->bi_iter.bi_size);
+ bio->bi_iter.bi_size);
delay = bch2_ratelimit_delay(&c->foreground_write_pd.rate);
}
void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c,
- struct bch_write_bio *bio, struct disk_reservation res,
- struct write_point *wp, struct bpos pos,
- u64 *journal_seq, unsigned flags)
+ struct disk_reservation res,
+ struct bch_devs_mask *devs,
+ unsigned long write_point,
+ struct bpos pos,
+ u64 *journal_seq, unsigned flags)
{
EBUG_ON(res.sectors && !res.nr_replicas);
op->c = c;
op->io_wq = index_update_wq(op);
- op->bio = bio;
op->written = 0;
op->error = 0;
op->flags = flags;
op->csum_type = bch2_data_checksum_type(c);
- op->compression_type = c->opts.compression;
+ op->compression_type =
+ bch2_compression_opt_to_type(c->opts.compression);
op->nr_replicas = res.nr_replicas;
op->alloc_reserve = RESERVE_NONE;
op->nonce = 0;
op->pos = pos;
op->version = ZERO_VERSION;
op->res = res;
- op->wp = wp;
+ op->devs = devs;
+ op->write_point = write_point;
if (journal_seq) {
op->journal_seq_p = journal_seq;
op->index_update_fn = bch2_write_index_default;
+ memset(op->open_buckets, 0, sizeof(op->open_buckets));
+ memset(&op->failed, 0, sizeof(op->failed));
+
bch2_keylist_init(&op->insert_keys,
op->inline_keys,
ARRAY_SIZE(op->inline_keys));
get_random_bytes(&op->version, sizeof(op->version));
}
-/* Discard */
-
-/* bch_discard - discard a range of keys from start_key to end_key.
- * @c filesystem
- * @start_key pointer to start location
- * NOTE: discard starts at bkey_start_offset(start_key)
- * @end_key pointer to end location
- * NOTE: discard ends at KEY_OFFSET(end_key)
- * @version version of discard (0ULL if none)
- *
- * Returns:
- * 0 on success
- * <0 on error
- *
- * XXX: this needs to be refactored with inode_truncate, or more
- * appropriately inode_truncate should call this
- */
-int bch2_discard(struct bch_fs *c, struct bpos start,
- struct bpos end, struct bversion version,
- struct disk_reservation *disk_res,
- struct extent_insert_hook *hook,
- u64 *journal_seq)
-{
- return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, start, end, version,
- disk_res, hook, journal_seq);
-}
-
/* Cache promotion on read */
-struct cache_promote_op {
+struct promote_op {
struct closure cl;
struct migrate_write write;
struct bio_vec bi_inline_vecs[0]; /* must be last */
};
+/*
+ * Closure destructor installed by promote_start(): drops the c->writes ref
+ * taken there, frees the write bio's pages with bch2_bio_free_pages_pool()
+ * and finally frees the promote op itself.
+ */
+static void promote_done(struct closure *cl)
+{
+	struct promote_op *promote = container_of(cl, struct promote_op, cl);
+	struct bch_fs *fs = promote->write.op.c;
+
+	percpu_ref_put(&fs->writes);
+	bch2_bio_free_pages_pool(fs, &promote->write.op.wbio.bio);
+	kfree(promote);
+}
+
+/*
+ * Start the promote write @op using the pages that @rbio's read was bounced
+ * into. @rbio must be a bounced split.
+ *
+ * If a ref on c->writes can't be taken this returns without doing anything:
+ * rbio->promote is left set, so @op is not consumed here.
+ */
+static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
+{
+ struct bch_fs *c = rbio->c;
+ struct closure *cl = &op->cl;
+ struct bio *bio = &op->write.op.wbio.bio;
+
+ BUG_ON(!rbio->split || !rbio->bounce);
+
+ /* ref is dropped again in promote_done() */
+ if (!percpu_ref_tryget(&c->writes))
+ return;
+
+ trace_promote(&rbio->bio);
+
+ /* we now own pages: */
+ swap(bio->bi_vcnt, rbio->bio.bi_vcnt);
+ memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec,
+ sizeof(struct bio_vec) * bio->bi_vcnt);
+ /* cleared so bch2_rbio_done()/bch2_rbio_free() won't kfree() @op */
+ rbio->promote = NULL;
+
+ closure_init(cl, NULL);
+ closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
+ closure_return_with_destructor(cl, promote_done);
+}
+
+/*
+ * Allocate a promote_op for the extent @k that is being read via @pick, and
+ * set up its write bio and migrate write (c->fastest_devs is what gets passed
+ * to bch2_migrate_write_init()).
+ *
+ * @iter is the range being read. @read_full presumably means the read covers
+ * the whole extent rather than just @iter - NOTE(review): inferred from the
+ * caller, __bch2_read_extent(); confirm.
+ *
+ * Returns the new op, or NULL if the allocation fails. The op is released
+ * with kfree(): by promote_done() after the write, or by bch2_rbio_done()/
+ * bch2_rbio_free() while it is still attached to an rbio.
+ *
+ * XXX: multiple promotes can race with each other, wastefully. Keep a list of
+ * outstanding promotes?
+ */
+static struct promote_op *promote_alloc(struct bch_fs *c,
+ struct bvec_iter iter,
+ struct bkey_s_c k,
+ struct extent_pick_ptr *pick,
+ bool read_full)
+{
+ struct promote_op *op;
+ struct bio *bio;
+ /*
+ * biovec needs to be big enough to hold decompressed data, if
+ * bch2_write_extent() has to decompress/recompress it:
+ */
+ unsigned sectors = max_t(unsigned, k.k->size,
+ crc_uncompressed_size(NULL, &pick->crc));
+ unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
+
+ /* one allocation: the op plus room for @pages trailing bio_vecs */
+ op = kmalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO);
+ if (!op)
+ return NULL;
+
+ bio = &op->write.op.wbio.bio;
+ bio_init(bio, bio->bi_inline_vecs, pages);
+
+ bio->bi_iter = iter;
+
+ /*
+ * NOTE(review): op comes from kmalloc(), so the |= below reads
+ * uninitialized memory, and op.flags/op.crc/op.size/op.pos are all
+ * written before bch2_migrate_write_init() runs - confirm that the
+ * init does not overwrite them.
+ */
+ if (pick->crc.compression_type) {
+ op->write.op.flags |= BCH_WRITE_DATA_COMPRESSED;
+ op->write.op.crc = pick->crc;
+ op->write.op.size = k.k->size;
+ } else if (read_full) {
+ /*
+ * Adjust bio to correspond to _live_ portion of @k -
+ * which might be less than what we're actually reading:
+ */
+ bio->bi_iter.bi_size = sectors << 9;
+ bio_advance(bio, pick->crc.offset << 9);
+ BUG_ON(bio_sectors(bio) < k.k->size);
+ bio->bi_iter.bi_size = k.k->size << 9;
+ } else {
+ /*
+ * Set insert pos to correspond to what we're actually
+ * reading:
+ */
+ op->write.op.pos.offset = iter.bi_sector;
+ }
+ bch2_migrate_write_init(c, &op->write,
+ c->fastest_devs,
+ k, NULL,
+ BCH_WRITE_ALLOC_NOWAIT|
+ BCH_WRITE_CACHED);
+ op->write.promote = true;
+
+ return op;
+}
+
+/*
+ * Decide whether a read through @pick should also be promoted. Promotion
+ * needs BCH_READ_MAY_PROMOTE without BCH_READ_IN_RETRY, c->writes must not be
+ * dying, and we only promote if we're not reading from the fastest tier.
+ */
+static bool should_promote(struct bch_fs *c,
+			   struct extent_pick_ptr *pick, unsigned flags)
+{
+	bool requested = (flags & BCH_READ_MAY_PROMOTE) &&
+		!(flags & BCH_READ_IN_RETRY);
+
+	if (!requested || percpu_ref_is_dying(&c->writes))
+		return false;
+
+	if (!c->fastest_tier)
+		return false;
+
+	return c->fastest_tier < c->tiers + pick->ca->mi.tier;
+}
+
/* Read */
-static int bio_checksum_uncompress(struct bch_fs *c,
- struct bch_read_bio *rbio)
+/*
+ * Outcome codes passed to bch2_rbio_error() and stored in rbio->retry:
+ */
+#define READ_RETRY_AVOID 1 /* retry; bch2_rbio_retry() puts the device just used in its avoid mask */
+#define READ_RETRY 2 /* retry with an empty avoid mask */
+#define READ_ERR 3 /* no retry: the error is set on the parent bio and the read completes */
+
+/*
+ * Returns @rbio's parent when @rbio is a split, otherwise @rbio itself.
+ */
+static inline struct bch_read_bio *
+bch2_rbio_parent(struct bch_read_bio *rbio)
+{
+	if (!rbio->split)
+		return rbio;
+
+	return rbio->parent;
+}
+
+/*
+ * Run @fn on @rbio's work item. With a workqueue, and if @rbio has not been
+ * punted before, @fn is queued on @wq and the rbio is marked as being in
+ * process context; otherwise @fn is called directly.
+ */
+__always_inline
+static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn,
+			   struct workqueue_struct *wq)
+{
+	if (wq && !rbio->process_context) {
+		rbio->work.func = fn;
+		rbio->process_context = true;
+		queue_work(wq, &rbio->work);
+		return;
+	}
+
+	fn(&rbio->work);
+}
+
+/*
+ * Free a split rbio: any promote op still attached to it, its bounce pages
+ * (if it was bounced), and the bio itself. Must only be called on splits.
+ *
+ * Returns the parent rbio, which is read up front, before bio_put().
+ */
+static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio)
{
+	struct bch_read_bio *parent = rbio->parent;
+
+	BUG_ON(!rbio->split);
+
+	/* kfree(NULL) is a no-op, no need to test rbio->promote first */
+	kfree(rbio->promote);
+	if (rbio->bounce)
+		bch2_bio_free_pages_pool(rbio->c, &rbio->bio);
+	bio_put(&rbio->bio);
+
+	return parent;
+}
+
+/*
+ * Complete a read: free a promote op that is still attached (promote_start()
+ * clears rbio->promote when it takes the op over), release @rbio if it is a
+ * split, then end the resulting top-level bio.
+ */
+static void bch2_rbio_done(struct bch_read_bio *rbio)
+{
+	/* kfree(NULL) is a no-op, no need to test rbio->promote first */
+	kfree(rbio->promote);
+	rbio->promote = NULL;
+
+	if (rbio->split)
+		rbio = bch2_rbio_free(rbio);
+	bio_endio(&rbio->bio);
+}
+
+/*
+ * Work function handed to bch2_rbio_punt() by bch2_rbio_error(): reissue the
+ * read described by @work's rbio through __bch2_read(), with
+ * BCH_READ_MUST_CLONE and BCH_READ_IN_RETRY added to its flags.
+ */
+static void bch2_rbio_retry(struct work_struct *work)
+{
+ struct bch_read_bio *rbio =
+ container_of(work, struct bch_read_bio, work);
+ struct bch_fs *c = rbio->c;
+ struct bvec_iter iter = rbio->bvec_iter;
+ unsigned flags = rbio->flags;
+ u64 inode = rbio->inode;
+ struct bch_devs_mask avoid;
+
+ trace_read_retry(&rbio->bio);
+
+ memset(&avoid, 0, sizeof(avoid));
+
+ /* READ_RETRY_AVOID: put the device this read used into the avoid mask */
+ if (rbio->retry == READ_RETRY_AVOID)
+ __set_bit(rbio->pick.ca->dev_idx, avoid.d);
+
+ /*
+ * A split rbio is freed here and the retry continues on its parent;
+ * what is still needed from it was copied into locals above. Without
+ * a split the same bio is reused, so its error is reset instead:
+ */
+ if (rbio->split)
+ rbio = bch2_rbio_free(rbio);
+ else
+ rbio->bio.bi_error = 0;
+
+ flags |= BCH_READ_MUST_CLONE;
+ flags |= BCH_READ_IN_RETRY;
+
+ __bch2_read(c, rbio, iter, inode, &avoid, flags);
+}
+
+/*
+ * Record the outcome @retry (one of the READ_* codes) of a failed read in
+ * rbio->retry and act on it. If the read is already part of a retry
+ * (BCH_READ_IN_RETRY) nothing else happens here. Otherwise READ_ERR stores
+ * @error in the parent bio and completes the read, and any other code punts
+ * bch2_rbio_retry() to c->wq.
+ */
+static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error)
+{
+	rbio->retry = retry;
+
+	if (rbio->flags & BCH_READ_IN_RETRY)
+		return;
+
+	if (retry != READ_ERR) {
+		bch2_rbio_punt(rbio, bch2_rbio_retry, rbio->c->wq);
+		return;
+	}
+
+	bch2_rbio_parent(rbio)->bio.bi_error = error;
+	bch2_rbio_done(rbio);
+}
+
+static int bch2_rbio_checksum_uncompress(struct bio *dst,
+ struct bch_read_bio *rbio)
+{
+ struct bch_fs *c = rbio->c;
struct bio *src = &rbio->bio;
- struct bio *dst = &bch2_rbio_parent(rbio)->bio;
- struct bvec_iter dst_iter = rbio->parent_iter;
+ struct bvec_iter dst_iter = rbio->bvec_iter;
struct nonce nonce = extent_nonce(rbio->version,
- rbio->crc.nonce,
- crc_uncompressed_size(NULL, &rbio->crc),
- rbio->crc.compression_type);
+ rbio->pick.crc.nonce,
+ crc_uncompressed_size(NULL, &rbio->pick.crc),
+ rbio->pick.crc.compression_type);
struct bch_csum csum;
int ret = 0;
* in order to promote
*/
if (rbio->bounce) {
- src->bi_iter.bi_size = crc_compressed_size(NULL, &rbio->crc) << 9;
+ src->bi_iter.bi_size = crc_compressed_size(NULL, &rbio->pick.crc) << 9;
src->bi_iter.bi_idx = 0;
src->bi_iter.bi_bvec_done = 0;
} else {
- src->bi_iter = rbio->parent_iter;
+ src->bi_iter = rbio->bvec_iter;
}
- csum = bch2_checksum_bio(c, rbio->crc.csum_type, nonce, src);
- if (bch2_dev_nonfatal_io_err_on(bch2_crc_cmp(rbio->crc.csum, csum),
- rbio->ca,
+ csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type, nonce, src);
+ if (bch2_dev_io_err_on(bch2_crc_cmp(rbio->pick.crc.csum, csum),
+ rbio->pick.ca,
"data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)",
- rbio->inode, (u64) rbio->parent_iter.bi_sector << 9,
- rbio->crc.csum.hi, rbio->crc.csum.lo, csum.hi, csum.lo,
- rbio->crc.csum_type))
+ rbio->inode, (u64) rbio->bvec_iter.bi_sector << 9,
+ rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
+ csum.hi, csum.lo,
+ rbio->pick.crc.csum_type))
ret = -EIO;
/*
* If there was a checksum error, still copy the data back - unless it
* was compressed, we don't want to decompress bad data:
*/
- if (rbio->crc.compression_type != BCH_COMPRESSION_NONE) {
+ if (rbio->pick.crc.compression_type != BCH_COMPRESSION_NONE) {
if (!ret) {
- bch2_encrypt_bio(c, rbio->crc.csum_type, nonce, src);
+ bch2_encrypt_bio(c, rbio->pick.crc.csum_type, nonce, src);
ret = bch2_bio_uncompress(c, src, dst,
- dst_iter, rbio->crc);
+ dst_iter, rbio->pick.crc);
if (ret)
__bcache_io_error(c, "decompression error");
}
} else if (rbio->bounce) {
- bio_advance(src, rbio->crc.offset << 9);
+ bio_advance(src, rbio->pick.crc.offset << 9);
/* don't need to decrypt the entire bio: */
BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
src->bi_iter.bi_size = dst_iter.bi_size;
- nonce = nonce_add(nonce, rbio->crc.offset << 9);
+ nonce = nonce_add(nonce, rbio->pick.crc.offset << 9);
- bch2_encrypt_bio(c, rbio->crc.csum_type,
+ bch2_encrypt_bio(c, rbio->pick.crc.csum_type,
nonce, src);
- bio_copy_data_iter(dst, dst_iter,
- src, src->bi_iter);
+ bio_copy_data_iter(dst, &dst_iter,
+ src, &src->bi_iter);
} else {
- bch2_encrypt_bio(c, rbio->crc.csum_type, nonce, src);
+ bch2_encrypt_bio(c, rbio->pick.crc.csum_type, nonce, src);
}
return ret;
}
-static void bch2_rbio_free(struct bch_read_bio *rbio)
-{
- struct bch_fs *c = rbio->c;
- struct bio *bio = &rbio->bio;
-
- BUG_ON(rbio->ca);
- BUG_ON(!rbio->split);
-
- if (rbio->promote)
- kfree(rbio->promote);
- if (rbio->bounce)
- bch2_bio_free_pages_pool(c, bio);
-
- bio_put(bio);
-}
-
-static void bch2_rbio_done(struct bch_read_bio *rbio)
-{
- struct bio *orig = &bch2_rbio_parent(rbio)->bio;
-
- percpu_ref_put(&rbio->ca->io_ref);
- rbio->ca = NULL;
-
- if (rbio->split) {
- if (rbio->bio.bi_error)
- orig->bi_error = rbio->bio.bi_error;
-
- bio_endio(orig);
- bch2_rbio_free(rbio);
- } else {
- if (rbio->promote)
- kfree(rbio->promote);
-
- orig->bi_end_io = rbio->orig_bi_end_io;
- bio_endio_nodec(orig);
- }
-}
-
-static void bch2_rbio_error(struct bch_read_bio *rbio, int error)
-{
- bch2_rbio_parent(rbio)->bio.bi_error = error;
- bch2_rbio_done(rbio);
-}
-
-static void bch2_rbio_retry(struct bch_fs *c, struct bch_read_bio *rbio)
-{
- unsigned long flags;
-
- percpu_ref_put(&rbio->ca->io_ref);
- rbio->ca = NULL;
-
- spin_lock_irqsave(&c->read_retry_lock, flags);
- bio_list_add(&c->read_retry_list, &rbio->bio);
- spin_unlock_irqrestore(&c->read_retry_lock, flags);
- queue_work(c->wq, &c->read_retry_work);
-}
-
-static void cache_promote_done(struct closure *cl)
-{
- struct cache_promote_op *op =
- container_of(cl, struct cache_promote_op, cl);
-
- bch2_bio_free_pages_pool(op->write.op.c, &op->write.wbio.bio);
- kfree(op);
-}
-
/* Inner part that may run in process context */
static void __bch2_read_endio(struct work_struct *work)
{
struct bch_read_bio *rbio =
container_of(work, struct bch_read_bio, work);
- struct bch_fs *c = rbio->c;
int ret;
- ret = bio_checksum_uncompress(c, rbio);
+ ret = bch2_rbio_checksum_uncompress(&bch2_rbio_parent(rbio)->bio, rbio);
if (ret) {
/*
* Checksum error: if the bio wasn't bounced, we may have been
* scribble over) - retry the read, bouncing it this time:
*/
if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
- rbio->flags |= BCH_READ_FORCE_BOUNCE;
- bch2_rbio_retry(c, rbio);
+ rbio->flags |= BCH_READ_MUST_BOUNCE;
+ bch2_rbio_error(rbio, READ_RETRY, ret);
} else {
- bch2_rbio_error(rbio, -EIO);
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, ret);
}
return;
}
- if (rbio->promote) {
- struct cache_promote_op *promote = rbio->promote;
- struct closure *cl = &promote->cl;
-
- BUG_ON(!rbio->split || !rbio->bounce);
-
- trace_promote(&rbio->bio);
-
- /* we now own pages: */
- swap(promote->write.wbio.bio.bi_vcnt, rbio->bio.bi_vcnt);
- rbio->promote = NULL;
-
- bch2_rbio_done(rbio);
+ if (rbio->promote)
+ promote_start(rbio->promote, rbio);
- closure_init(cl, &c->cl);
- closure_call(&promote->write.op.cl, bch2_write, c->wq, cl);
- closure_return_with_destructor(cl, cache_promote_done);
- } else {
+ if (likely(!(rbio->flags & BCH_READ_IN_RETRY)))
bch2_rbio_done(rbio);
- }
}
static void bch2_read_endio(struct bio *bio)
struct bch_read_bio *rbio =
container_of(bio, struct bch_read_bio, bio);
struct bch_fs *c = rbio->c;
+ struct workqueue_struct *wq = NULL;
+
+ percpu_ref_put(&rbio->pick.ca->io_ref);
- if (bch2_dev_nonfatal_io_err_on(bio->bi_error, rbio->ca, "data read")) {
- /* XXX: retry IO errors when we have another replica */
- bch2_rbio_error(rbio, bio->bi_error);
+ if (!rbio->split)
+ rbio->bio.bi_end_io = rbio->end_io;
+
+ if (bch2_dev_io_err_on(bio->bi_error, rbio->pick.ca, "data read")) {
+ bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_error);
return;
}
- if (rbio->ptr.cached &&
+ if (rbio->pick.ptr.cached &&
(((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
- ptr_stale(rbio->ca, &rbio->ptr))) {
- atomic_long_inc(&c->cache_read_races);
+ ptr_stale(rbio->pick.ca, &rbio->pick.ptr))) {
+ atomic_long_inc(&c->read_realloc_races);
if (rbio->flags & BCH_READ_RETRY_IF_STALE)
- bch2_rbio_retry(c, rbio);
+ bch2_rbio_error(rbio, READ_RETRY, -EINTR);
else
- bch2_rbio_error(rbio, -EINTR);
+ bch2_rbio_error(rbio, READ_ERR, -EINTR);
return;
}
- if (rbio->crc.compression_type ||
- bch2_csum_type_is_encryption(rbio->crc.csum_type))
- queue_work(system_unbound_wq, &rbio->work);
- else if (rbio->crc.csum_type)
- queue_work(system_highpri_wq, &rbio->work);
- else
- __bch2_read_endio(&rbio->work);
-}
-
-static bool should_promote(struct bch_fs *c,
- struct extent_pick_ptr *pick, unsigned flags)
-{
- if (!(flags & BCH_READ_PROMOTE))
- return false;
-
- if (percpu_ref_is_dying(&c->writes))
- return false;
+ if (rbio->pick.crc.compression_type ||
+ bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
+ wq = system_unbound_wq;
+ else if (rbio->pick.crc.csum_type)
+ wq = system_highpri_wq;
- return c->fastest_tier &&
- c->fastest_tier < c->tiers + pick->ca->mi.tier;
+ bch2_rbio_punt(rbio, __bch2_read_endio, wq);
}
-void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig,
- struct bvec_iter iter, struct bkey_s_c k,
- struct extent_pick_ptr *pick, unsigned flags)
/*
 * __bch2_read_extent - start the read of one fragment of @orig from a single
 * extent pointer.
 *
 * NOTE(review): this text is a patch with several hunks elided, so parts of
 * the function body are not visible here; everything below describes only
 * what the visible lines show.
 *
 * @c:     filesystem the read is issued against
 * @orig:  read bio the fragment belongs to
 * @iter:  portion of @orig to read; the EBUG_ON below asserts that it lies
 *         entirely within @k
 * @k:     extent key being read from
 * @pick:  device, pointer and crc chosen for this read; copied into
 *         rbio->pick
 * @flags: BCH_READ_* flags; USER_MAPPED, MUST_BOUNCE, MUST_CLONE and IN_RETRY
 *         are tested in the visible code, and @flags is also passed to
 *         should_promote()
 *
 * The read is done on a bounce bio allocated from c->bio_read_split with
 * pages from the pool (when read_full was set, or when a promote op was
 * allocated), or on a clone of @orig when BCH_READ_MUST_CLONE is set. The
 * remaining case (the final else branch) is partly elided here.
 *
 * Without BCH_READ_IN_RETRY the bio is submitted with submit_bio() and 0 is
 * returned. With BCH_READ_IN_RETRY it is submitted with submit_bio_wait(),
 * bch2_read_endio() is then called directly with rbio->process_context set,
 * and rbio->retry is returned; when that value is 0, bch2_rbio_done() is
 * called before returning.
 */
+int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
+ struct bvec_iter iter, struct bkey_s_c k,
+ struct extent_pick_ptr *pick, unsigned flags)
{
struct bch_read_bio *rbio;
- struct cache_promote_op *promote_op = NULL;
+ struct promote_op *promote_op = NULL;
/* Distance, in sectors, from the start of the extent to the start of the fragment */
unsigned skip = iter.bi_sector - bkey_start_offset(k.k);
bool bounce = false, split, read_full = false;
+ int ret = 0;
bch2_increment_clock(c, bio_sectors(&orig->bio), READ);
/* Stamp the bucket being read from with the current hand of the READ prio clock */
+ PTR_BUCKET(pick->ca, &pick->ptr)->prio[READ] = c->prio_clock[READ].hand;
EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector ||
k.k->p.offset < bvec_iter_end_sector(iter));
- /* only promote if we're not reading from the fastest tier: */
-
- /*
- * XXX: multiple promotes can race with each other, wastefully. Keep a
- * list of outstanding promotes?
- */
- if (should_promote(c, pick, flags)) {
- /*
- * biovec needs to be big enough to hold decompressed data, if
- * the bch2_write_extent() has to decompress/recompress it:
- */
- unsigned sectors =
- max_t(unsigned, k.k->size,
- crc_uncompressed_size(NULL, &pick->crc));
- unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS);
-
- promote_op = kmalloc(sizeof(*promote_op) +
- sizeof(struct bio_vec) * pages, GFP_NOIO);
- if (promote_op) {
- struct bio *promote_bio = &promote_op->write.wbio.bio;
-
- bio_init(promote_bio);
- promote_bio->bi_max_vecs = pages;
- promote_bio->bi_io_vec = promote_bio->bi_inline_vecs;
- bounce = true;
- /* could also set read_full */
- }
- }
-
/*
* note: if compression_type and crc_type both == none, then
* compressed/uncompressed size is zero
(bvec_iter_sectors(iter) != crc_uncompressed_size(NULL, &pick->crc) ||
(bch2_csum_type_is_encryption(pick->crc.csum_type) &&
(flags & BCH_READ_USER_MAPPED)) ||
- (flags & BCH_READ_FORCE_BOUNCE)))) {
+ (flags & BCH_READ_MUST_BOUNCE)))) {
read_full = true;
bounce = true;
}
+ if (should_promote(c, pick, flags))
+ promote_op = promote_alloc(c, iter, k, pick, read_full);
+
+ /* could also set read_full */
+ if (promote_op)
+ bounce = true;
+
if (bounce) {
unsigned sectors = read_full
? (crc_compressed_size(NULL, &pick->crc) ?: k.k->size)
: bvec_iter_sectors(iter);
- rbio = container_of(bio_alloc_bioset(GFP_NOIO,
+ rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
DIV_ROUND_UP(sectors, PAGE_SECTORS),
- &c->bio_read_split),
- struct bch_read_bio, bio);
+ &c->bio_read_split));
bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9);
split = true;
- } else if (!(flags & BCH_READ_MAY_REUSE_BIO) ||
- !(flags & BCH_READ_IS_LAST)) {
+ } else if (flags & BCH_READ_MUST_CLONE) {
/*
* Have to clone if there were any splits, due to error
* reporting issues (if a split errored, and retrying didn't
* from the whole bio, in which case we don't want to retry and
* lose the error)
*/
- rbio = container_of(bio_clone_fast(&orig->bio,
- GFP_NOIO, &c->bio_read_split),
- struct bch_read_bio, bio);
+ rbio = rbio_init(bio_clone_fast(&orig->bio,
+ GFP_NOIO, &c->bio_read_split));
rbio->bio.bi_iter = iter;
split = true;
} else {
BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN));
}
- if (!(flags & BCH_READ_IS_LAST))
- __bio_inc_remaining(&orig->bio);
+ rbio->c = c;
/*
 * A split rbio records @orig as its parent; an unsplit one saves @orig's
 * bi_end_io, because bi_end_io is overwritten with bch2_read_endio below.
 */
if (split)
rbio->parent = orig;
else
- rbio->orig_bi_end_io = orig->bio.bi_end_io;
- rbio->parent_iter = iter;
+ rbio->end_io = orig->bio.bi_end_io;
+ rbio->bvec_iter = iter;
rbio->flags = flags;
rbio->bounce = bounce;
rbio->split = split;
- rbio->c = c;
- rbio->ca = pick->ca;
- rbio->ptr = pick->ptr;
- rbio->crc = pick->crc;
+ rbio->process_context = false;
+ rbio->retry = 0;
+ rbio->pick = *pick;
/*
* crc.compressed_size will be 0 if there wasn't any checksum
* information, also we need to stash the original size of the bio if we
* bounced (which isn't necessarily the original key size, if we bounced
* only for promoting)
*/
- rbio->crc._compressed_size = bio_sectors(&rbio->bio) - 1;
+ rbio->pick.crc._compressed_size = bio_sectors(&rbio->bio) - 1;
rbio->version = k.k->version;
rbio->promote = promote_op;
rbio->inode = k.k->p.inode;
/*
 * NOTE(review): the work function is initialised to NULL here; presumably it
 * is assigned before the work item is queued - confirm against the
 * completion path.
 */
- INIT_WORK(&rbio->work, __bch2_read_endio);
+ INIT_WORK(&rbio->work, NULL);
rbio->bio.bi_bdev = pick->ca->disk_sb.bdev;
rbio->bio.bi_opf = orig->bio.bi_opf;
rbio->bio.bi_iter.bi_sector = pick->ptr.offset;
rbio->bio.bi_end_io = bch2_read_endio;
- if (promote_op) {
- struct bio *promote_bio = &promote_op->write.wbio.bio;
-
- promote_bio->bi_iter = rbio->bio.bi_iter;
- memcpy(promote_bio->bi_io_vec, rbio->bio.bi_io_vec,
- sizeof(struct bio_vec) * rbio->bio.bi_vcnt);
-
- bch2_migrate_write_init(c, &promote_op->write,
- &c->promote_write_point,
- k, NULL,
- BCH_WRITE_ALLOC_NOWAIT|
- BCH_WRITE_CACHED);
- promote_op->write.promote = true;
-
- if (rbio->crc.compression_type) {
- promote_op->write.op.flags |= BCH_WRITE_DATA_COMPRESSED;
- promote_op->write.op.crc = rbio->crc;
- promote_op->write.op.size = k.k->size;
- } else if (read_full) {
- /*
- * Adjust bio to correspond to _live_ portion of @k -
- * which might be less than what we're actually reading:
- */
- bio_advance(promote_bio, rbio->crc.offset << 9);
- BUG_ON(bio_sectors(promote_bio) < k.k->size);
- promote_bio->bi_iter.bi_size = k.k->size << 9;
- } else {
- /*
- * Set insert pos to correspond to what we're actually
- * reading:
- */
- promote_op->write.op.pos.offset = iter.bi_sector;
- }
-
- promote_bio->bi_iter.bi_sector =
- promote_op->write.op.pos.offset;
- }
-
- /* _after_ promete stuff has looked at rbio->crc.offset */
/*
 * When read_full is set, @skip is added to the crc offset; otherwise it is
 * added to the starting sector of the read.
 */
if (read_full)
- rbio->crc.offset += skip;
+ rbio->pick.crc.offset += skip;
else
rbio->bio.bi_iter.bi_sector += skip;
if (bounce)
trace_read_bounce(&rbio->bio);
- if (!(flags & BCH_READ_IS_LAST))
- trace_read_split(&rbio->bio);
/* Add the size of this read to the device's per-cpu READ / BCH_DATA_USER sector counter */
+ this_cpu_add(pick->ca->io_done->sectors[READ][BCH_DATA_USER],
+ bio_sectors(&rbio->bio));
+
/*
 * Normal path: submit and return 0. With BCH_READ_IN_RETRY: wait for the IO,
 * call the endio function by hand with process_context set, and hand
 * rbio->retry back to the caller.
 */
+ if (likely(!(flags & BCH_READ_IN_RETRY))) {
+ submit_bio(&rbio->bio);
+ } else {
+ submit_bio_wait(&rbio->bio);
+
+ rbio->process_context = true;
+ bch2_read_endio(&rbio->bio);
- generic_make_request(&rbio->bio);
+ ret = rbio->retry;
+ if (!ret)
+ bch2_rbio_done(rbio);
+ }
+
+ return ret;
}
-static void bch2_read_iter(struct bch_fs *c, struct bch_read_bio *rbio,
- struct bvec_iter bvec_iter, u64 inode,
- unsigned flags)
/*
 * __bch2_read - read the range described by @bvec_iter of @rbio from inode
 * @inode.
 *
 * NOTE(review): this text is a patch with hunks elided, so parts of the loop
 * body are not visible here; the description covers only the visible lines.
 *
 * Walks BTREE_ID_EXTENTS starting at POS(@inode, bvec_iter.bi_sector) with
 * BTREE_ITER_WITH_HOLES. For each key, "fragment" is the part of the
 * remaining range that ends at the end of that key (or at the end of the
 * range, whichever comes first):
 *
 *  - if bch2_extent_pick_ptr() leaves an error pointer in pick.ca, the error
 *    "no device to read from" is reported, @rbio is completed and the
 *    function returns;
 *  - if pick.ca is non-NULL, the fragment is passed to __bch2_read_extent();
 *  - otherwise the fragment is zero filled, and @rbio is completed if this
 *    was the last fragment.
 *
 * A fragment that does not cover all of the remaining range calls
 * bio_inc_remaining() on @rbio and sets BCH_READ_MUST_CLONE in @flags; the
 * flag is never cleared, so it also applies to all following fragments.
 *
 * Handling of the __bch2_read_extent() return value:
 *   READ_RETRY_AVOID - set the picked device's bit in @avoid, then fall
 *                      through (there is no break between the two cases) to
 *   READ_RETRY       - jump back to the retry label, restarting the btree
 *                      walk at the current position of @bvec_iter
 *   READ_ERR         - complete @rbio and return
 *   any other value  - continue with the loop
 *
 * NOTE(review): a fragment re-entered through "goto retry" runs
 * bio_inc_remaining() again if it is still not the last fragment - confirm
 * that the retry path accounts for the earlier increment.
 *
 * Leaving the loop without having handled the last fragment is reported as a
 * "btree IO error" and @rbio is completed.
 *
 * @avoid is passed to bch2_extent_pick_ptr() and is dereferenced on
 * READ_RETRY_AVOID.
 */
+void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
+ struct bvec_iter bvec_iter, u64 inode,
+ struct bch_devs_mask *avoid, unsigned flags)
{
- struct bio *bio = &rbio->bio;
struct btree_iter iter;
struct bkey_s_c k;
int ret;
-
- for_each_btree_key_with_holes(&iter, c, BTREE_ID_EXTENTS,
- POS(inode, bvec_iter.bi_sector), k) {
/* Retries restart the walk from wherever bvec_iter currently points */
+retry:
+ for_each_btree_key(&iter, c, BTREE_ID_EXTENTS,
+ POS(inode, bvec_iter.bi_sector),
+ BTREE_ITER_WITH_HOLES, k) {
BKEY_PADDED(k) tmp;
struct extent_pick_ptr pick;
- unsigned bytes, sectors;
- bool is_last;
+ struct bvec_iter fragment;
/*
* Unlock the iterator while the btree node's lock is still in
k = bkey_i_to_s_c(&tmp.k);
bch2_btree_iter_unlock(&iter);
- bch2_extent_pick_ptr(c, k, &pick);
+ bch2_extent_pick_ptr(c, k, avoid, &pick);
if (IS_ERR(pick.ca)) {
- bcache_io_error(c, bio, "no device to read from");
- bio_endio(bio);
+ bcache_io_error(c, &rbio->bio, "no device to read from");
+ bio_endio(&rbio->bio);
return;
}
- sectors = min_t(u64, k.k->p.offset,
- bvec_iter_end_sector(bvec_iter)) -
- bvec_iter.bi_sector;
- bytes = sectors << 9;
- is_last = bytes == bvec_iter.bi_size;
- swap(bvec_iter.bi_size, bytes);
-
- if (is_last)
- flags |= BCH_READ_IS_LAST;
+ fragment = bvec_iter;
+ fragment.bi_size = (min_t(u64, k.k->p.offset,
+ bvec_iter_end_sector(bvec_iter)) -
+ bvec_iter.bi_sector) << 9;
if (pick.ca) {
- PTR_BUCKET(pick.ca, &pick.ptr)->read_prio =
- c->prio_clock[READ].hand;
-
- bch2_read_extent_iter(c, rbio, bvec_iter,
- k, &pick, flags);
+ if (fragment.bi_size != bvec_iter.bi_size) {
+ bio_inc_remaining(&rbio->bio);
+ flags |= BCH_READ_MUST_CLONE;
+ trace_read_split(&rbio->bio);
+ }
- flags &= ~BCH_READ_MAY_REUSE_BIO;
+ ret = __bch2_read_extent(c, rbio, fragment,
+ k, &pick, flags);
+ switch (ret) {
+ case READ_RETRY_AVOID:
+ __set_bit(pick.ca->dev_idx, avoid->d);
+ case READ_RETRY:
+ goto retry;
+ case READ_ERR:
+ bio_endio(&rbio->bio);
+ return;
+ };
} else {
- zero_fill_bio_iter(bio, bvec_iter);
+ zero_fill_bio_iter(&rbio->bio, fragment);
- if (is_last)
- bio_endio(bio);
+ if (fragment.bi_size == bvec_iter.bi_size)
+ bio_endio(&rbio->bio);
}
- if (is_last)
+ if (fragment.bi_size == bvec_iter.bi_size)
return;
- swap(bvec_iter.bi_size, bytes);
- bio_advance_iter(bio, &bvec_iter, bytes);
+ bio_advance_iter(&rbio->bio, &bvec_iter, fragment.bi_size);
}
/*
*/
/*
 * Reaching this point is treated as an error: the BUG_ON asserts that the
 * unlock returned nonzero, and that value is reported as a btree IO error.
 */
ret = bch2_btree_iter_unlock(&iter);
BUG_ON(!ret);
- bcache_io_error(c, bio, "btree IO error %i", ret);
- bio_endio(bio);
-}
-
-void bch2_read(struct bch_fs *c, struct bch_read_bio *bio, u64 inode)
-{
- bch2_read_iter(c, bio, bio->bio.bi_iter, inode,
- BCH_READ_RETRY_IF_STALE|
- BCH_READ_PROMOTE|
- BCH_READ_MAY_REUSE_BIO|
- BCH_READ_USER_MAPPED);
-}
-
-/**
- * bch_read_retry - re-submit a bio originally from bch2_read()
- */
-static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio)
-{
- struct bch_read_bio *parent = bch2_rbio_parent(rbio);
- struct bvec_iter iter = rbio->parent_iter;
- unsigned flags = rbio->flags;
- u64 inode = rbio->inode;
-
- trace_read_retry(&rbio->bio);
-
- if (rbio->split)
- bch2_rbio_free(rbio);
- else
- rbio->bio.bi_end_io = rbio->orig_bi_end_io;
-
- bch2_read_iter(c, parent, iter, inode, flags);
-}
-
-void bch2_read_retry_work(struct work_struct *work)
-{
- struct bch_fs *c = container_of(work, struct bch_fs,
- read_retry_work);
- struct bch_read_bio *rbio;
- struct bio *bio;
- unsigned long flags;
-
- while (1) {
- spin_lock_irqsave(&c->read_retry_lock, flags);
- bio = bio_list_pop(&c->read_retry_list);
- spin_unlock_irqrestore(&c->read_retry_lock, flags);
-
- if (!bio)
- break;
-
- rbio = container_of(bio, struct bch_read_bio, bio);
- bch2_read_retry(c, rbio);
- }
+ bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
+ bio_endio(&rbio->bio);
}