X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fio.c;h=0bc72d2a4dd4cf09bc6b11b9e22a8afcb43977f8;hb=49ba8d0ef6133487559bdc73f2afc87fbea85fe0;hp=17ea38e42ae850cbdc6cf462b25376aef3622537;hpb=92d34f6ed29e90d48c40a4c31816df805edfe483;p=bcachefs-tools-debian

diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 17ea38e..0bc72d2 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -7,8 +7,9 @@
  */
 
 #include "bcachefs.h"
+#include "alloc_background.h"
 #include "alloc_foreground.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
 #include "bset.h"
 #include "btree_update.h"
 #include "buckets.h"
@@ -26,14 +27,23 @@
 #include "keylist.h"
 #include "move.h"
 #include "rebalance.h"
+#include "subvolume.h"
 #include "super.h"
 #include "super-io.h"
 
 #include <linux/blkdev.h>
 #include <linux/random.h>
+#include <linux/sched/mm.h>
 
 #include <trace/events/bcachefs.h>
 
+const char *bch2_blk_status_to_str(blk_status_t status)
+{
+	if (status == BLK_STS_REMOVED)
+		return "device removed";
+	return blk_status_to_str(status);
+}
+
 static bool bch2_target_congested(struct bch_fs *c, u16 target)
 {
 	const struct bch_devs_mask *devs;
@@ -46,7 +56,9 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target)
 		return false;
 
 	rcu_read_lock();
-	devs = bch2_target_to_mask(c, target);
+	devs = bch2_target_to_mask(c, target) ?:
+		&c->rw_devs[BCH_DATA_user];
+
 	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
 		ca = rcu_dereference(c->devs[d]);
 		if (!ca)
@@ -109,7 +121,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
 		 * the time:
 		 */
 		if (abs((int) (old - io_latency)) < (old >> 1) &&
-		    now & ~(~0 << 5))
+		    now & ~(~0U << 5))
 			break;
 
 		new = ewma_add(old, io_latency, 5);
@@ -160,7 +172,7 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
 
 	while (size) {
 		struct page *page = __bio_alloc_page_pool(c, &using_mempool);
-		unsigned len = min(PAGE_SIZE, size);
+		unsigned len = min_t(size_t, PAGE_SIZE, size);
 
 		BUG_ON(!bio_add_page(bio, page, len, 0));
 		size -= len;
@@ -172,39 +184,48 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
 
 /* Extent update path: */
 
-static int sum_sector_overwrites(struct btree_trans *trans,
-				 struct btree_iter *extent_iter,
-				 struct bkey_i *new,
-				 bool may_allocate,
-				 bool *maybe_extending,
-				 s64 *delta)
+int bch2_sum_sector_overwrites(struct btree_trans *trans,
+			       struct btree_iter *extent_iter,
+			       struct bkey_i *new,
+			       bool *maybe_extending,
+			       bool *usage_increasing,
+			       s64 *i_sectors_delta,
+			       s64 *disk_sectors_delta)
 {
-	struct btree_iter *iter;
+	struct bch_fs *c = trans->c;
+	struct btree_iter iter;
 	struct bkey_s_c old;
+	unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
+	bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
 	int ret = 0;
 
-	*maybe_extending = true;
-	*delta = 0;
+	*maybe_extending	= true;
+	*usage_increasing	= false;
+	*i_sectors_delta	= 0;
+	*disk_sectors_delta	= 0;
 
-	iter = bch2_trans_copy_iter(trans, extent_iter);
-	if (IS_ERR(iter))
-		return PTR_ERR(iter);
+	bch2_trans_copy_iter(&iter, extent_iter);
 
 	for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) {
-		if (!may_allocate &&
-		    bch2_bkey_nr_ptrs_fully_allocated(old) <
-		    bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) {
-			ret = -ENOSPC;
-			break;
-		}
+		s64 sectors = min(new->k.p.offset, old.k->p.offset) -
+			max(bkey_start_offset(&new->k),
+			    bkey_start_offset(old.k));
 
-		*delta += (min(new->k.p.offset,
-			       old.k->p.offset) -
-			   max(bkey_start_offset(&new->k),
-			       bkey_start_offset(old.k))) *
+		*i_sectors_delta += sectors *
 			(bkey_extent_is_allocation(&new->k) -
 			 bkey_extent_is_allocation(old.k));
 
+
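/*
 * Illustrative sketch, not part of the patch: the deltas accumulated in
 * bch2_sum_sector_overwrites() above are driven by how many sectors of each
 * existing key the new extent overwrites. The helper below is hypothetical
 * and only spells out that interval arithmetic (half-open sector ranges).
 */
static inline s64 overlapping_sectors_sketch(u64 new_start, u64 new_end,
					     u64 old_start, u64 old_end)
{
	u64 start = max(new_start, old_start);
	u64 end   = min(new_end, old_end);

	/* the iteration only visits keys that overlap, so end > start here */
	return (s64) (end - start);
}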
*disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)); + *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot + ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old) + : 0; + + if (!*usage_increasing && + (new->k.p.snapshot != old.k->p.snapshot || + new_replicas > bch2_bkey_replicas(c, old) || + (!new_compressed && bch2_bkey_sectors_compressed(old)))) + *usage_increasing = true; + if (bkey_cmp(old.k->p, new->k.p) >= 0) { /* * Check if there's already data above where we're @@ -216,8 +237,12 @@ static int sum_sector_overwrites(struct btree_trans *trans, * writing to, because i_size could be up to one block * less: */ - if (!bkey_cmp(old.k->p, new->k.p)) - old = bch2_btree_iter_next(iter); + if (!bkey_cmp(old.k->p, new->k.p)) { + old = bch2_btree_iter_next(&iter); + ret = bkey_err(old); + if (ret) + break; + } if (old.k && !bkey_err(old) && old.k->p.inode == extent_iter->pos.inode && @@ -228,46 +253,74 @@ static int sum_sector_overwrites(struct btree_trans *trans, } } - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } int bch2_extent_update(struct btree_trans *trans, + subvol_inum inum, struct btree_iter *iter, struct bkey_i *k, struct disk_reservation *disk_res, u64 *journal_seq, u64 new_i_size, - s64 *i_sectors_delta) + s64 *i_sectors_delta_total, + bool check_enospc) { /* this must live until after bch2_trans_commit(): */ struct bkey_inode_buf inode_p; - bool extending = false; - s64 delta = 0; + struct bpos next_pos; + bool extending = false, usage_increasing; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; int ret; - ret = bch2_extent_trim_atomic(k, iter); + /* + * This traverses us the iterator without changing iter->path->pos to + * search_key() (which is pos + 1 for extents): we want there to be a + * path already traversed at iter->pos because + * bch2_trans_extent_update() will use it to attempt extent merging + */ + ret = __bch2_btree_iter_traverse(iter); + if (ret) + return ret; + + ret = bch2_extent_trim_atomic(trans, iter, k); if (ret) return ret; - ret = sum_sector_overwrites(trans, iter, k, - disk_res && disk_res->sectors != 0, - &extending, &delta); + ret = bch2_sum_sector_overwrites(trans, iter, k, + &extending, + &usage_increasing, + &i_sectors_delta, + &disk_sectors_delta); if (ret) return ret; + if (!usage_increasing) + check_enospc = false; + + if (disk_res && + disk_sectors_delta > (s64) disk_res->sectors) { + ret = bch2_disk_reservation_add(trans->c, disk_res, + disk_sectors_delta - disk_res->sectors, + !check_enospc + ? BCH_DISK_RESERVATION_NOFAIL : 0); + if (ret) + return ret; + } + new_i_size = extending ? 
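/*
 * Illustrative sketch, not part of the patch: bch2_extent_update() tracks two
 * different deltas. i_sectors_delta is the change in the inode's logical
 * allocated sectors; disk_sectors_delta is the change in on-disk sectors
 * across all replicas. E.g. overwriting 8 sectors of a 2x-replicated extent
 * with a 3x-replicated one gives i_sectors_delta = 0 but disk_sectors_delta =
 * 8 * 3 - 8 * 2 = +8. When the caller's reservation is too small, the
 * shortfall is reserved as above - condensed here into a hypothetical helper:
 */
static int reservation_topup_sketch(struct bch_fs *c,
				    struct disk_reservation *res,
				    s64 disk_sectors_delta, bool check_enospc)
{
	if (disk_sectors_delta <= (s64) res->sectors)
		return 0;

	return bch2_disk_reservation_add(c, res,
				disk_sectors_delta - res->sectors,
				check_enospc ? 0 : BCH_DISK_RESERVATION_NOFAIL);
}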
min(k->k.p.offset << 9, new_i_size) : 0; - if (delta || new_i_size) { - struct btree_iter *inode_iter; + if (i_sectors_delta || new_i_size) { + struct btree_iter inode_iter; struct bch_inode_unpacked inode_u; - inode_iter = bch2_inode_peek(trans, &inode_u, - k->k.p.inode, BTREE_ITER_INTENT); - if (IS_ERR(inode_iter)) - return PTR_ERR(inode_iter); + ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, + BTREE_ITER_INTENT); + if (ret) + return ret; /* * XXX: @@ -287,45 +340,71 @@ int bch2_extent_update(struct btree_trans *trans, else new_i_size = 0; - inode_u.bi_sectors += delta; + inode_u.bi_sectors += i_sectors_delta; + + if (i_sectors_delta || new_i_size) { + bch2_inode_pack(trans->c, &inode_p, &inode_u); + + inode_p.inode.k.p.snapshot = iter->snapshot; - if (delta || new_i_size) { - bch2_inode_pack(&inode_p, &inode_u); - bch2_trans_update(trans, inode_iter, - &inode_p.inode.k_i); + ret = bch2_trans_update(trans, &inode_iter, + &inode_p.inode.k_i, 0); } - bch2_trans_iter_put(trans, inode_iter); + bch2_trans_iter_exit(trans, &inode_iter); + + if (ret) + return ret; } - bch2_trans_update(trans, iter, k); + next_pos = k->k.p; - ret = bch2_trans_commit(trans, disk_res, journal_seq, + ret = bch2_trans_update(trans, iter, k, 0) ?: + bch2_trans_commit(trans, disk_res, journal_seq, BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_ATOMIC| - BTREE_INSERT_USE_RESERVE); - if (!ret && i_sectors_delta) - *i_sectors_delta += delta; + BTREE_INSERT_NOFAIL); + BUG_ON(ret == -ENOSPC); + if (ret) + return ret; - return ret; + bch2_btree_iter_set_pos(iter, next_pos); + + if (i_sectors_delta_total) + *i_sectors_delta_total += i_sectors_delta; + return 0; } +/* + * Returns -EINTR if we had to drop locks: + */ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, - struct bpos end, u64 *journal_seq, - s64 *i_sectors_delta) + subvol_inum inum, u64 end, + u64 *journal_seq, s64 *i_sectors_delta) { struct bch_fs *c = trans->c; unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + struct bpos end_pos = POS(inum.inum, end); struct bkey_s_c k; int ret = 0, ret2 = 0; + u32 snapshot; - while ((k = bch2_btree_iter_peek(iter)).k && - bkey_cmp(iter->pos, end) < 0) { + while (1) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); struct bkey_i delete; + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto btree_err; + + bch2_btree_iter_set_snapshot(iter, snapshot); + + k = bch2_btree_iter_peek(iter); + if (bkey_cmp(iter->pos, end_pos) >= 0) + break; + ret = bkey_err(k); if (ret) goto btree_err; @@ -335,13 +414,11 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, /* create the biggest key we can */ bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end, &delete); - - bch2_trans_begin_updates(trans); + bch2_cut_back(end_pos, &delete); - ret = bch2_extent_update(trans, iter, &delete, + ret = bch2_extent_update(trans, inum, iter, &delete, &disk_res, journal_seq, - 0, i_sectors_delta); + 0, i_sectors_delta, false); bch2_disk_reservation_put(c, &disk_res); btree_err: if (ret == -EINTR) { @@ -352,76 +429,88 @@ btree_err: break; } - if (bkey_cmp(iter->pos, end) > 0) { - bch2_btree_iter_set_pos(iter, end); - ret = bch2_btree_iter_traverse(iter); - } + if (bkey_cmp(iter->pos, end_pos) > 0) + bch2_btree_iter_set_pos(iter, end_pos); return ret ?: ret2; } -int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, +int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 
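/*
 * Illustrative sketch, not part of the patch: bch2_fpunch_at() above follows
 * the usual btree_trans restart pattern - begin the transaction, re-derive any
 * state that depends on locks (here, the snapshot ID), do one step of work,
 * and treat -EINTR as "locks were dropped, go around again".
 * punch_one_step() is a hypothetical stand-in for the loop body.
 */
static int restart_loop_sketch(struct btree_trans *trans, bool *done)
{
	int ret;

	do {
		bch2_trans_begin(trans);
		ret = punch_one_step(trans, done);	/* hypothetical */
	} while (ret == -EINTR || (!ret && !*done));

	return ret;
}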
end, u64 *journal_seq, s64 *i_sectors_delta) { struct btree_trans trans; - struct btree_iter *iter; - int ret = 0; + struct btree_iter iter; + int ret; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - POS(inum, start), - BTREE_ITER_INTENT); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + POS(inum.inum, start), + BTREE_ITER_INTENT); - ret = bch2_fpunch_at(&trans, iter, POS(inum, end), + ret = bch2_fpunch_at(&trans, &iter, inum, end, journal_seq, i_sectors_delta); - bch2_trans_exit(&trans); - if (ret == -EINTR) - ret = 0; + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); - return ret; + return ret == -EINTR ? 0 : ret; } int bch2_write_index_default(struct bch_write_op *op) { struct bch_fs *c = op->c; - struct bkey_on_stack sk; + struct bkey_buf sk; struct keylist *keys = &op->insert_keys; struct bkey_i *k = bch2_keylist_front(keys); struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; + subvol_inum inum = { + .subvol = op->subvol, + .inum = k->k.p.inode, + }; int ret; - bkey_on_stack_init(&sk); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + BUG_ON(!inum.subvol); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - bkey_start_pos(&k->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); do { + bch2_trans_begin(&trans); + k = bch2_keylist_front(keys); + bch2_bkey_buf_copy(&sk, c, k); - bkey_on_stack_realloc(&sk, c, k->k.u64s); - bkey_copy(sk.k, k); - bch2_cut_front(iter->pos, sk.k); + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, + &sk.k->k.p.snapshot); + if (ret == -EINTR) + continue; + if (ret) + break; - bch2_trans_begin_updates(&trans); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bkey_start_pos(&sk.k->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - ret = bch2_extent_update(&trans, iter, sk.k, + ret = bch2_extent_update(&trans, inum, &iter, sk.k, &op->res, op_journal_seq(op), - op->new_i_size, &op->i_sectors_delta); + op->new_i_size, &op->i_sectors_delta, + op->flags & BCH_WRITE_CHECK_ENOSPC); + bch2_trans_iter_exit(&trans, &iter); + if (ret == -EINTR) continue; if (ret) break; - if (bkey_cmp(iter->pos, k->k.p) >= 0) - bch2_keylist_pop_front(keys); + if (bkey_cmp(iter.pos, k->k.p) >= 0) + bch2_keylist_pop_front(&op->insert_keys); + else + bch2_cut_front(iter.pos, k); } while (!bch2_keylist_empty(keys)); bch2_trans_exit(&trans); - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); return ret; } @@ -464,13 +553,11 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, n->c = c; n->dev = ptr->dev; - n->have_ioref = bch2_dev_get_ioref(ca, WRITE); + n->have_ioref = bch2_dev_get_ioref(ca, + type == BCH_DATA_btree ? 
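/*
 * Illustrative sketch, not part of the patch: bch2_write_index_default() above
 * drains the keylist one extent update at a time. If the update only got part
 * of the front key inserted (the iterator stopped short of k->k.p), the
 * inserted prefix is trimmed off and the remainder retried; otherwise the key
 * is dropped from the list. Condensed form of that bookkeeping:
 */
static void keylist_advance_sketch(struct keylist *keys,
				   struct bpos inserted_up_to)
{
	struct bkey_i *k = bch2_keylist_front(keys);

	if (bkey_cmp(inserted_up_to, k->k.p) >= 0)
		bch2_keylist_pop_front(keys);		/* fully inserted */
	else
		bch2_cut_front(inserted_up_to, k);	/* retry the rest */
}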
READ : WRITE); n->submit_time = local_clock(); n->bio.bi_iter.bi_sector = ptr->offset; - if (!journal_flushes_device(ca)) - n->bio.bi_opf |= REQ_FUA; - if (likely(n->have_ioref)) { this_cpu_add(ca->io_done->sectors[WRITE][type], bio_sectors(&n->bio)); @@ -494,19 +581,19 @@ static void bch2_write_done(struct closure *cl) if (!op->error && (op->flags & BCH_WRITE_FLUSH)) op->error = bch2_journal_error(&c->journal); - if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) - bch2_disk_reservation_put(c, &op->res); + bch2_disk_reservation_put(c, &op->res); percpu_ref_put(&c->writes); bch2_keylist_free(&op->insert_keys, op->inline_keys); bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); - if (op->end_io) + if (op->end_io) { + EBUG_ON(cl->parent); + closure_debug_destroy(cl); op->end_io(op); - if (cl->parent) + } else { closure_return(cl); - else - closure_debug_destroy(cl); + } } /** @@ -546,9 +633,14 @@ static void __bch2_write_index(struct bch_write_op *op) * particularly want to plumb io_opts all the way through the btree * update stack right now */ - for_each_keylist_key(keys, k) + for_each_keylist_key(keys, k) { bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); + if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k))) + bch2_check_set_feature(op->c, BCH_FEATURE_incompressible); + + } + if (!bch2_keylist_empty(keys)) { u64 sectors_start = keylist_sectors(keys); int ret = op->index_update_fn(op); @@ -559,7 +651,8 @@ static void __bch2_write_index(struct bch_write_op *op) op->written += sectors_start - keylist_sectors(keys); if (ret) { - __bcache_io_error(c, "btree IO error %i", ret); + bch_err_inum_ratelimited(c, op->pos.inode, + "write error %i from btree update", ret); op->error = ret; } } @@ -583,7 +676,9 @@ static void bch2_write_index(struct closure *cl) __bch2_write_index(op); - if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { + if (!(op->flags & BCH_WRITE_DONE)) { + continue_at(cl, __bch2_write, index_update_wq(op)); + } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { bch2_journal_flush_seq_async(&c->journal, *op_journal_seq(op), cl); @@ -602,7 +697,11 @@ static void bch2_write_endio(struct bio *bio) struct bch_fs *c = wbio->c; struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); - if (bch2_dev_io_err_on(bio->bi_status, ca, "data write")) + if (bch2_dev_inum_io_err_on(bio->bi_status, ca, + op->pos.inode, + op->pos.offset - bio_sectors(bio), /* XXX definitely wrong */ + "data write error: %s", + bch2_blk_status_to_str(bio->bi_status))) set_bit(wbio->dev, op->failed.d); if (wbio->have_ioref) { @@ -784,8 +883,9 @@ static enum prep_encoded_ret { /* Can we just write the entire extent as is? 
*/ if (op->crc.uncompressed_size == op->crc.live_size && op->crc.compressed_size <= wp->sectors_free && - op->crc.compression_type == op->compression_type) { - if (!op->crc.compression_type && + (op->crc.compression_type == op->compression_type || + op->incompressible)) { + if (!crc_is_compressed(op->crc) && op->csum_type != op->crc.csum_type && bch2_write_rechecksum(c, op, op->csum_type)) return PREP_ENCODED_CHECKSUM_ERR; @@ -797,7 +897,7 @@ static enum prep_encoded_ret { * If the data is compressed and we couldn't write the entire extent as * is, we have to decompress it: */ - if (op->crc.compression_type) { + if (crc_is_compressed(op->crc)) { struct bch_csum csum; if (bch2_write_decrypt(op)) @@ -864,6 +964,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, ret = -EIO; goto err; case PREP_ENCODED_CHECKSUM_ERR: + BUG(); goto csum_err; case PREP_ENCODED_DO_WRITE: /* XXX look for bug here */ @@ -908,11 +1009,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, bch2_csum_type_is_encryption(op->crc.csum_type)); BUG_ON(op->compression_type && !bounce); - crc.compression_type = op->compression_type - ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, - op->compression_type) + crc.compression_type = op->incompressible + ? BCH_COMPRESSION_TYPE_incompressible + : op->compression_type + ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, + op->compression_type) : 0; - if (!crc.compression_type) { + if (!crc_is_compressed(crc)) { dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); @@ -933,7 +1036,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, if (bch2_csum_type_is_encryption(op->csum_type)) { if (bversion_zero(version)) { - version.lo = atomic64_inc_return(&c->key_version) + 1; + version.lo = atomic64_inc_return(&c->key_version); } else { crc.nonce = op->nonce; op->nonce += src_len >> 9; @@ -941,7 +1044,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, } if ((op->flags & BCH_WRITE_DATA_ENCODED) && - !crc.compression_type && + !crc_is_compressed(crc) && bch2_csum_type_is_encryption(op->crc.csum_type) == bch2_csum_type_is_encryption(op->csum_type)) { /* @@ -1040,7 +1143,10 @@ static void __bch2_write(struct closure *cl) struct write_point *wp; struct bio *bio; bool skip_put = true; + unsigned nofs_flags; int ret; + + nofs_flags = memalloc_nofs_save(); again: memset(&op->failed, 0, sizeof(op->failed)); @@ -1060,6 +1166,17 @@ again: BKEY_EXTENT_U64s_MAX)) goto flush_io; + if ((op->flags & BCH_WRITE_FROM_INTERNAL) && + percpu_ref_is_dying(&c->writes)) { + ret = -EROFS; + goto err; + } + + /* + * The copygc thread is now global, which means it's no longer + * freeing up space on specific disks, which means that + * allocations for specific disks may hang arbitrarily long: + */ wp = bch2_alloc_sectors_start(c, op->target, op->opts.erasure_code, @@ -1069,7 +1186,8 @@ again: op->nr_replicas_required, op->alloc_reserve, op->flags, - (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl); + (op->flags & (BCH_WRITE_ALLOC_NOWAIT| + BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? 
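/*
 * Illustrative note, not part of the patch: the compression_type checks in
 * this file are replaced with crc_is_compressed(), which also treats the new
 * BCH_COMPRESSION_TYPE_incompressible marker as "not compressed" - the data is
 * stored as-is, the type just records that compressing it wasn't worth it.
 * The real helper lives elsewhere in the tree; it amounts to roughly:
 */
static inline bool crc_is_compressed_sketch(struct bch_extent_crc_unpacked crc)
{
	return crc.compression_type != BCH_COMPRESSION_TYPE_none &&
	       crc.compression_type != BCH_COMPRESSION_TYPE_incompressible;
}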
NULL : cl); EBUG_ON(!wp); if (unlikely(IS_ERR(wp))) { @@ -1081,6 +1199,16 @@ again: goto flush_io; } + /* + * It's possible for the allocator to fail, put us on the + * freelist waitlist, and then succeed in one of various retry + * paths: if that happens, we need to disable the skip_put + * optimization because otherwise there won't necessarily be a + * barrier before we free the bch_write_op: + */ + if (atomic_read(&cl->remaining) & CLOSURE_WAITING) + skip_put = false; + bch2_open_bucket_get(c, wp, &op->open_buckets); ret = bch2_write_extent(op, wp, &bio); bch2_alloc_sectors_done(c, wp); @@ -1088,8 +1216,15 @@ again: if (ret < 0) goto err; - if (ret) + if (ret) { skip_put = false; + } else { + /* + * for the skip_put optimization this has to be set + * before we submit the bio: + */ + op->flags |= BCH_WRITE_DONE; + } bio->bi_end_io = bch2_write_endio; bio->bi_private = &op->cl; @@ -1103,27 +1238,43 @@ again: key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); - bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER, + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, key_to_write); } while (ret); if (!skip_put) continue_at(cl, bch2_write_index, index_update_wq(op)); +out: + memalloc_nofs_restore(nofs_flags); return; err: op->error = ret; + op->flags |= BCH_WRITE_DONE; continue_at(cl, bch2_write_index, index_update_wq(op)); - return; + goto out; flush_io: + /* + * If the write can't all be submitted at once, we generally want to + * block synchronously as that signals backpressure to the caller. + * + * However, if we're running out of a workqueue, we can't block here + * because we'll be blocking other work items from completing: + */ + if (current->flags & PF_WQ_WORKER) { + continue_at(cl, bch2_write_index, index_update_wq(op)); + goto out; + } + closure_sync(cl); if (!bch2_keylist_empty(&op->insert_keys)) { __bch2_write_index(op); if (op->error) { + op->flags |= BCH_WRITE_DONE; continue_at_nobarrier(cl, bch2_write_done, NULL); - return; + goto out; } } @@ -1139,7 +1290,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) unsigned sectors; int ret; - bch2_check_set_feature(op->c, BCH_FEATURE_INLINE_DATA); + bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, ARRAY_SIZE(op->inline_keys), @@ -1167,6 +1318,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) bch2_keylist_push(&op->insert_keys); op->flags |= BCH_WRITE_WROTE_DATA_INLINE; + op->flags |= BCH_WRITE_DONE; + continue_at_nobarrier(cl, bch2_write_index, NULL); return; err: @@ -1205,14 +1358,14 @@ void bch2_write(struct closure *cl) wbio_init(bio)->put_bio = false; if (bio_sectors(bio) & (c->opts.block_size - 1)) { - __bcache_io_error(c, "misaligned write"); + bch_err_inum_ratelimited(c, op->pos.inode, + "misaligned write"); op->error = -EIO; goto err; } if (c->opts.nochanges || !percpu_ref_tryget(&c->writes)) { - __bcache_io_error(c, "read only"); op->error = -EROFS; goto err; } @@ -1231,14 +1384,15 @@ void bch2_write(struct closure *cl) continue_at_nobarrier(cl, __bch2_write, NULL); return; err: - if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) - bch2_disk_reservation_put(c, &op->res); - if (op->end_io) + bch2_disk_reservation_put(c, &op->res); + + if (op->end_io) { + EBUG_ON(cl->parent); + closure_debug_destroy(cl); op->end_io(op); - if (cl->parent) + } else { closure_return(cl); - else - closure_debug_destroy(cl); + } } /* Cache promotion on read */ @@ -1330,12 +1484,13 @@ 
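/*
 * Illustrative sketch, not part of the patch: __bch2_write() above brackets
 * the whole submission path with memalloc_nofs_save()/restore(), the standard
 * kernel idiom for making every allocation in between behave as GFP_NOFS and
 * so avoiding recursion into filesystem reclaim from the write path:
 */
static void nofs_section_sketch(void)
{
	unsigned nofs_flags = memalloc_nofs_save();

	/* allocations here implicitly drop __GFP_FS */

	memalloc_nofs_restore(nofs_flags);
}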
static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) bch2_migrate_read_done(&op->write, rbio); closure_init(cl, NULL); - closure_call(&op->write.op.cl, bch2_write, c->wq, cl); + closure_call(&op->write.op.cl, bch2_write, c->btree_update_wq, cl); closure_return_with_destructor(cl, promote_done); } static struct promote_op *__promote_alloc(struct bch_fs *c, enum btree_id btree_id, + struct bkey_s_c k, struct bpos pos, struct extent_ptr_decoded *pick, struct bch_io_opts opts, @@ -1390,10 +1545,10 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, opts, DATA_PROMOTE, (struct data_opts) { - .target = opts.promote_target + .target = opts.promote_target, + .nr_replicas = 1, }, - btree_id, - bkey_s_c_null); + btree_id, k); BUG_ON(ret); return op; @@ -1433,9 +1588,9 @@ static struct promote_op *promote_alloc(struct bch_fs *c, promote = __promote_alloc(c, k.k->type == KEY_TYPE_reflink_v - ? BTREE_ID_REFLINK - : BTREE_ID_EXTENTS, - pos, pick, opts, sectors, rbio); + ? BTREE_ID_reflink + : BTREE_ID_extents, + k, pos, pick, opts, sectors, rbio); if (!promote) return NULL; @@ -1514,133 +1669,63 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) } static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, + struct bvec_iter bvec_iter, struct bch_io_failures *failed, unsigned flags) { struct btree_trans trans; - struct btree_iter *iter; - struct bkey_on_stack sk; + struct btree_iter iter; + struct bkey_buf sk; struct bkey_s_c k; int ret; flags &= ~BCH_READ_LAST_FRAGMENT; flags |= BCH_READ_MUST_CLONE; - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - rbio->pos, BTREE_ITER_SLOTS); + bch2_trans_iter_init(&trans, &iter, rbio->data_btree, + rbio->read_pos, BTREE_ITER_SLOTS); retry: rbio->bio.bi_status = 0; - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(&iter); if (bkey_err(k)) goto err; - bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); bch2_trans_unlock(&trans); if (!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, - rbio->pos.offset - + rbio->data_pos.offset - rbio->pick.crc.offset)) { /* extent we wanted to read no longer exists: */ rbio->hole = true; goto out; } - ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags); + ret = __bch2_read_extent(&trans, rbio, bvec_iter, + rbio->read_pos, + rbio->data_btree, + k, 0, failed, flags); if (ret == READ_RETRY) goto retry; if (ret) goto err; out: bch2_rbio_done(rbio); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); return; err: rbio->bio.bi_status = BLK_STS_IOERR; goto out; } -static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, - struct bch_io_failures *failed, unsigned flags) -{ - struct btree_trans trans; - struct btree_iter *iter; - struct bkey_on_stack sk; - struct bkey_s_c k; - int ret; - - flags &= ~BCH_READ_LAST_FRAGMENT; - flags |= BCH_READ_MUST_CLONE; - - bkey_on_stack_init(&sk); - bch2_trans_init(&trans, c, 0, 0); -retry: - bch2_trans_begin(&trans); - - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, - POS(inode, bvec_iter.bi_sector), - BTREE_ITER_SLOTS, k, ret) { - unsigned bytes, sectors, offset_into_extent; - - bkey_on_stack_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - - offset_into_extent = iter->pos.offset - - 
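/*
 * Illustrative sketch, not part of the patch: on a BCH_READ_NODECODE retry,
 * bch2_read_retry_nodecode() above re-looks up the extent and checks that the
 * pointer originally picked is still present (bch2_bkey_matches_ptr()); if the
 * data has since been moved or dropped, the read completes as a hole instead
 * of retrying forever. Condensed check, using the same helpers:
 */
static bool extent_still_present_sketch(struct bch_fs *c, struct bkey_s_c k,
					struct extent_ptr_decoded *pick,
					u64 data_offset)
{
	return bch2_bkey_matches_ptr(c, k, pick->ptr, data_offset);
}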
bkey_start_offset(k.k); - sectors = k.k->size - offset_into_extent; - - ret = bch2_read_indirect_extent(&trans, - &offset_into_extent, sk.k); - if (ret) - break; - - sectors = min(sectors, k.k->size - offset_into_extent); - - bch2_trans_unlock(&trans); - - bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; - swap(bvec_iter.bi_size, bytes); - - ret = __bch2_read_extent(c, rbio, bvec_iter, k, - offset_into_extent, failed, flags); - switch (ret) { - case READ_RETRY: - goto retry; - case READ_ERR: - goto err; - }; - - if (bytes == bvec_iter.bi_size) - goto out; - - swap(bvec_iter.bi_size, bytes); - bio_advance_iter(&rbio->bio, &bvec_iter, bytes); - } - - if (ret == -EINTR) - goto retry; - /* - * If we get here, it better have been because there was an error - * reading a btree node - */ - BUG_ON(!ret); - __bcache_io_error(c, "btree IO error: %i", ret); -err: - rbio->bio.bi_status = BLK_STS_IOERR; -out: - bch2_trans_exit(&trans); - bkey_on_stack_exit(&sk, c); - bch2_rbio_done(rbio); -} - static void bch2_rbio_retry(struct work_struct *work) { struct bch_read_bio *rbio = @@ -1648,7 +1733,10 @@ static void bch2_rbio_retry(struct work_struct *work) struct bch_fs *c = rbio->c; struct bvec_iter iter = rbio->bvec_iter; unsigned flags = rbio->flags; - u64 inode = rbio->pos.inode; + subvol_inum inum = { + .subvol = rbio->subvol, + .inum = rbio->read_pos.inode, + }; struct bch_io_failures failed = { .nr = 0 }; trace_read_retry(&rbio->bio); @@ -1663,10 +1751,14 @@ static void bch2_rbio_retry(struct work_struct *work) flags |= BCH_READ_IN_RETRY; flags &= ~BCH_READ_MAY_PROMOTE; - if (flags & BCH_READ_NODECODE) - bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags); - else - bch2_read_retry(c, rbio, iter, inode, &failed, flags); + if (flags & BCH_READ_NODECODE) { + bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); + } else { + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; + + __bch2_read(c, rbio, iter, inum, &failed, flags); + } } static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, @@ -1688,34 +1780,26 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, } } -static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) +static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, + struct bch_read_bio *rbio) { struct bch_fs *c = rbio->c; - struct btree_trans trans; - struct btree_iter *iter; - struct bkey_s_c k; - struct bkey_on_stack new; + u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; struct bch_extent_crc_unpacked new_crc; - u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; - int ret; - - if (rbio->pick.crc.compression_type) - return; + struct btree_iter iter; + struct bkey_i *new; + struct bkey_s_c k; + int ret = 0; - bkey_on_stack_init(&new); - bch2_trans_init(&trans, c, 0, 0); -retry: - bch2_trans_begin(&trans); + if (crc_is_compressed(rbio->pick.crc)) + return 0; - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, rbio->pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - k = bch2_btree_iter_peek_slot(iter); - if (IS_ERR_OR_NULL(k.k)) + bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->data_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + if ((ret = bkey_err(k))) goto out; - bkey_on_stack_reassemble(&new, c, k); - k = bkey_i_to_s_c(new.k); - if (bversion_cmp(k.k->version, rbio->version) || !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) goto out; @@ -1730,22 +1814,34 @@ retry: bkey_start_offset(k.k) - data_offset, k.k->size, 
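/*
 * Illustrative worked example, not part of the patch, of what "narrowing" a
 * crc means here: suppose an extent's stored checksum covers 128 sectors but
 * the key now only references 32 of them (crc.offset = 64, crc.live_size =
 * 32). A read that has just verified the full 128-sector checksum can, via the
 * rechecksum call below, compute a fresh checksum over just those 32 live
 * sectors and rewrite the key with it, so later reads of this extent no longer
 * have to read and checksum the 96 dead sectors. Compressed extents are
 * skipped because their checksum covers the compressed payload, which has to
 * be read whole anyway.
 */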
rbio->pick.crc.csum_type)) { bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); + ret = 0; goto out; } - if (!bch2_bkey_narrow_crcs(new.k, new_crc)) + /* + * going to be temporarily appending another checksum entry: + */ + new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + + sizeof(struct bch_extent_crc128)); + if ((ret = PTR_ERR_OR_ZERO(new))) goto out; - bch2_trans_update(&trans, iter, new.k); - ret = bch2_trans_commit(&trans, NULL, NULL, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOWAIT); - if (ret == -EINTR) - goto retry; + bkey_reassemble(new, k); + + if (!bch2_bkey_narrow_crcs(new, new_crc)) + goto out; + + ret = bch2_trans_update(trans, &iter, new, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); out: - bch2_trans_exit(&trans); - bkey_on_stack_exit(&new, c); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) +{ + bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, + __bch2_rbio_narrow_crcs(&trans, rbio)); } /* Inner part that may run in process context */ @@ -1760,8 +1856,11 @@ static void __bch2_read_endio(struct work_struct *work) struct bvec_iter dst_iter = rbio->bvec_iter; struct bch_extent_crc_unpacked crc = rbio->pick.crc; struct nonce nonce = extent_nonce(rbio->version, crc); + unsigned nofs_flags; struct bch_csum csum; + nofs_flags = memalloc_nofs_save(); + /* Reset iterator for checksumming and copying bounced data: */ if (rbio->bounce) { src->bi_iter.bi_size = crc.compressed_size << 9; @@ -1775,6 +1874,13 @@ static void __bch2_read_endio(struct work_struct *work) if (bch2_crc_cmp(csum, rbio->pick.crc.csum)) goto csum_err; + /* + * XXX + * We need to rework the narrow_crcs path to deliver the read completion + * first, and then punt to a different workqueue, otherwise we're + * holding up reads while doing btree updates which is bad for memory + * reclaim. 
+ */ if (unlikely(rbio->narrow_crcs)) bch2_rbio_narrow_crcs(rbio); @@ -1785,7 +1891,7 @@ static void __bch2_read_endio(struct work_struct *work) crc.offset += rbio->offset_into_extent; crc.live_size = bvec_iter_sectors(rbio->bvec_iter); - if (crc.compression_type != BCH_COMPRESSION_NONE) { + if (crc_is_compressed(crc)) { bch2_encrypt_bio(c, crc.csum_type, nonce, src); if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) goto decompression_err; @@ -1819,6 +1925,8 @@ nodecode: rbio = bch2_rbio_free(rbio); bch2_rbio_done(rbio); } +out: + memalloc_nofs_restore(nofs_flags); return; csum_err: /* @@ -1829,22 +1937,20 @@ csum_err: if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { rbio->flags |= BCH_READ_MUST_BOUNCE; bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); - return; + goto out; } - bch2_dev_io_error(ca, - "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)", - rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, + bch2_dev_inum_io_error(ca, rbio->read_pos.inode, (u64) rbio->bvec_iter.bi_sector, + "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %u)", rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, csum.hi, csum.lo, crc.csum_type); bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); - return; + goto out; decompression_err: - __bcache_io_error(c, "decompression error, inode %llu offset %llu", - rbio->pos.inode, - (u64) rbio->bvec_iter.bi_sector); + bch_err_inum_ratelimited(c, rbio->read_pos.inode, + "decompression error"); bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); - return; + goto out; } static void bch2_read_endio(struct bio *bio) @@ -1864,7 +1970,11 @@ static void bch2_read_endio(struct bio *bio) if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; - if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) { + if (bch2_dev_inum_io_err_on(bio->bi_status, ca, + rbio->read_pos.inode, + rbio->read_pos.offset, + "data read error: %s", + bch2_blk_status_to_str(bio->bi_status))) { bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); return; } @@ -1882,7 +1992,7 @@ static void bch2_read_endio(struct bio *bio) } if (rbio->narrow_crcs || - rbio->pick.crc.compression_type || + crc_is_compressed(rbio->pick.crc) || bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; else if (rbio->pick.crc.csum_type) @@ -1893,62 +2003,64 @@ static void bch2_read_endio(struct bio *bio) int __bch2_read_indirect_extent(struct btree_trans *trans, unsigned *offset_into_extent, - struct bkey_i *orig_k) + struct bkey_buf *orig_k) { - struct btree_iter *iter; + struct btree_iter iter; struct bkey_s_c k; u64 reflink_offset; int ret; - reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) + + reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + *offset_into_extent; - iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK, - POS(0, reflink_offset), - BTREE_ITER_SLOTS); - ret = PTR_ERR_OR_ZERO(iter); - if (ret) - return ret; - - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink, + POS(0, reflink_offset), + BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; - if (k.k->type != KEY_TYPE_reflink_v) { - __bcache_io_error(trans->c, - "pointer to nonexistent indirect extent"); + if (k.k->type != KEY_TYPE_reflink_v && + k.k->type != KEY_TYPE_indirect_inline_data) { + bch_err_inum_ratelimited(trans->c, orig_k->k->k.p.inode, + "%llu len %u points to nonexistent indirect extent %llu", + 
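/*
 * Illustrative sketch, not part of the patch: a reflink_p key carries no data
 * pointers of its own; it points at a range of the reflink btree. The lookup
 * in __bch2_read_indirect_extent() above boils down to the position
 * computation below (hypothetical condensed helper) - e.g. a reflink_p with
 * idx 1000, read 16 sectors into the extent, resolves to offset 1016 in
 * BTREE_ID_reflink.
 */
static u64 reflink_read_pos_sketch(const struct bkey_i_reflink_p *p,
				   unsigned offset_into_extent)
{
	return le64_to_cpu(p->v.idx) + offset_into_extent;
}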
orig_k->k->k.p.offset, + orig_k->k->k.size, + reflink_offset); + bch2_inconsistent_error(trans->c); ret = -EIO; goto err; } - *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); - bkey_reassemble(orig_k, k); + *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); + bch2_bkey_buf_reassemble(orig_k, trans->c, k); err: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } -int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, - struct bvec_iter iter, struct bkey_s_c k, +int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + struct bvec_iter iter, struct bpos read_pos, + enum btree_id data_btree, struct bkey_s_c k, unsigned offset_into_extent, struct bch_io_failures *failed, unsigned flags) { + struct bch_fs *c = trans->c; struct extent_ptr_decoded pick; struct bch_read_bio *rbio = NULL; struct bch_dev *ca; struct promote_op *promote = NULL; bool bounce = false, read_full = false, narrow_crcs = false; - struct bpos pos = bkey_start_pos(k.k); + struct bpos data_pos = bkey_start_pos(k.k); int pick_ret; - if (k.k->type == KEY_TYPE_inline_data) { - struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); + if (bkey_extent_is_inline_data(k.k)) { unsigned bytes = min_t(unsigned, iter.bi_size, - bkey_val_bytes(d.k)); + bkey_inline_data_bytes(k.k)); swap(iter.bi_size, bytes); - memcpy_to_bio(&orig->bio, iter, d.v->data); + memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k)); swap(iter.bi_size, bytes); bio_advance_iter(&orig->bio, &iter, bytes); zero_fill_bio_iter(&orig->bio, iter); @@ -1962,7 +2074,8 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, goto hole; if (pick_ret < 0) { - __bcache_io_error(c, "no device to read from"); + bch_err_inum_ratelimited(c, k.k->p.inode, + "no device to read from"); goto err; } @@ -1978,7 +2091,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, goto hole; iter.bi_size = pick.crc.compressed_size << 9; - goto noclone; + goto get_bio; } if (!(flags & BCH_READ_LAST_FRAGMENT) || @@ -1993,7 +2106,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); - if (pick.crc.compression_type != BCH_COMPRESSION_NONE || + if (crc_is_compressed(pick.crc) || (pick.crc.csum_type != BCH_CSUM_NONE && (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || (bch2_csum_type_is_encryption(pick.crc.csum_type) && @@ -2008,14 +2121,14 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, &rbio, &bounce, &read_full); if (!read_full) { - EBUG_ON(pick.crc.compression_type); + EBUG_ON(crc_is_compressed(pick.crc)); EBUG_ON(pick.crc.csum_type && (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || bvec_iter_sectors(iter) != pick.crc.live_size || pick.crc.offset || offset_into_extent)); - pos.offset += offset_into_extent; + data_pos.offset += offset_into_extent; pick.ptr.offset += pick.crc.offset + offset_into_extent; offset_into_extent = 0; @@ -2025,7 +2138,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, pick.crc.live_size = bvec_iter_sectors(iter); offset_into_extent = 0; } - +get_bio: if (rbio) { /* * promote already allocated bounce rbio: @@ -2063,7 +2176,6 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, rbio->bio.bi_iter = iter; rbio->split = true; } else { -noclone: rbio = orig; rbio->bio.bi_iter = iter; EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); @@ -2088,7 +2200,10 @@ noclone: /* XXX: only 
initialize this if needed */ rbio->devs_have = bch2_bkey_devs(k); rbio->pick = pick; - rbio->pos = pos; + rbio->subvol = orig->subvol; + rbio->read_pos = read_pos; + rbio->data_btree = data_btree; + rbio->data_pos = data_pos; rbio->version = k.k->version; rbio->promote = promote; INIT_WORK(&rbio->work, NULL); @@ -2102,9 +2217,13 @@ noclone: bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - rcu_read_lock(); - bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); - rcu_read_unlock(); + /* + * If it's being moved internally, we don't want to flag it as a cache + * hit: + */ + if (pick.ptr.cached && !(flags & BCH_READ_NODECODE)) + bch2_bucket_io_time_reset(trans, pick.ptr.dev, + PTR_BUCKET_NR(ca, &pick.ptr), READ); if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { bio_inc_remaining(&orig->bio); @@ -2113,12 +2232,13 @@ noclone: if (!rbio->pick.idx) { if (!rbio->have_ioref) { - __bcache_io_error(c, "no device to read from"); + bch_err_inum_ratelimited(c, k.k->p.inode, + "no device to read from"); bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); goto out; } - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER], + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], bio_sectors(&rbio->bio)); bio_set_dev(&rbio->bio, ca->disk_sb.bdev); @@ -2153,6 +2273,9 @@ out: ret = READ_RETRY; } + if (!ret) + goto out_read_done; + return ret; } @@ -2179,54 +2302,65 @@ out_read_done: return 0; } -void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) +void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, subvol_inum inum, + struct bch_io_failures *failed, unsigned flags) { struct btree_trans trans; - struct btree_iter *iter; - struct bkey_on_stack sk; + struct btree_iter iter; + struct bkey_buf sk; struct bkey_s_c k; - unsigned flags = BCH_READ_RETRY_IF_STALE| - BCH_READ_MAY_PROMOTE| - BCH_READ_USER_MAPPED; + u32 snapshot; int ret; - BUG_ON(rbio->_state); BUG_ON(flags & BCH_READ_NODECODE); - BUG_ON(flags & BCH_READ_IN_RETRY); - rbio->c = c; - rbio->start_time = local_clock(); - - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); + iter = (struct btree_iter) { NULL }; - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - POS(inode, rbio->bio.bi_iter.bi_sector), - BTREE_ITER_SLOTS); + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, bvec_iter.bi_sector, snapshot), + BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS); while (1) { unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; - bch2_btree_iter_set_pos(iter, - POS(inode, rbio->bio.bi_iter.bi_sector)); + /* + * read_extent -> io_time_reset may cause a transaction restart + * without returning an error, we need to check for that here: + */ + if (!bch2_trans_relock(&trans)) { + ret = -EINTR; + break; + } + + bch2_btree_iter_set_pos(&iter, + POS(inum.inum, bvec_iter.bi_sector)); - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) - goto err; + break; - offset_into_extent = iter->pos.offset - + offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); sectors = k.k->size - offset_into_extent; - bkey_on_stack_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); + bch2_bkey_buf_reassemble(&sk, c, k); - ret = bch2_read_indirect_extent(&trans, - &offset_into_extent, sk.k); + ret = 
bch2_read_indirect_extent(&trans, &data_btree, + &offset_into_extent, &sk); if (ret) - goto err; + break; + + k = bkey_i_to_s_c(sk.k); /* * With indirect extents, the amount of data to read is the min @@ -2240,31 +2374,39 @@ retry: */ bch2_trans_unlock(&trans); - bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; - swap(rbio->bio.bi_iter.bi_size, bytes); + bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; + swap(bvec_iter.bi_size, bytes); - if (rbio->bio.bi_iter.bi_size == bytes) + if (bvec_iter.bi_size == bytes) flags |= BCH_READ_LAST_FRAGMENT; - bch2_read_extent(c, rbio, k, offset_into_extent, flags); + ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos, + data_btree, k, + offset_into_extent, failed, flags); + if (ret) + break; if (flags & BCH_READ_LAST_FRAGMENT) break; - swap(rbio->bio.bi_iter.bi_size, bytes); - bio_advance(&rbio->bio, bytes); + swap(bvec_iter.bi_size, bytes); + bio_advance_iter(&rbio->bio, &bvec_iter, bytes); } -out: - bch2_trans_exit(&trans); - bkey_on_stack_exit(&sk, c); - return; err: - if (ret == -EINTR) + bch2_trans_iter_exit(&trans, &iter); + + if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID) goto retry; - bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); - bch2_rbio_done(rbio); - goto out; + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + + if (ret) { + bch_err_inum_ratelimited(c, inum.inum, + "read error %i from btree lookup", ret); + rbio->bio.bi_status = BLK_STS_IOERR; + bch2_rbio_done(rbio); + } } void bch2_fs_io_exit(struct bch_fs *c)
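/*
 * Illustrative sketch, not part of the patch: the __bch2_read() loop above
 * issues the bio one extent-sized fragment at a time. Restricting a bvec_iter
 * to the current fragment uses the swap()-the-size trick: shrink bi_size to
 * the fragment, hand it off, swap the full size back and advance past what was
 * just issued. Condensed, with a hypothetical issue_fragment():
 */
static void read_fragment_sketch(struct bio *bio, struct bvec_iter *iter,
				 unsigned fragment_sectors)
{
	unsigned bytes = min(fragment_sectors, bvec_iter_sectors(*iter)) << 9;

	swap(iter->bi_size, bytes);		/* iter now covers just the fragment */
	issue_fragment(bio, *iter);		/* hypothetical */
	swap(iter->bi_size, bytes);		/* restore the full size... */
	bio_advance_iter(bio, iter, bytes);	/* ...and step past the fragment */
}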