X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fio.c;h=092ece2c8bdee20594ad57fc34148acf4b7a87f7;hb=07ec713e056a73337bc0f53f5910f5179537b2c2;hp=0c41e4111a43a0fb25ad9316b7c66c6ca6e15e40;hpb=ea83a3985d28372d56ec7cea6e73907551869f63;p=bcachefs-tools-debian diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 0c41e41..092ece2 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * Some low level IO code, and hacks for various block layer limitations * @@ -6,7 +7,9 @@ */ #include "bcachefs.h" -#include "alloc.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "bkey_buf.h" #include "bset.h" #include "btree_update.h" #include "buckets.h" @@ -14,27 +17,99 @@ #include "compress.h" #include "clock.h" #include "debug.h" +#include "disk_groups.h" +#include "ec.h" #include "error.h" -#include "extents.h" +#include "extent_update.h" +#include "inode.h" #include "io.h" #include "journal.h" #include "keylist.h" #include "move.h" +#include "rebalance.h" +#include "super.h" #include "super-io.h" #include #include +#include #include -/* Allocate, free from mempool: */ +const char *bch2_blk_status_to_str(blk_status_t status) +{ + if (status == BLK_STS_REMOVED) + return "device removed"; + return blk_status_to_str(status); +} + +static bool bch2_target_congested(struct bch_fs *c, u16 target) +{ + const struct bch_devs_mask *devs; + unsigned d, nr = 0, total = 0; + u64 now = local_clock(), last; + s64 congested; + struct bch_dev *ca; + + if (!target) + return false; + + rcu_read_lock(); + devs = bch2_target_to_mask(c, target) ?: + &c->rw_devs[BCH_DATA_user]; + + for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { + ca = rcu_dereference(c->devs[d]); + if (!ca) + continue; + + congested = atomic_read(&ca->congested); + last = READ_ONCE(ca->congested_last); + if (time_after64(now, last)) + congested -= (now - last) >> 12; + + total += max(congested, 0LL); + nr++; + } + rcu_read_unlock(); + + return bch2_rand_range(nr * CONGESTED_MAX) < total; +} -void bch2_latency_acct(struct bch_dev *ca, unsigned submit_time_us, int rw) +static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, + u64 now, int rw) { + u64 latency_capable = + ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; + /* ideally we'd be taking into account the device's variance here: */ + u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3); + s64 latency_over = io_latency - latency_threshold; + + if (latency_threshold && latency_over > 0) { + /* + * bump up congested by approximately latency_over * 4 / + * latency_threshold - we don't need much accuracy here so don't + * bother with the divide: + */ + if (atomic_read(&ca->congested) < CONGESTED_MAX) + atomic_add(latency_over >> + max_t(int, ilog2(latency_threshold) - 2, 0), + &ca->congested); + + ca->congested_last = now; + } else if (atomic_read(&ca->congested) > 0) { + atomic_dec(&ca->congested); + } +} + +void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) +{ + atomic64_t *latency = &ca->cur_latency[rw]; u64 now = local_clock(); - unsigned io_latency = (now >> 10) - submit_time_us; - atomic_t *latency = &ca->latency[rw]; - unsigned old, new, v = atomic_read(latency); + u64 io_latency = time_after64(now, submit_time) + ? 
now - submit_time + : 0; + u64 old, new, v = atomic64_read(latency); do { old = v; @@ -45,32 +120,37 @@ void bch2_latency_acct(struct bch_dev *ca, unsigned submit_time_us, int rw) * the time: */ if (abs((int) (old - io_latency)) < (old >> 1) && - now & ~(~0 << 5)) + now & ~(~0U << 5)) break; - new = ewma_add((u64) old, io_latency, 6); - } while ((v = atomic_cmpxchg(latency, old, new)) != old); + new = ewma_add(old, io_latency, 5); + } while ((v = atomic64_cmpxchg(latency, old, new)) != old); + + bch2_congested_acct(ca, io_latency, now, rw); + + __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); } +/* Allocate, free from mempool: */ + void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) { + struct bvec_iter_all iter; struct bio_vec *bv; - unsigned i; - bio_for_each_segment_all(bv, bio, i) + bio_for_each_segment_all(bv, bio, iter) if (bv->bv_page != ZERO_PAGE(0)) mempool_free(bv->bv_page, &c->bio_bounce_pages); bio->bi_vcnt = 0; } -static void bch2_bio_alloc_page_pool(struct bch_fs *c, struct bio *bio, - bool *using_mempool) +static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt++]; + struct page *page; if (likely(!*using_mempool)) { - bv->bv_page = alloc_page(GFP_NOIO); - if (unlikely(!bv->bv_page)) { + page = alloc_page(GFP_NOIO); + if (unlikely(!page)) { mutex_lock(&c->bio_bounce_pages_lock); *using_mempool = true; goto pool_alloc; @@ -78,55 +158,320 @@ static void bch2_bio_alloc_page_pool(struct bch_fs *c, struct bio *bio, } } else { pool_alloc: - bv->bv_page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO); + page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO); } - bv->bv_len = PAGE_SIZE; - bv->bv_offset = 0; + return page; } void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, - size_t bytes) + size_t size) { bool using_mempool = false; - BUG_ON(DIV_ROUND_UP(bytes, PAGE_SIZE) > bio->bi_max_vecs); + while (size) { + struct page *page = __bio_alloc_page_pool(c, &using_mempool); + unsigned len = min_t(size_t, PAGE_SIZE, size); - bio->bi_iter.bi_size = bytes; - - while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) - bch2_bio_alloc_page_pool(c, bio, &using_mempool); + BUG_ON(!bio_add_page(bio, page, len, 0)); + size -= len; + } if (using_mempool) mutex_unlock(&c->bio_bounce_pages_lock); } -void bch2_bio_alloc_more_pages_pool(struct bch_fs *c, struct bio *bio, - size_t bytes) +/* Extent update path: */ + +int bch2_sum_sector_overwrites(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i *new, + bool *maybe_extending, + bool *usage_increasing, + s64 *i_sectors_delta, + s64 *disk_sectors_delta) { - while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; + struct bch_fs *c = trans->c; + struct btree_iter *iter; + struct bkey_s_c old; + unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); + bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); + int ret = 0; - BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs); + *maybe_extending = true; + *usage_increasing = false; + *i_sectors_delta = 0; + *disk_sectors_delta = 0; - bv->bv_page = alloc_page(GFP_NOIO); - if (!bv->bv_page) { + iter = bch2_trans_copy_iter(trans, extent_iter); + + for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { + s64 sectors = min(new->k.p.offset, old.k->p.offset) - + max(bkey_start_offset(&new->k), + bkey_start_offset(old.k)); + + *i_sectors_delta += sectors * + 
(bkey_extent_is_allocation(&new->k) - + bkey_extent_is_allocation(old.k)); + + *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)); + *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot + ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old) + : 0; + + if (!*usage_increasing && + (new_replicas > bch2_bkey_replicas(c, old) || + (!new_compressed && bch2_bkey_sectors_compressed(old)))) + *usage_increasing = true; + + if (bkey_cmp(old.k->p, new->k.p) >= 0) { /* - * We already allocated from mempool, we can't allocate from it again - * without freeing the pages we already allocated or else we could - * deadlock: + * Check if there's already data above where we're + * going to be writing to - this means we're definitely + * not extending the file: + * + * Note that it's not sufficient to check if there's + * data up to the sector offset we're going to be + * writing to, because i_size could be up to one block + * less: */ - bch2_bio_free_pages_pool(c, bio); - bch2_bio_alloc_pages_pool(c, bio, bytes); - return; + if (!bkey_cmp(old.k->p, new->k.p)) + old = bch2_btree_iter_next(iter); + + if (old.k && !bkey_err(old) && + old.k->p.inode == extent_iter->pos.inode && + bkey_extent_is_data(old.k)) + *maybe_extending = false; + + break; + } + } + + bch2_trans_iter_put(trans, iter); + return ret; +} + +int bch2_extent_update(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *k, + struct disk_reservation *disk_res, + u64 *journal_seq, + u64 new_i_size, + s64 *i_sectors_delta_total, + bool check_enospc) +{ + /* this must live until after bch2_trans_commit(): */ + struct bkey_inode_buf inode_p; + bool extending = false, usage_increasing; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; + int ret; + + ret = bch2_extent_trim_atomic(k, iter); + if (ret) + return ret; + + ret = bch2_sum_sector_overwrites(trans, iter, k, + &extending, + &usage_increasing, + &i_sectors_delta, + &disk_sectors_delta); + if (ret) + return ret; + + if (!usage_increasing) + check_enospc = false; + + if (disk_res && + disk_sectors_delta > (s64) disk_res->sectors) { + ret = bch2_disk_reservation_add(trans->c, disk_res, + disk_sectors_delta - disk_res->sectors, + !check_enospc + ? BCH_DISK_RESERVATION_NOFAIL : 0); + if (ret) + return ret; + } + + new_i_size = extending + ? min(k->k.p.offset << 9, new_i_size) + : 0; + + if (i_sectors_delta || new_i_size) { + struct btree_iter *inode_iter; + struct bch_inode_unpacked inode_u; + + inode_iter = bch2_inode_peek(trans, &inode_u, + k->k.p.inode, BTREE_ITER_INTENT); + ret = PTR_ERR_OR_ZERO(inode_iter); + if (ret) + return ret; + + /* + * XXX: + * writeback can race a bit with truncate, because truncate + * first updates the inode then truncates the pagecache. This is + * ugly, but lets us preserve the invariant that the in memory + * i_size is always >= the on disk i_size. 
+ * + BUG_ON(new_i_size > inode_u.bi_size && + (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)); + */ + BUG_ON(new_i_size > inode_u.bi_size && !extending); + + if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + new_i_size > inode_u.bi_size) + inode_u.bi_size = new_i_size; + else + new_i_size = 0; + + inode_u.bi_sectors += i_sectors_delta; + + if (i_sectors_delta || new_i_size) { + bch2_inode_pack(trans->c, &inode_p, &inode_u); + + inode_p.inode.k.p.snapshot = iter->snapshot; + + ret = bch2_trans_update(trans, inode_iter, + &inode_p.inode.k_i, 0); + } + + bch2_trans_iter_put(trans, inode_iter); + + if (ret) + return ret; + } + + ret = bch2_trans_update(trans, iter, k, 0) ?: + bch2_trans_commit(trans, disk_res, journal_seq, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL); + BUG_ON(ret == -ENOSPC); + if (ret) + return ret; + + if (i_sectors_delta_total) + *i_sectors_delta_total += i_sectors_delta; + return 0; +} + +int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + struct bpos end, u64 *journal_seq, + s64 *i_sectors_delta) +{ + struct bch_fs *c = trans->c; + unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + struct bkey_s_c k; + int ret = 0, ret2 = 0; + + while ((k = bch2_btree_iter_peek(iter)).k && + bkey_cmp(iter->pos, end) < 0) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; + + bch2_trans_begin(trans); + + ret = bkey_err(k); + if (ret) + goto btree_err; + + bkey_init(&delete.k); + delete.k.p = iter->pos; + + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); + bch2_cut_back(end, &delete); + + ret = bch2_extent_update(trans, iter, &delete, + &disk_res, journal_seq, + 0, i_sectors_delta, false); + bch2_disk_reservation_put(c, &disk_res); +btree_err: + if (ret == -EINTR) { + ret2 = ret; + ret = 0; } + if (ret) + break; + } - bv->bv_len = PAGE_SIZE; - bv->bv_offset = 0; - bio->bi_vcnt++; + if (bkey_cmp(iter->pos, end) > 0) { + bch2_btree_iter_set_pos(iter, end); + ret = bch2_btree_iter_traverse(iter); } - bio->bi_iter.bi_size = bytes; + return ret ?: ret2; +} + +int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, + u64 *journal_seq, s64 *i_sectors_delta) +{ + struct btree_trans trans; + struct btree_iter *iter; + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, + POS(inum, start), + BTREE_ITER_INTENT); + + ret = bch2_fpunch_at(&trans, iter, POS(inum, end), + journal_seq, i_sectors_delta); + + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + + if (ret == -EINTR) + ret = 0; + + return ret; +} + +int bch2_write_index_default(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct bkey_buf sk; + struct keylist *keys = &op->insert_keys; + struct bkey_i *k = bch2_keylist_front(keys); + struct btree_trans trans; + struct btree_iter *iter; + int ret; + + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, + bkey_start_pos(&k->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + do { + bch2_trans_begin(&trans); + + k = bch2_keylist_front(keys); + + k->k.p.snapshot = iter->snapshot; + + bch2_bkey_buf_realloc(&sk, c, k->k.u64s); + bkey_copy(sk.k, k); + bch2_cut_front(iter->pos, sk.k); + + ret = bch2_extent_update(&trans, iter, sk.k, + &op->res, op_journal_seq(op), + op->new_i_size, &op->i_sectors_delta, + op->flags & BCH_WRITE_CHECK_ENOSPC); + if (ret == -EINTR) + continue; + if (ret) + break; + + if 
(bkey_cmp(iter->pos, k->k.p) >= 0) + bch2_keylist_pop_front(keys); + } while (!bch2_keylist_empty(keys)); + + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + + return ret; } /* Writes */ @@ -135,21 +480,20 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, enum bch_data_type type, const struct bkey_i *k) { - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); const struct bch_extent_ptr *ptr; struct bch_write_bio *n; struct bch_dev *ca; - unsigned ptr_idx = 0; BUG_ON(c->opts.nochanges); - extent_for_each_ptr(e, ptr) { + bkey_for_each_ptr(ptrs, ptr) { BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || !c->devs[ptr->dev]); - ca = c->devs[ptr->dev]; + ca = bch_dev_bkey_exists(c, ptr->dev); - if (ptr + 1 < &extent_entry_last(e)->ptr) { + if (to_entry(ptr + 1) < ptrs.end) { n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO, &ca->replica_set)); @@ -167,24 +511,20 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, } n->c = c; - n->ca = ca; - n->ptr_idx = ptr_idx++; - n->submit_time_us = local_clock_us(); + n->dev = ptr->dev; + n->have_ioref = bch2_dev_get_ioref(ca, + type == BCH_DATA_btree ? READ : WRITE); + n->submit_time = local_clock(); n->bio.bi_iter.bi_sector = ptr->offset; - if (!journal_flushes_device(ca)) - n->bio.bi_opf |= REQ_FUA; - - if (likely(percpu_ref_tryget(&ca->io_ref))) { + if (likely(n->have_ioref)) { this_cpu_add(ca->io_done->sectors[WRITE][type], bio_sectors(&n->bio)); - n->have_io_ref = true; - n->bio.bi_bdev = ca->disk_sb.bdev; + bio_set_dev(&n->bio, ca->disk_sb.bdev); submit_bio(&n->bio); } else { - n->have_io_ref = false; - bcache_io_error(c, &n->bio, "device has been removed"); + n->bio.bi_status = BLK_STS_REMOVED; bio_endio(&n->bio); } } @@ -195,78 +535,109 @@ static void __bch2_write(struct closure *); static void bch2_write_done(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - - BUG_ON(!(op->flags & BCH_WRITE_DONE)); + struct bch_fs *c = op->c; if (!op->error && (op->flags & BCH_WRITE_FLUSH)) - op->error = bch2_journal_error(&op->c->journal); + op->error = bch2_journal_error(&c->journal); - bch2_disk_reservation_put(op->c, &op->res); - percpu_ref_put(&op->c->writes); + bch2_disk_reservation_put(c, &op->res); + percpu_ref_put(&c->writes); bch2_keylist_free(&op->insert_keys, op->inline_keys); - closure_return(cl); -} - -static u64 keylist_sectors(struct keylist *keys) -{ - struct bkey_i *k; - u64 ret = 0; - for_each_keylist_key(keys, k) - ret += k->k.size; + bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); - return ret; + if (op->end_io) { + EBUG_ON(cl->parent); + closure_debug_destroy(cl); + op->end_io(op); + } else { + closure_return(cl); + } } -int bch2_write_index_default(struct bch_write_op *op) +/** + * bch_write_index - after a write, update index to point to new data + */ +static void __bch2_write_index(struct bch_write_op *op) { + struct bch_fs *c = op->c; struct keylist *keys = &op->insert_keys; - struct btree_iter iter; + struct bch_extent_ptr *ptr; + struct bkey_i *src, *dst = keys->keys, *n, *k; + unsigned dev; int ret; - bch2_btree_iter_init(&iter, op->c, BTREE_ID_EXTENTS, - bkey_start_pos(&bch2_keylist_front(keys)->k), - BTREE_ITER_INTENT); + for (src = keys->keys; src != keys->top; src = n) { + n = bkey_next(src); - ret = bch2_btree_insert_list_at(&iter, keys, &op->res, - NULL, op_journal_seq(op), - BTREE_INSERT_NOFAIL); - 
bch2_btree_iter_unlock(&iter); + if (bkey_extent_is_direct_data(&src->k)) { + bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, + test_bit(ptr->dev, op->failed.d)); - return ret; -} + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) { + ret = -EIO; + goto err; + } + } -/** - * bch_write_index - after a write, update index to point to new data - */ -static void bch2_write_index(struct closure *cl) -{ - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bch_fs *c = op->c; - struct keylist *keys = &op->insert_keys; + if (dst != src) + memmove_u64s_down(dst, src, src->u64s); + dst = bkey_next(dst); + } + + keys->top = dst; + + /* + * probably not the ideal place to hook this in, but I don't + * particularly want to plumb io_opts all the way through the btree + * update stack right now + */ + for_each_keylist_key(keys, k) { + bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); - op->flags |= BCH_WRITE_LOOPED; + if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k))) + bch2_check_set_feature(op->c, BCH_FEATURE_incompressible); + + } if (!bch2_keylist_empty(keys)) { u64 sectors_start = keylist_sectors(keys); int ret = op->index_update_fn(op); + BUG_ON(ret == -EINTR); BUG_ON(keylist_sectors(keys) && !ret); op->written += sectors_start - keylist_sectors(keys); if (ret) { - __bcache_io_error(c, "btree IO error %i", ret); + bch_err_inum_ratelimited(c, op->pos.inode, + "write error %i from btree update", ret); op->error = ret; } } +out: + /* If some a bucket wasn't written, we can't erasure code it: */ + for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) + bch2_open_bucket_write_error(c, &op->open_buckets, dev); + + bch2_open_buckets_put(c, &op->open_buckets); + return; +err: + keys->top = keys->keys; + op->error = ret; + goto out; +} - bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets); +static void bch2_write_index(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_fs *c = op->c; - if (!(op->flags & BCH_WRITE_DONE)) - continue_at(cl, __bch2_write, op->io_wq); + __bch2_write_index(op); - if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { + if (!(op->flags & BCH_WRITE_DONE)) { + continue_at(cl, __bch2_write, index_update_wq(op)); + } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { bch2_journal_flush_seq_async(&c->journal, *op_journal_seq(op), cl); @@ -276,43 +647,6 @@ static void bch2_write_index(struct closure *cl) } } -static void bch2_write_io_error(struct closure *cl) -{ - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct keylist *keys = &op->insert_keys; - struct bch_fs *c = op->c; - struct bch_extent_ptr *ptr; - struct bkey_i *k; - int ret; - - for_each_keylist_key(keys, k) { - struct bkey_i *n = bkey_next(k); - struct bkey_s_extent e = bkey_i_to_s_extent(k); - - extent_for_each_ptr_backwards(e, ptr) - if (test_bit(ptr->dev, op->failed.d)) - bch2_extent_drop_ptr(e, ptr); - - memmove(bkey_next(k), n, (void *) keys->top - (void *) n); - keys->top_p -= (u64 *) n - (u64 *) bkey_next(k); - - ret = bch2_extent_nr_ptrs(e.c) - ? 
bch2_check_mark_super(c, e.c, BCH_DATA_USER) - : -EIO; - if (ret) { - keys->top = keys->keys; - op->error = ret; - op->flags |= BCH_WRITE_DONE; - break; - } - } - - memset(&op->failed, 0, sizeof(op->failed)); - - bch2_write_index(cl); - return; -} - static void bch2_write_endio(struct bio *bio) { struct closure *cl = bio->bi_private; @@ -320,17 +654,19 @@ static void bch2_write_endio(struct bio *bio) struct bch_write_bio *wbio = to_wbio(bio); struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; struct bch_fs *c = wbio->c; - struct bch_dev *ca = wbio->ca; - - bch2_latency_acct(ca, wbio->submit_time_us, WRITE); + struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); - if (bch2_dev_io_err_on(bio->bi_error, ca, "data write")) { - set_bit(ca->dev_idx, op->failed.d); - set_closure_fn(cl, bch2_write_io_error, index_update_wq(op)); - } + if (bch2_dev_inum_io_err_on(bio->bi_status, ca, + op->pos.inode, + op->pos.offset - bio_sectors(bio), /* XXX definitely wrong */ + "data write error: %s", + bch2_blk_status_to_str(bio->bi_status))) + set_bit(wbio->dev, op->failed.d); - if (wbio->have_io_ref) + if (wbio->have_ioref) { + bch2_latency_acct(ca, wbio->submit_time, WRITE); percpu_ref_put(&ca->io_ref); + } if (wbio->bounce) bch2_bio_free_pages_pool(c, bio); @@ -340,8 +676,10 @@ static void bch2_write_endio(struct bio *bio) if (parent) bio_endio(&parent->bio); - else + else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT)) closure_put(cl); + else + continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op)); } static void init_append_extent(struct bch_write_op *op, @@ -349,68 +687,88 @@ static void init_append_extent(struct bch_write_op *op, struct bversion version, struct bch_extent_crc_unpacked crc) { - struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top); + struct bch_fs *c = op->c; + struct bkey_i_extent *e; + struct open_bucket *ob; + unsigned i; + BUG_ON(crc.compressed_size > wp->sectors_free); + wp->sectors_free -= crc.compressed_size; op->pos.offset += crc.uncompressed_size; - e->k.p = op->pos; - e->k.size = crc.uncompressed_size; - e->k.version = version; - bkey_extent_set_cached(&e->k, op->flags & BCH_WRITE_CACHED); - bch2_extent_crc_append(e, crc); - bch2_alloc_sectors_append_ptrs(op->c, wp, e, crc.compressed_size); + e = bkey_extent_init(op->insert_keys.top); + e->k.p = op->pos; + e->k.size = crc.uncompressed_size; + e->k.version = version; + + if (crc.csum_type || + crc.compression_type || + crc.nonce) + bch2_extent_crc_append(&e->k_i, crc); + + open_bucket_for_each(c, &wp->ptrs, ob, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); + union bch_extent_entry *end = + bkey_val_end(bkey_i_to_s(&e->k_i)); + + end->ptr = ob->ptr; + end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + end->ptr.cached = !ca->mi.durability || + (op->flags & BCH_WRITE_CACHED) != 0; + end->ptr.offset += ca->mi.bucket_size - ob->sectors_free; + + e->k.u64s++; + + BUG_ON(crc.compressed_size > ob->sectors_free); + ob->sectors_free -= crc.compressed_size; + } - bkey_extent_set_cached(&e->k, (op->flags & BCH_WRITE_CACHED)); bch2_keylist_push(&op->insert_keys); } static struct bio *bch2_write_bio_alloc(struct bch_fs *c, struct write_point *wp, struct bio *src, - bool *page_alloc_failed) + bool *page_alloc_failed, + void *buf) { struct bch_write_bio *wbio; struct bio *bio; unsigned output_available = min(wp->sectors_free << 9, src->bi_iter.bi_size); - unsigned pages = DIV_ROUND_UP(output_available, PAGE_SIZE); + unsigned pages = DIV_ROUND_UP(output_available + + (buf + ? 
((unsigned long) buf & (PAGE_SIZE - 1)) + : 0), PAGE_SIZE); bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write); wbio = wbio_init(bio); - wbio->bounce = true; wbio->put_bio = true; /* copy WRITE_SYNC flag */ wbio->bio.bi_opf = src->bi_opf; + if (buf) { + bch2_bio_map(bio, buf, output_available); + return bio; + } + + wbio->bounce = true; + /* * We can't use mempool for more than c->sb.encoded_extent_max * worth of pages, but we'd like to allocate more if we can: */ - while (bio->bi_iter.bi_size < output_available) { - unsigned len = min_t(unsigned, PAGE_SIZE, - output_available - bio->bi_iter.bi_size); - struct page *p; - - p = alloc_page(GFP_NOIO); - if (!p) { - unsigned pool_max = - min_t(unsigned, output_available, - c->sb.encoded_extent_max << 9); - - if (bio_sectors(bio) < pool_max) - bch2_bio_alloc_pages_pool(c, bio, pool_max); - break; - } + bch2_bio_alloc_pages_pool(c, bio, + min_t(unsigned, output_available, + c->sb.encoded_extent_max << 9)); - bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) { - .bv_page = p, - .bv_len = len, - .bv_offset = 0, - }; - bio->bi_iter.bi_size += len; - } + if (bio->bi_iter.bi_size < output_available) + *page_alloc_failed = + bch2_bio_alloc_pages(bio, + output_available - + bio->bi_iter.bi_size, + GFP_NOFS) != 0; - *page_alloc_failed = bio->bi_vcnt < pages; return bio; } @@ -484,8 +842,9 @@ static enum prep_encoded_ret { /* Can we just write the entire extent as is? */ if (op->crc.uncompressed_size == op->crc.live_size && op->crc.compressed_size <= wp->sectors_free && - op->crc.compression_type == op->compression_type) { - if (!op->crc.compression_type && + (op->crc.compression_type == op->compression_type || + op->incompressible)) { + if (!crc_is_compressed(op->crc) && op->csum_type != op->crc.csum_type && bch2_write_rechecksum(c, op, op->csum_type)) return PREP_ENCODED_CHECKSUM_ERR; @@ -497,7 +856,7 @@ static enum prep_encoded_ret { * If the data is compressed and we couldn't write the entire extent as * is, we have to decompress it: */ - if (op->crc.compression_type) { + if (crc_is_compressed(op->crc)) { struct bch_csum csum; if (bch2_write_decrypt(op)) @@ -540,20 +899,23 @@ static enum prep_encoded_ret { return PREP_ENCODED_OK; } -static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) +static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + struct bio **_dst) { struct bch_fs *c = op->c; struct bio *src = &op->wbio.bio, *dst = src; struct bvec_iter saved_iter; - struct bkey_i *key_to_write; - unsigned key_to_write_offset = op->insert_keys.top_p - - op->insert_keys.keys_p; - unsigned total_output = 0; - bool bounce = false, page_alloc_failed = false; + void *ec_buf; + struct bpos ec_pos = op->pos; + unsigned total_output = 0, total_input = 0; + bool bounce = false; + bool page_alloc_failed = false; int ret, more = 0; BUG_ON(!bio_sectors(src)); + ec_buf = bch2_writepoint_ec_buf(c, wp); + switch (bch2_write_prep_encoded_data(op, wp)) { case PREP_ENCODED_OK: break; @@ -561,18 +923,30 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) ret = -EIO; goto err; case PREP_ENCODED_CHECKSUM_ERR: + BUG(); goto csum_err; case PREP_ENCODED_DO_WRITE: + /* XXX look for bug here */ + if (ec_buf) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bio_copy_data(dst, src); + bounce = true; + } init_append_extent(op, wp, op->version, op->crc); goto do_write; } - if (op->compression_type || + if (ec_buf || + op->compression_type || (op->csum_type && !(op->flags & 
BCH_WRITE_PAGES_STABLE)) || (bch2_csum_type_is_encryption(op->csum_type) && !(op->flags & BCH_WRITE_PAGES_OWNED))) { - dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed); + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); bounce = true; } @@ -594,11 +968,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) bch2_csum_type_is_encryption(op->crc.csum_type)); BUG_ON(op->compression_type && !bounce); - crc.compression_type = op->compression_type - ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, - op->compression_type) + crc.compression_type = op->incompressible + ? BCH_COMPRESSION_TYPE_incompressible + : op->compression_type + ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, + op->compression_type) : 0; - if (!crc.compression_type) { + if (!crc_is_compressed(crc)) { dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); @@ -619,7 +995,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) if (bch2_csum_type_is_encryption(op->csum_type)) { if (bversion_zero(version)) { - version.lo = atomic64_inc_return(&c->key_version) + 1; + version.lo = atomic64_inc_return(&c->key_version); } else { crc.nonce = op->nonce; op->nonce += src_len >> 9; @@ -627,7 +1003,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) } if ((op->flags & BCH_WRITE_DATA_ENCODED) && - !crc.compression_type && + !crc_is_compressed(crc) && bch2_csum_type_is_encryption(op->crc.csum_type) == bch2_csum_type_is_encryption(op->csum_type)) { /* @@ -675,7 +1051,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) if (dst != src) bio_advance(dst, dst_len); bio_advance(src, src_len); - total_output += dst_len; + total_output += dst_len; + total_input += src_len; } while (dst->bi_iter.bi_size && src->bi_iter.bi_size && wp->sectors_free && @@ -688,47 +1065,32 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) dst->bi_iter = saved_iter; - if (!bounce && more) { - dst = bio_split(src, total_output >> 9, + if (dst == src && more) { + BUG_ON(total_output != total_input); + + dst = bio_split(src, total_input >> 9, GFP_NOIO, &c->bio_write); - wbio_init(dst)->put_bio = true; + wbio_init(dst)->put_bio = true; + /* copy WRITE_SYNC flag */ + dst->bi_opf = src->bi_opf; } dst->bi_iter.bi_size = total_output; - - /* Free unneeded pages after compressing: */ - if (bounce) - while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE)) - mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page, - &c->bio_bounce_pages); do_write: /* might have done a realloc... 
*/ + bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9); - key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); - - ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write), - BCH_DATA_USER); - if (ret) - goto err; - - dst->bi_end_io = bch2_write_endio; - dst->bi_private = &op->cl; - bio_set_op_attrs(dst, REQ_OP_WRITE, 0); - - closure_get(dst->bi_private); - - bch2_submit_wbio_replicas(to_wbio(dst), c, BCH_DATA_USER, - key_to_write); + *_dst = dst; return more; csum_err: bch_err(c, "error verifying existing checksum while " "rewriting existing data (memory corruption?)"); ret = -EIO; err: - if (bounce) { + if (to_wbio(dst)->bounce) bch2_bio_free_pages_pool(c, dst); + if (to_wbio(dst)->put_bio) bio_put(dst); - } return ret; } @@ -738,29 +1100,53 @@ static void __bch2_write(struct closure *cl) struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; struct write_point *wp; + struct bio *bio; + bool skip_put = true; + unsigned nofs_flags; int ret; + nofs_flags = memalloc_nofs_save(); +again: + memset(&op->failed, 0, sizeof(op->failed)); + do { - if (op->open_buckets_nr + op->nr_replicas > - ARRAY_SIZE(op->open_buckets)) - continue_at(cl, bch2_write_index, index_update_wq(op)); + struct bkey_i *key_to_write; + unsigned key_to_write_offset = op->insert_keys.top_p - + op->insert_keys.keys_p; + + /* +1 for possible cache device: */ + if (op->open_buckets.nr + op->nr_replicas + 1 > + ARRAY_SIZE(op->open_buckets.v)) + goto flush_io; - /* for the device pointers and 1 for the chksum */ if (bch2_keylist_realloc(&op->insert_keys, op->inline_keys, ARRAY_SIZE(op->inline_keys), BKEY_EXTENT_U64s_MAX)) - continue_at(cl, bch2_write_index, index_update_wq(op)); + goto flush_io; + + if ((op->flags & BCH_WRITE_FROM_INTERNAL) && + percpu_ref_is_dying(&c->writes)) { + ret = -EROFS; + goto err; + } + /* + * The copygc thread is now global, which means it's no longer + * freeing up space on specific disks, which means that + * allocations for specific disks may hang arbitrarily long: + */ wp = bch2_alloc_sectors_start(c, - op->devs, + op->target, + op->opts.erasure_code, op->write_point, &op->devs_have, op->nr_replicas, op->nr_replicas_required, op->alloc_reserve, op->flags, - (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl); + (op->flags & (BCH_WRITE_ALLOC_NOWAIT| + BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : cl); EBUG_ON(!wp); if (unlikely(IS_ERR(wp))) { @@ -769,70 +1155,134 @@ static void __bch2_write(struct closure *cl) goto err; } - /* - * If we already have some keys, must insert them first - * before allocating another open bucket. We only hit - * this case if open_bucket_nr > 1. 
- */ - if (!bch2_keylist_empty(&op->insert_keys)) - continue_at(cl, bch2_write_index, - index_update_wq(op)); + goto flush_io; + } - /* - * If we've looped, we're running out of a workqueue - - * not the bch2_write() caller's context - and we don't - * want to block the workqueue: - */ - if (op->flags & BCH_WRITE_LOOPED) - continue_at(cl, __bch2_write, op->io_wq); + /* + * It's possible for the allocator to fail, put us on the + * freelist waitlist, and then succeed in one of various retry + * paths: if that happens, we need to disable the skip_put + * optimization because otherwise there won't necessarily be a + * barrier before we free the bch_write_op: + */ + if (atomic_read(&cl->remaining) & CLOSURE_WAITING) + skip_put = false; + + bch2_open_bucket_get(c, wp, &op->open_buckets); + ret = bch2_write_extent(op, wp, &bio); + bch2_alloc_sectors_done(c, wp); + if (ret < 0) + goto err; + + if (ret) { + skip_put = false; + } else { /* - * Otherwise, we do want to block the caller on alloc - * failure instead of letting it queue up more and more - * writes: - * XXX: this technically needs a try_to_freeze() - - * except that that's not safe because caller may have - * issued other IO... hmm.. + * for the skip_put optimization this has to be set + * before we submit the bio: */ - closure_sync(cl); - continue; + op->flags |= BCH_WRITE_DONE; } - ret = bch2_write_extent(op, wp); + bio->bi_end_io = bch2_write_endio; + bio->bi_private = &op->cl; + bio->bi_opf |= REQ_OP_WRITE; - BUG_ON(op->open_buckets_nr + wp->nr_ptrs_can_use > - ARRAY_SIZE(op->open_buckets)); - bch2_open_bucket_get(c, wp, - &op->open_buckets_nr, - op->open_buckets); - bch2_alloc_sectors_done(c, wp); + if (!skip_put) + closure_get(bio->bi_private); + else + op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT; - if (ret < 0) - goto err; + key_to_write = (void *) (op->insert_keys.keys_p + + key_to_write_offset); + + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, + key_to_write); } while (ret); - op->flags |= BCH_WRITE_DONE; - continue_at(cl, bch2_write_index, index_update_wq(op)); + if (!skip_put) + continue_at(cl, bch2_write_index, index_update_wq(op)); +out: + memalloc_nofs_restore(nofs_flags); + return; err: - /* - * Right now we can only error here if we went RO - the - * allocation failed, but we already checked for -ENOSPC when we - * got our reservation. - * - * XXX capacity might have changed, but we don't check for that - * yet: - */ op->error = ret; op->flags |= BCH_WRITE_DONE; + continue_at(cl, bch2_write_index, index_update_wq(op)); + goto out; +flush_io: /* - * No reason not to insert keys for whatever data was successfully - * written (especially for a cmpxchg operation that's moving data - * around) + * If the write can't all be submitted at once, we generally want to + * block synchronously as that signals backpressure to the caller. + * + * However, if we're running out of a workqueue, we can't block here + * because we'll be blocking other work items from completing: */ - continue_at(cl, !bch2_keylist_empty(&op->insert_keys) - ? 
bch2_write_index - : bch2_write_done, index_update_wq(op)); + if (current->flags & PF_WQ_WORKER) { + continue_at(cl, bch2_write_index, index_update_wq(op)); + goto out; + } + + closure_sync(cl); + + if (!bch2_keylist_empty(&op->insert_keys)) { + __bch2_write_index(op); + + if (op->error) { + op->flags |= BCH_WRITE_DONE; + continue_at_nobarrier(cl, bch2_write_done, NULL); + goto out; + } + } + + goto again; +} + +static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) +{ + struct closure *cl = &op->cl; + struct bio *bio = &op->wbio.bio; + struct bvec_iter iter; + struct bkey_i_inline_data *id; + unsigned sectors; + int ret; + + bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); + + ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_U64s + DIV_ROUND_UP(data_len, 8)); + if (ret) { + op->error = ret; + goto err; + } + + sectors = bio_sectors(bio); + op->pos.offset += sectors; + + id = bkey_inline_data_init(op->insert_keys.top); + id->k.p = op->pos; + id->k.version = op->version; + id->k.size = sectors; + + iter = bio->bi_iter; + iter.bi_size = data_len; + memcpy_from_bio(id->v.data, bio, iter); + + while (data_len & 7) + id->v.data[data_len++] = '\0'; + set_bkey_val_bytes(&id->k, data_len); + bch2_keylist_push(&op->insert_keys); + + op->flags |= BCH_WRITE_WROTE_DATA_INLINE; + op->flags |= BCH_WRITE_DONE; + + continue_at_nobarrier(cl, bch2_write_index, NULL); + return; +err: + bch2_write_done(&op->cl); } /** @@ -854,48 +1304,124 @@ err: void bch2_write(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bio *bio = &op->wbio.bio; struct bch_fs *c = op->c; + unsigned data_len; BUG_ON(!op->nr_replicas); BUG_ON(!op->write_point.v); BUG_ON(!bkey_cmp(op->pos, POS_MAX)); - BUG_ON(bio_sectors(&op->wbio.bio) > U16_MAX); - - memset(&op->failed, 0, sizeof(op->failed)); + op->start_time = local_clock(); bch2_keylist_init(&op->insert_keys, op->inline_keys); - wbio_init(&op->wbio.bio)->put_bio = false; + wbio_init(bio)->put_bio = false; + + if (bio_sectors(bio) & (c->opts.block_size - 1)) { + bch_err_inum_ratelimited(c, op->pos.inode, + "misaligned write"); + op->error = -EIO; + goto err; + } if (c->opts.nochanges || !percpu_ref_tryget(&c->writes)) { - __bcache_io_error(c, "read only"); op->error = -EROFS; - bch2_disk_reservation_put(c, &op->res); - closure_return(cl); + goto err; } - bch2_increment_clock(c, bio_sectors(&op->wbio.bio), WRITE); + bch2_increment_clock(c, bio_sectors(bio), WRITE); + + data_len = min_t(u64, bio->bi_iter.bi_size, + op->new_i_size - (op->pos.offset << 9)); + + if (c->opts.inline_data && + data_len <= min(block_bytes(c) / 2, 1024U)) { + bch2_write_data_inline(op, data_len); + return; + } continue_at_nobarrier(cl, __bch2_write, NULL); + return; +err: + bch2_disk_reservation_put(c, &op->res); + + if (op->end_io) { + EBUG_ON(cl->parent); + closure_debug_destroy(cl); + op->end_io(op); + } else { + closure_return(cl); + } } /* Cache promotion on read */ struct promote_op { struct closure cl; + struct rcu_head rcu; + u64 start_time; + + struct rhash_head hash; + struct bpos pos; + struct migrate_write write; struct bio_vec bi_inline_vecs[0]; /* must be last */ }; +static const struct rhashtable_params bch_promote_params = { + .head_offset = offsetof(struct promote_op, hash), + .key_offset = offsetof(struct promote_op, pos), + .key_len = sizeof(struct bpos), +}; + +static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, + struct bpos pos, + struct 
bch_io_opts opts, + unsigned flags) +{ + if (!(flags & BCH_READ_MAY_PROMOTE)) + return false; + + if (!opts.promote_target) + return false; + + if (bch2_bkey_has_target(c, k, opts.promote_target)) + return false; + + if (bch2_target_congested(c, opts.promote_target)) { + /* XXX trace this */ + return false; + } + + if (rhashtable_lookup_fast(&c->promote_table, &pos, + bch_promote_params)) + return false; + + return true; +} + +static void promote_free(struct bch_fs *c, struct promote_op *op) +{ + int ret; + + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); + percpu_ref_put(&c->writes); + kfree_rcu(op, rcu); +} + static void promote_done(struct closure *cl) { struct promote_op *op = container_of(cl, struct promote_op, cl); struct bch_fs *c = op->write.op.c; - percpu_ref_put(&c->writes); + bch2_time_stats_update(&c->times[BCH_TIME_data_promote], + op->start_time); + bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio); - kfree(op); + promote_free(c, op); } static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) @@ -904,82 +1430,136 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) struct closure *cl = &op->cl; struct bio *bio = &op->write.op.wbio.bio; - BUG_ON(!rbio->split || !rbio->bounce); - - if (!percpu_ref_tryget(&c->writes)) - return; - trace_promote(&rbio->bio); /* we now own pages: */ + BUG_ON(!rbio->bounce); BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); - swap(bio->bi_vcnt, rbio->bio.bi_vcnt); - rbio->promote = NULL; - - __bch2_write_op_init(&op->write.op, c); - op->write.move_dev = -1; - op->write.op.devs = c->fastest_devs; - op->write.op.write_point = writepoint_hashed((unsigned long) current); - op->write.op.flags |= BCH_WRITE_ALLOC_NOWAIT; - op->write.op.flags |= BCH_WRITE_CACHED; + memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, + sizeof(struct bio_vec) * rbio->bio.bi_vcnt); + swap(bio->bi_vcnt, rbio->bio.bi_vcnt); - bch2_migrate_write_init(&op->write, rbio); + bch2_migrate_read_done(&op->write, rbio); closure_init(cl, NULL); - closure_call(&op->write.op.cl, bch2_write, c->wq, cl); + closure_call(&op->write.op.cl, bch2_write, c->btree_update_wq, cl); closure_return_with_destructor(cl, promote_done); } -/* - * XXX: multiple promotes can race with each other, wastefully. Keep a list of - * outstanding promotes? 
- */ -static struct promote_op *promote_alloc(struct bch_read_bio *rbio) +static struct promote_op *__promote_alloc(struct bch_fs *c, + enum btree_id btree_id, + struct bkey_s_c k, + struct bpos pos, + struct extent_ptr_decoded *pick, + struct bch_io_opts opts, + unsigned sectors, + struct bch_read_bio **rbio) { - struct promote_op *op; + struct promote_op *op = NULL; struct bio *bio; - /* data might have to be decompressed in the write path: */ - unsigned pages = DIV_ROUND_UP(rbio->pick.crc.uncompressed_size, - PAGE_SECTORS); + unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); + int ret; - BUG_ON(!rbio->bounce); - BUG_ON(pages < rbio->bio.bi_vcnt); + if (!percpu_ref_tryget(&c->writes)) + return NULL; - op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, - GFP_NOIO); + op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO); if (!op) - return NULL; + goto err; + + op->start_time = local_clock(); + op->pos = pos; + + /* + * We don't use the mempool here because extents that aren't + * checksummed or compressed can be too big for the mempool: + */ + *rbio = kzalloc(sizeof(struct bch_read_bio) + + sizeof(struct bio_vec) * pages, + GFP_NOIO); + if (!*rbio) + goto err; + + rbio_init(&(*rbio)->bio, opts); + bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, pages); + + if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, + GFP_NOIO)) + goto err; + + (*rbio)->bounce = true; + (*rbio)->split = true; + (*rbio)->kmalloc = true; + + if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, + bch_promote_params)) + goto err; bio = &op->write.op.wbio.bio; bio_init(bio, bio->bi_inline_vecs, pages); - memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, - sizeof(struct bio_vec) * rbio->bio.bi_vcnt); + ret = bch2_migrate_write_init(c, &op->write, + writepoint_hashed((unsigned long) current), + opts, + DATA_PROMOTE, + (struct data_opts) { + .target = opts.promote_target, + .nr_replicas = 1, + }, + btree_id, k); + BUG_ON(ret); return op; +err: + if (*rbio) + bio_free_pages(&(*rbio)->bio); + kfree(*rbio); + *rbio = NULL; + kfree(op); + percpu_ref_put(&c->writes); + return NULL; } -/* only promote if we're not reading from the fastest tier: */ -static bool should_promote(struct bch_fs *c, - struct extent_pick_ptr *pick, unsigned flags) +noinline +static struct promote_op *promote_alloc(struct bch_fs *c, + struct bvec_iter iter, + struct bkey_s_c k, + struct extent_ptr_decoded *pick, + struct bch_io_opts opts, + unsigned flags, + struct bch_read_bio **rbio, + bool *bounce, + bool *read_full) { - if (!(flags & BCH_READ_MAY_PROMOTE)) - return false; + bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); + /* data might have to be decompressed in the write path: */ + unsigned sectors = promote_full + ? max(pick->crc.compressed_size, pick->crc.live_size) + : bvec_iter_sectors(iter); + struct bpos pos = promote_full + ? bkey_start_pos(k.k) + : POS(k.k->p.inode, iter.bi_sector); + struct promote_op *promote; + + if (!should_promote(c, k, pos, opts, flags)) + return NULL; - if (percpu_ref_is_dying(&c->writes)) - return false; + promote = __promote_alloc(c, + k.k->type == KEY_TYPE_reflink_v + ? 
BTREE_ID_reflink + : BTREE_ID_extents, + k, pos, pick, opts, sectors, rbio); + if (!promote) + return NULL; - return c->fastest_tier && - c->fastest_tier < c->tiers + pick->ca->mi.tier; + *bounce = true; + *read_full = promote_full; + return promote; } /* Read */ -static void bch2_read_nodecode_retry(struct bch_fs *, struct bch_read_bio *, - struct bvec_iter, u64, - struct bch_devs_mask *, unsigned); - #define READ_RETRY_AVOID 1 #define READ_RETRY 2 #define READ_ERR 3 @@ -1012,68 +1592,133 @@ static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) { - struct bch_read_bio *parent = rbio->parent; - - BUG_ON(!rbio->split); + BUG_ON(rbio->bounce && !rbio->split); if (rbio->promote) - kfree(rbio->promote); + promote_free(rbio->c, rbio->promote); + rbio->promote = NULL; + if (rbio->bounce) bch2_bio_free_pages_pool(rbio->c, &rbio->bio); - bio_put(&rbio->bio); - return parent; + if (rbio->split) { + struct bch_read_bio *parent = rbio->parent; + + if (rbio->kmalloc) + kfree(rbio); + else + bio_put(&rbio->bio); + + rbio = parent; + } + + return rbio; } +/* + * Only called on a top level bch_read_bio to complete an entire read request, + * not a split: + */ static void bch2_rbio_done(struct bch_read_bio *rbio) { - if (rbio->promote) - kfree(rbio->promote); - rbio->promote = NULL; - - if (rbio->split) - rbio = bch2_rbio_free(rbio); + if (rbio->start_time) + bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], + rbio->start_time); bio_endio(&rbio->bio); } +static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, u64 inode, + struct bch_io_failures *failed, + unsigned flags) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_buf sk; + struct bkey_s_c k; + int ret; + + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; + + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_iter(&trans, rbio->data_btree, + rbio->read_pos, BTREE_ITER_SLOTS); +retry: + rbio->bio.bi_status = 0; + + k = bch2_btree_iter_peek_slot(iter); + if (bkey_err(k)) + goto err; + + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + bch2_trans_unlock(&trans); + + if (!bch2_bkey_matches_ptr(c, k, + rbio->pick.ptr, + rbio->data_pos.offset - + rbio->pick.crc.offset)) { + /* extent we wanted to read no longer exists: */ + rbio->hole = true; + goto out; + } + + ret = __bch2_read_extent(&trans, rbio, bvec_iter, + rbio->read_pos, + rbio->data_btree, + k, 0, failed, flags); + if (ret == READ_RETRY) + goto retry; + if (ret) + goto err; +out: + bch2_rbio_done(rbio); + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + return; +err: + rbio->bio.bi_status = BLK_STS_IOERR; + goto out; +} + static void bch2_rbio_retry(struct work_struct *work) { struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bvec_iter iter = rbio->bvec_iter; - unsigned flags = rbio->flags; - u64 inode = rbio->pos.inode; - struct bch_devs_mask avoid; + struct bch_fs *c = rbio->c; + struct bvec_iter iter = rbio->bvec_iter; + unsigned flags = rbio->flags; + u64 inode = rbio->read_pos.inode; + struct bch_io_failures failed = { .nr = 0 }; trace_read_retry(&rbio->bio); - memset(&avoid, 0, sizeof(avoid)); - if (rbio->retry == READ_RETRY_AVOID) - __set_bit(rbio->pick.ca->dev_idx, avoid.d); + bch2_mark_io_failure(&failed, &rbio->pick); - if 
(rbio->promote) - kfree(rbio->promote); - rbio->promote = NULL; + rbio->bio.bi_status = 0; - if (rbio->split) - rbio = bch2_rbio_free(rbio); - else - rbio->bio.bi_error = 0; + rbio = bch2_rbio_free(rbio); - if (!(flags & BCH_READ_NODECODE)) - flags |= BCH_READ_MUST_CLONE; flags |= BCH_READ_IN_RETRY; flags &= ~BCH_READ_MAY_PROMOTE; - if (flags & BCH_READ_NODECODE) - bch2_read_nodecode_retry(c, rbio, iter, inode, &avoid, flags); - else - __bch2_read(c, rbio, iter, inode, &avoid, flags); + if (flags & BCH_READ_NODECODE) { + bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags); + } else { + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; + + __bch2_read(c, rbio, iter, inode, &failed, flags); + } } -static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error) +static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, + blk_status_t error) { rbio->retry = retry; @@ -1081,7 +1726,9 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error) return; if (retry == READ_ERR) { - bch2_rbio_parent(rbio)->bio.bi_error = error; + rbio = bch2_rbio_free(rbio); + + rbio->bio.bi_status = error; bch2_rbio_done(rbio); } else { bch2_rbio_punt(rbio, bch2_rbio_retry, @@ -1089,76 +1736,67 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error) } } -static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) +static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, + struct bch_read_bio *rbio) { struct bch_fs *c = rbio->c; - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_i_extent *e; - BKEY_PADDED(k) new; + u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; struct bch_extent_crc_unpacked new_crc; - unsigned offset; - int ret; - - if (rbio->pick.crc.compression_type) - return; + struct btree_iter *iter = NULL; + struct bkey_i *new; + struct bkey_s_c k; + int ret = 0; - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, rbio->pos, - BTREE_ITER_INTENT); -retry: - k = bch2_btree_iter_peek(&iter); - if (IS_ERR_OR_NULL(k.k)) - goto out; + if (crc_is_compressed(rbio->pick.crc)) + return 0; - if (!bkey_extent_is_data(k.k)) + iter = bch2_trans_get_iter(trans, rbio->data_btree, rbio->data_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(iter); + if ((ret = bkey_err(k))) goto out; - bkey_reassemble(&new.k, k); - e = bkey_i_to_extent(&new.k); - - if (!bch2_extent_matches_ptr(c, extent_i_to_s_c(e), - rbio->pick.ptr, - rbio->pos.offset - - rbio->pick.crc.offset) || - bversion_cmp(e->k.version, rbio->version)) + if (bversion_cmp(k.k->version, rbio->version) || + !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) goto out; /* Extent was merged? 
*/ - if (bkey_start_offset(&e->k) < rbio->pos.offset || - e->k.p.offset > rbio->pos.offset + rbio->pick.crc.uncompressed_size) + if (bkey_start_offset(k.k) < data_offset || + k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) goto out; - /* The extent might have been partially overwritten since we read it: */ - offset = rbio->pick.crc.offset + (bkey_start_offset(&e->k) - rbio->pos.offset); - if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, - rbio->pick.crc, NULL, &new_crc, - offset, e->k.size, - rbio->pick.crc.csum_type)) { + rbio->pick.crc, NULL, &new_crc, + bkey_start_offset(k.k) - data_offset, k.k->size, + rbio->pick.crc.csum_type)) { bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); + ret = 0; goto out; } - if (!bch2_extent_narrow_crcs(e, new_crc)) + /* + * going to be temporarily appending another checksum entry: + */ + new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + + sizeof(struct bch_extent_crc128)); + if ((ret = PTR_ERR_OR_ZERO(new))) goto out; - ret = bch2_btree_insert_at(c, NULL, NULL, NULL, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_NOWAIT, - BTREE_INSERT_ENTRY(&iter, &e->k_i)); - if (ret == -EINTR) - goto retry; + bkey_reassemble(new, k); + + if (!bch2_bkey_narrow_crcs(new, new_crc)) + goto out; + + ret = bch2_trans_update(trans, iter, new, 0); out: - bch2_btree_iter_unlock(&iter); + bch2_trans_iter_put(trans, iter); + return ret; } -static bool should_narrow_crcs(struct bkey_s_c_extent e, - struct extent_pick_ptr *pick, - unsigned flags) +static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) { - return !(flags & BCH_READ_IN_RETRY) && - bch2_can_narrow_extent_crcs(e, pick->crc); + bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, + __bch2_rbio_narrow_crcs(&trans, rbio)); } /* Inner part that may run in process context */ @@ -1166,13 +1804,18 @@ static void __bch2_read_endio(struct work_struct *work) { struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bio *src = &rbio->bio, *dst = &bch2_rbio_parent(rbio)->bio; + struct bch_fs *c = rbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); + struct bio *src = &rbio->bio; + struct bio *dst = &bch2_rbio_parent(rbio)->bio; struct bvec_iter dst_iter = rbio->bvec_iter; struct bch_extent_crc_unpacked crc = rbio->pick.crc; struct nonce nonce = extent_nonce(rbio->version, crc); + unsigned nofs_flags; struct bch_csum csum; + nofs_flags = memalloc_nofs_save(); + /* Reset iterator for checksumming and copying bounced data: */ if (rbio->bounce) { src->bi_iter.bi_size = crc.compressed_size << 9; @@ -1186,6 +1829,13 @@ static void __bch2_read_endio(struct work_struct *work) if (bch2_crc_cmp(csum, rbio->pick.crc.csum)) goto csum_err; + /* + * XXX + * We need to rework the narrow_crcs path to deliver the read completion + * first, and then punt to a different workqueue, otherwise we're + * holding up reads while doing btree updates which is bad for memory + * reclaim. 
+ */ if (unlikely(rbio->narrow_crcs)) bch2_rbio_narrow_crcs(rbio); @@ -1193,10 +1843,10 @@ static void __bch2_read_endio(struct work_struct *work) goto nodecode; /* Adjust crc to point to subset of data we want: */ - crc.offset += rbio->bvec_iter.bi_sector - rbio->pos.offset; + crc.offset += rbio->offset_into_extent; crc.live_size = bvec_iter_sectors(rbio->bvec_iter); - if (crc.compression_type != BCH_COMPRESSION_NONE) { + if (crc_is_compressed(crc)) { bch2_encrypt_bio(c, crc.csum_type, nonce, src); if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) goto decompression_err; @@ -1223,10 +1873,15 @@ static void __bch2_read_endio(struct work_struct *work) */ bch2_encrypt_bio(c, crc.csum_type, nonce, src); promote_start(rbio->promote, rbio); + rbio->promote = NULL; } nodecode: - if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) + if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { + rbio = bch2_rbio_free(rbio); bch2_rbio_done(rbio); + } +out: + memalloc_nofs_restore(nofs_flags); return; csum_err: /* @@ -1236,59 +1891,63 @@ csum_err: */ if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { rbio->flags |= BCH_READ_MUST_BOUNCE; - bch2_rbio_error(rbio, READ_RETRY, -EIO); - return; + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); + goto out; } - bch2_dev_io_error(rbio->pick.ca, - "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)", - rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, + bch2_dev_inum_io_error(ca, rbio->read_pos.inode, (u64) rbio->bvec_iter.bi_sector, + "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %u)", rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, csum.hi, csum.lo, crc.csum_type); - bch2_rbio_error(rbio, READ_RETRY_AVOID, -EIO); - return; + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; decompression_err: - __bcache_io_error(c, "decompression error, inode %llu offset %llu", - rbio->pos.inode, - (u64) rbio->bvec_iter.bi_sector); - bch2_rbio_error(rbio, READ_ERR, -EIO); - return; + bch_err_inum_ratelimited(c, rbio->read_pos.inode, + "decompression error"); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + goto out; } static void bch2_read_endio(struct bio *bio) { struct bch_read_bio *rbio = container_of(bio, struct bch_read_bio, bio); - struct bch_fs *c = rbio->c; + struct bch_fs *c = rbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); struct workqueue_struct *wq = NULL; enum rbio_context context = RBIO_CONTEXT_NULL; - bch2_latency_acct(rbio->pick.ca, rbio->submit_time_us, READ); - - percpu_ref_put(&rbio->pick.ca->io_ref); + if (rbio->have_ioref) { + bch2_latency_acct(ca, rbio->submit_time, READ); + percpu_ref_put(&ca->io_ref); + } if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; - if (bch2_dev_io_err_on(bio->bi_error, rbio->pick.ca, "data read")) { - bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_error); + if (bch2_dev_inum_io_err_on(bio->bi_status, ca, + rbio->read_pos.inode, + rbio->read_pos.offset, + "data read error: %s", + bch2_blk_status_to_str(bio->bi_status))) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); return; } if (rbio->pick.ptr.cached && (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || - ptr_stale(rbio->pick.ca, &rbio->pick.ptr))) { + ptr_stale(ca, &rbio->pick.ptr))) { atomic_long_inc(&c->read_realloc_races); if (rbio->flags & BCH_READ_RETRY_IF_STALE) - bch2_rbio_error(rbio, READ_RETRY, -EINTR); + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); else - bch2_rbio_error(rbio, READ_ERR, -EINTR); + bch2_rbio_error(rbio, 
READ_ERR, BLK_STS_AGAIN); return; } if (rbio->narrow_crcs || - rbio->pick.crc.compression_type || + crc_is_compressed(rbio->pick.crc) || bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; else if (rbio->pick.crc.csum_type) @@ -1297,73 +1956,166 @@ static void bch2_read_endio(struct bio *bio) bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); } -int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, - struct bvec_iter iter, struct bkey_s_c_extent e, - struct extent_pick_ptr *pick, unsigned flags) +int __bch2_read_indirect_extent(struct btree_trans *trans, + unsigned *offset_into_extent, + struct bkey_buf *orig_k) { - struct bch_read_bio *rbio; - bool split = false, bounce = false, read_full = false; - bool promote = false, narrow_crcs = false; - struct bpos pos = bkey_start_pos(e.k); - int ret = 0; + struct btree_iter *iter; + struct bkey_s_c k; + u64 reflink_offset; + int ret; - PTR_BUCKET(pick->ca, &pick->ptr)->prio[READ] = c->prio_clock[READ].hand; + reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + + *offset_into_extent; - narrow_crcs = should_narrow_crcs(e, pick, flags); + iter = bch2_trans_get_iter(trans, BTREE_ID_reflink, + POS(0, reflink_offset), + BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_reflink_v && + k.k->type != KEY_TYPE_indirect_inline_data) { + bch_err_inum_ratelimited(trans->c, orig_k->k->k.p.inode, + "%llu len %u points to nonexistent indirect extent %llu", + orig_k->k->k.p.offset, + orig_k->k->k.size, + reflink_offset); + bch2_inconsistent_error(trans->c); + ret = -EIO; + goto err; + } + + *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); + bch2_bkey_buf_reassemble(orig_k, trans->c, k); +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + +int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + struct bvec_iter iter, struct bpos read_pos, + enum btree_id data_btree, struct bkey_s_c k, + unsigned offset_into_extent, + struct bch_io_failures *failed, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct extent_ptr_decoded pick; + struct bch_read_bio *rbio = NULL; + struct bch_dev *ca; + struct promote_op *promote = NULL; + bool bounce = false, read_full = false, narrow_crcs = false; + struct bpos data_pos = bkey_start_pos(k.k); + int pick_ret; + + if (bkey_extent_is_inline_data(k.k)) { + unsigned bytes = min_t(unsigned, iter.bi_size, + bkey_inline_data_bytes(k.k)); + + swap(iter.bi_size, bytes); + memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k)); + swap(iter.bi_size, bytes); + bio_advance_iter(&orig->bio, &iter, bytes); + zero_fill_bio_iter(&orig->bio, iter); + goto out_read_done; + } + + pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); + + /* hole or reservation - just zero fill: */ + if (!pick_ret) + goto hole; + + if (pick_ret < 0) { + bch_err_inum_ratelimited(c, k.k->p.inode, + "no device to read from"); + goto err; + } + + if (pick_ret > 0) + ca = bch_dev_bkey_exists(c, pick.ptr.dev); if (flags & BCH_READ_NODECODE) { - BUG_ON(iter.bi_size < pick->crc.compressed_size << 9); - iter.bi_size = pick->crc.compressed_size << 9; - goto noclone; + /* + * can happen if we retry, and the extent we were going to read + * has been merged in the meantime: + */ + if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) + goto hole; + + iter.bi_size = pick.crc.compressed_size << 9; + goto get_bio; } + if (!(flags & 
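/*
 * Editor's note: a standalone sketch of the offset arithmetic used when
 * following a reflink pointer above.  A reflink_p key says "my data lives
 * at index idx in the reflink btree"; the caller's offset within the
 * original extent is added to that index to find the position in the
 * reflink btree, and once the indirect extent is found the offset is
 * re-expressed relative to that extent's start.  The struct names below
 * are illustrative, not the real bkey layouts.
 */
#include <stdint.h>

struct toy_reflink_p	{ uint64_t idx; };
struct toy_extent	{ uint64_t start, len; };	/* sectors */

static uint64_t resolve_indirect(struct toy_reflink_p p,
				 uint64_t offset_into_extent,
				 struct toy_extent indirect,
				 uint64_t *new_offset_into_extent)
{
	uint64_t reflink_pos = p.idx + offset_into_extent;

	/* the caller looks up reflink_pos in the reflink btree, finding @indirect */
	*new_offset_into_extent = reflink_pos - indirect.start;
	return reflink_pos;
}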
BCH_READ_LAST_FRAGMENT) || + bio_flagged(&orig->bio, BIO_CHAIN)) + flags |= BCH_READ_MUST_CLONE; + + narrow_crcs = !(flags & BCH_READ_IN_RETRY) && + bch2_can_narrow_extent_crcs(k, pick.crc); + if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) flags |= BCH_READ_MUST_BOUNCE; - EBUG_ON(bkey_start_offset(e.k) > iter.bi_sector || - e.k->p.offset < bvec_iter_end_sector(iter)); + EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); - if (pick->crc.compression_type != BCH_COMPRESSION_NONE || - (pick->crc.csum_type != BCH_CSUM_NONE && - (bvec_iter_sectors(iter) != pick->crc.uncompressed_size || - (bch2_csum_type_is_encryption(pick->crc.csum_type) && + if (crc_is_compressed(pick.crc) || + (pick.crc.csum_type != BCH_CSUM_NONE && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + (bch2_csum_type_is_encryption(pick.crc.csum_type) && (flags & BCH_READ_USER_MAPPED)) || (flags & BCH_READ_MUST_BOUNCE)))) { read_full = true; bounce = true; } - promote = should_promote(c, pick, flags); - /* could also set read_full */ - if (promote) - bounce = true; + if (orig->opts.promote_target) + promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, + &rbio, &bounce, &read_full); if (!read_full) { - EBUG_ON(pick->crc.compression_type); - EBUG_ON(pick->crc.csum_type && - (bvec_iter_sectors(iter) != pick->crc.uncompressed_size || - bvec_iter_sectors(iter) != pick->crc.live_size || - pick->crc.offset || - iter.bi_sector != pos.offset)); - - pick->ptr.offset += pick->crc.offset + - (iter.bi_sector - pos.offset); - pick->crc.compressed_size = bvec_iter_sectors(iter); - pick->crc.uncompressed_size = bvec_iter_sectors(iter); - pick->crc.offset = 0; - pick->crc.live_size = bvec_iter_sectors(iter); - pos.offset = iter.bi_sector; + EBUG_ON(crc_is_compressed(pick.crc)); + EBUG_ON(pick.crc.csum_type && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + bvec_iter_sectors(iter) != pick.crc.live_size || + pick.crc.offset || + offset_into_extent)); + + data_pos.offset += offset_into_extent; + pick.ptr.offset += pick.crc.offset + + offset_into_extent; + offset_into_extent = 0; + pick.crc.compressed_size = bvec_iter_sectors(iter); + pick.crc.uncompressed_size = bvec_iter_sectors(iter); + pick.crc.offset = 0; + pick.crc.live_size = bvec_iter_sectors(iter); + offset_into_extent = 0; } - - if (bounce) { - unsigned sectors = pick->crc.compressed_size; +get_bio: + if (rbio) { + /* + * promote already allocated bounce rbio: + * promote needs to allocate a bio big enough for uncompressing + * data in the write path, but we're not going to use it all + * here: + */ + EBUG_ON(rbio->bio.bi_iter.bi_size < + pick.crc.compressed_size << 9); + rbio->bio.bi_iter.bi_size = + pick.crc.compressed_size << 9; + } else if (bounce) { + unsigned sectors = pick.crc.compressed_size; rbio = rbio_init(bio_alloc_bioset(GFP_NOIO, - DIV_ROUND_UP(sectors, PAGE_SECTORS), - &c->bio_read_split)); + DIV_ROUND_UP(sectors, PAGE_SECTORS), + &c->bio_read_split), + orig->opts); bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); - split = true; + rbio->bounce = true; + rbio->split = true; } else if (flags & BCH_READ_MUST_CLONE) { /* * Have to clone if there were any splits, due to error @@ -1374,215 +2126,251 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, * lose the error) */ rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO, - &c->bio_read_split)); + &c->bio_read_split), + orig->opts); rbio->bio.bi_iter = iter; - split = true; + rbio->split = true; } else { -noclone: rbio = orig; rbio->bio.bi_iter = 
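/*
 * Editor's note: a standalone sketch of the "trim the read" step above.
 * When the extent is neither compressed nor forced into a full-extent read
 * by its checksum, the device pointer is advanced so only the requested
 * sectors are read, and the crc bookkeeping is collapsed to describe
 * exactly that range.  Field names loosely mirror
 * bch_extent_crc_unpacked, but this is not the real structure.
 */
#include <stdint.h>

struct toy_crc {
	unsigned compressed_size;	/* sectors on disk */
	unsigned uncompressed_size;	/* sectors after decompression */
	unsigned offset;		/* where live data starts within the above */
	unsigned live_size;		/* sectors still referenced by the key */
};

static void trim_partial_read(struct toy_crc *crc, uint64_t *ptr_offset,
			      unsigned offset_into_extent, unsigned want_sectors)
{
	*ptr_offset		+= crc->offset + offset_into_extent;
	crc->compressed_size	 = want_sectors;
	crc->uncompressed_size	 = want_sectors;
	crc->offset		 = 0;
	crc->live_size		 = want_sectors;
}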
iter; - split = false; - BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); + EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); } - BUG_ON(bio_sectors(&rbio->bio) != pick->crc.compressed_size); + EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); rbio->c = c; - if (split) + rbio->submit_time = local_clock(); + if (rbio->split) rbio->parent = orig; else rbio->end_io = orig->bio.bi_end_io; rbio->bvec_iter = iter; - rbio->submit_time_us = local_clock_us(); + rbio->offset_into_extent= offset_into_extent; rbio->flags = flags; - rbio->bounce = bounce; - rbio->split = split; + rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); rbio->narrow_crcs = narrow_crcs; + rbio->hole = 0; rbio->retry = 0; rbio->context = 0; - rbio->devs_have = bch2_extent_devs(e); - rbio->pick = *pick; - rbio->pos = pos; - rbio->version = e.k->version; - rbio->promote = promote ? promote_alloc(rbio) : NULL; + /* XXX: only initialize this if needed */ + rbio->devs_have = bch2_bkey_devs(k); + rbio->pick = pick; + rbio->read_pos = read_pos; + rbio->data_btree = data_btree; + rbio->data_pos = data_pos; + rbio->version = k.k->version; + rbio->promote = promote; INIT_WORK(&rbio->work, NULL); - rbio->bio.bi_bdev = pick->ca->disk_sb.bdev; rbio->bio.bi_opf = orig->bio.bi_opf; - rbio->bio.bi_iter.bi_sector = pick->ptr.offset; + rbio->bio.bi_iter.bi_sector = pick.ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; - if (bounce) + if (rbio->bounce) trace_read_bounce(&rbio->bio); bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - this_cpu_add(pick->ca->io_done->sectors[READ][BCH_DATA_USER], - bio_sectors(&rbio->bio)); - if (likely(!(flags & BCH_READ_IN_RETRY))) { - submit_bio(&rbio->bio); - } else { - submit_bio_wait(&rbio->bio); - - rbio->context = RBIO_CONTEXT_UNBOUND; - bch2_read_endio(&rbio->bio); + /* + * If it's being moved internally, we don't want to flag it as a cache + * hit: + */ + if (pick.ptr.cached && !(flags & BCH_READ_NODECODE)) + bch2_bucket_io_time_reset(trans, pick.ptr.dev, + PTR_BUCKET_NR(ca, &pick.ptr), READ); - ret = rbio->retry; - if (!ret) - bch2_rbio_done(rbio); + if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { + bio_inc_remaining(&orig->bio); + trace_read_split(&orig->bio); } - return ret; -} + if (!rbio->pick.idx) { + if (!rbio->have_ioref) { + bch_err_inum_ratelimited(c, k.k->p.inode, + "no device to read from"); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; + } -static void bch2_read_nodecode_retry(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, - struct bch_devs_mask *avoid, unsigned flags) -{ - struct extent_pick_ptr pick; - struct btree_iter iter; - BKEY_PADDED(k) tmp; - struct bkey_s_c k; - int ret; + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], + bio_sectors(&rbio->bio)); + bio_set_dev(&rbio->bio, ca->disk_sb.bdev); - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, - POS(inode, bvec_iter.bi_sector), - BTREE_ITER_WITH_HOLES); -retry: - k = bch2_btree_iter_peek_with_holes(&iter); - if (btree_iter_err(k)) { - bch2_btree_iter_unlock(&iter); - goto err; - } + if (likely(!(flags & BCH_READ_IN_RETRY))) + submit_bio(&rbio->bio); + else + submit_bio_wait(&rbio->bio); + } else { + /* Attempting reconstruct read: */ + if (bch2_ec_read_extent(c, rbio)) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; + } - bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); - bch2_btree_iter_unlock(&iter); + if (likely(!(flags & BCH_READ_IN_RETRY))) + bio_endio(&rbio->bio); + } +out: + if 
(likely(!(flags & BCH_READ_IN_RETRY))) { + return 0; + } else { + int ret; - if (!bkey_extent_is_data(k.k) || - !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k), - rbio->pick.ptr, - rbio->pos.offset - - rbio->pick.crc.offset) || - bkey_start_offset(k.k) != bvec_iter.bi_sector) - goto err; + rbio->context = RBIO_CONTEXT_UNBOUND; + bch2_read_endio(&rbio->bio); - bch2_extent_pick_ptr(c, k, avoid, &pick); - if (IS_ERR(pick.ca)) { - bcache_io_error(c, &rbio->bio, "no device to read from"); - bio_endio(&rbio->bio); - return; - } + ret = rbio->retry; + rbio = bch2_rbio_free(rbio); - if (!pick.ca) - goto err; + if (ret == READ_RETRY_AVOID) { + bch2_mark_io_failure(failed, &pick); + ret = READ_RETRY; + } - if (pick.crc.compressed_size > bvec_iter_sectors(bvec_iter)) { - percpu_ref_put(&pick.ca->io_ref); - goto err; + if (!ret) + goto out_read_done; + return ret; } - ret = __bch2_read_extent(c, rbio, bvec_iter, bkey_s_c_to_extent(k), - &pick, flags); - switch (ret) { - case READ_RETRY_AVOID: - __set_bit(pick.ca->dev_idx, avoid->d); - case READ_RETRY: - goto retry; - case READ_ERR: - bio_endio(&rbio->bio); - return; - }; - - return; err: + if (flags & BCH_READ_IN_RETRY) + return READ_ERR; + + orig->bio.bi_status = BLK_STS_IOERR; + goto out_read_done; + +hole: /* - * extent we wanted to read no longer exists, or - * was merged or partially overwritten (and thus - * possibly bigger than the memory that was - * originally allocated) + * won't normally happen in the BCH_READ_NODECODE + * (bch2_move_extent()) path, but if we retry and the extent we wanted + * to read no longer exists we have to signal that: */ - rbio->bio.bi_error = -EINTR; - bio_endio(&rbio->bio); - return; + if (flags & BCH_READ_NODECODE) + orig->hole = true; + + zero_fill_bio_iter(&orig->bio, iter); +out_read_done: + if (flags & BCH_READ_LAST_FRAGMENT) + bch2_rbio_done(orig); + return 0; } void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, u64 inode, - struct bch_devs_mask *avoid, unsigned flags) + struct bch_io_failures *failed, unsigned flags) { - struct btree_iter iter; + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_buf sk; struct bkey_s_c k; int ret; - EBUG_ON(flags & BCH_READ_NODECODE); + BUG_ON(flags & BCH_READ_NODECODE); + + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); retry: - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, - POS(inode, bvec_iter.bi_sector), - BTREE_ITER_WITH_HOLES, k) { - BKEY_PADDED(k) tmp; - struct extent_pick_ptr pick; - struct bvec_iter fragment; + bch2_trans_begin(&trans); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_extents, + POS(inode, bvec_iter.bi_sector), + BTREE_ITER_SLOTS); + while (1) { + unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; + + bch2_btree_iter_set_pos(iter, + POS(inode, bvec_iter.bi_sector)); + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + break; + + offset_into_extent = iter->pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + bch2_bkey_buf_reassemble(&sk, c, k); + + ret = bch2_read_indirect_extent(&trans, &data_btree, + &offset_into_extent, &sk); + if (ret) + break; + + k = bkey_i_to_s_c(sk.k); + + /* + * With indirect extents, the amount of data to read is the min + * of the original extent and the indirect extent: + */ + sectors = min(sectors, k.k->size - offset_into_extent); /* * Unlock the iterator while the btree node's lock is still in * cache, before doing the IO: */ - 
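/*
 * Editor's note: a standalone sketch of the READ_RETRY_AVOID behaviour
 * above.  When a replica fails, its device index is recorded in a failure
 * set and the retry re-runs the extent lookup, skipping devices already in
 * the set - loosely what bch2_mark_io_failure() and
 * bch2_bkey_pick_read_device() cooperate to do.  The types are
 * illustrative only and assume fewer than 64 devices.
 */
struct toy_failures { unsigned long devs; };	/* bitmap of failed devices */

static void toy_mark_failure(struct toy_failures *f, unsigned dev)
{
	f->devs |= 1UL << dev;
}

/* returns the index of a usable replica, or -1 if every copy has failed */
static int toy_pick_replica(const unsigned *replica_devs, unsigned nr_replicas,
			    const struct toy_failures *f)
{
	for (unsigned i = 0; i < nr_replicas; i++)
		if (!(f->devs & (1UL << replica_devs[i])))
			return (int) i;
	return -1;
}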
bkey_reassemble(&tmp.k, k); - k = bkey_i_to_s_c(&tmp.k); - bch2_btree_iter_unlock(&iter); + bch2_trans_unlock(&trans); - bch2_extent_pick_ptr(c, k, avoid, &pick); - if (IS_ERR(pick.ca)) { - bcache_io_error(c, &rbio->bio, "no device to read from"); - bio_endio(&rbio->bio); - return; - } + bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; + swap(bvec_iter.bi_size, bytes); - fragment = bvec_iter; - fragment.bi_size = (min_t(u64, k.k->p.offset, - bvec_iter_end_sector(bvec_iter)) - - bvec_iter.bi_sector) << 9; + if (bvec_iter.bi_size == bytes) + flags |= BCH_READ_LAST_FRAGMENT; - if (pick.ca) { - if (fragment.bi_size != bvec_iter.bi_size) { - bio_inc_remaining(&rbio->bio); - flags |= BCH_READ_MUST_CLONE; - trace_read_split(&rbio->bio); - } + ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter->pos, + data_btree, k, + offset_into_extent, failed, flags); + if (ret) + break; - ret = __bch2_read_extent(c, rbio, fragment, - bkey_s_c_to_extent(k), - &pick, flags); - switch (ret) { - case READ_RETRY_AVOID: - __set_bit(pick.ca->dev_idx, avoid->d); - case READ_RETRY: - goto retry; - case READ_ERR: - bio_endio(&rbio->bio); - return; - }; - } else { - zero_fill_bio_iter(&rbio->bio, fragment); + if (flags & BCH_READ_LAST_FRAGMENT) + break; - if (fragment.bi_size == bvec_iter.bi_size) - bio_endio(&rbio->bio); - } + swap(bvec_iter.bi_size, bytes); + bio_advance_iter(&rbio->bio, &bvec_iter, bytes); + } + bch2_trans_iter_put(&trans, iter); - if (fragment.bi_size == bvec_iter.bi_size) - return; + if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID) + goto retry; - bio_advance_iter(&rbio->bio, &bvec_iter, fragment.bi_size); + if (ret) { + bch_err_inum_ratelimited(c, inode, + "read error %i from btree lookup", ret); + rbio->bio.bi_status = BLK_STS_IOERR; + bch2_rbio_done(rbio); } + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); +} - /* - * If we get here, it better have been because there was an error - * reading a btree node - */ - ret = bch2_btree_iter_unlock(&iter); - BUG_ON(!ret); - bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); - bio_endio(&rbio->bio); +void bch2_fs_io_exit(struct bch_fs *c) +{ + if (c->promote_table.tbl) + rhashtable_destroy(&c->promote_table); + mempool_exit(&c->bio_bounce_pages); + bioset_exit(&c->bio_write); + bioset_exit(&c->bio_read_split); + bioset_exit(&c->bio_read); +} + +int bch2_fs_io_init(struct bch_fs *c) +{ + if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS) || + bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS) || + bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), + BIOSET_NEED_BVECS) || + mempool_init_page_pool(&c->bio_bounce_pages, + max_t(unsigned, + c->opts.btree_node_size, + c->sb.encoded_extent_max) / + PAGE_SECTORS, 0) || + rhashtable_init(&c->promote_table, &bch_promote_params)) + return -ENOMEM; + + return 0; }
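/*
 * Editor's note: a standalone sketch of how __bch2_read() above carves one
 * incoming read into per-extent fragments.  Each iteration clamps the
 * fragment to whatever remains of both the request and the current extent,
 * flags the final fragment so completion fires exactly once, and advances
 * the cursor - analogous to the bvec_iter / BCH_READ_LAST_FRAGMENT /
 * bio_inc_remaining() handling in the real code.  submit_fragment() is a
 * hypothetical callback, and the extent map is assumed sorted and
 * contiguous over the requested range.
 */
#include <stdbool.h>
#include <stdint.h>

struct toy_extent_map { uint64_t start, len; };	/* sectors */

typedef void (*submit_fragment_fn)(uint64_t sector, uint64_t sectors,
				   bool last_fragment);

static void split_read(uint64_t sector, uint64_t sectors,
		       const struct toy_extent_map *map, unsigned nr_extents,
		       submit_fragment_fn submit_fragment)
{
	for (unsigned i = 0; i < nr_extents && sectors; i++) {
		uint64_t end = map[i].start + map[i].len;
		uint64_t frag;

		if (sector < map[i].start || sector >= end)
			continue;

		frag = end - sector;
		if (frag > sectors)
			frag = sectors;

		/* last fragment iff it consumes everything still outstanding */
		submit_fragment(sector, frag, frag == sectors);

		sector	+= frag;
		sectors	-= frag;
	}
}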