X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fio.c;h=c0371e23a4bc0f1cd9abc445a572b4e98a1cca94;hb=693e278c14efae893a6f051446fbbc0cf5e408cd;hp=19059702428ab909911d3b293c89e3d1112b2cd9;hpb=096f2ec00ed6165894aa3218fa5a7ab7a6c3c9e1;p=bcachefs-tools-debian diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 1905970..c0371e2 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -7,14 +7,16 @@ */ #include "bcachefs.h" +#include "alloc_background.h" #include "alloc_foreground.h" -#include "bkey_on_stack.h" +#include "bkey_buf.h" #include "bset.h" #include "btree_update.h" #include "buckets.h" #include "checksum.h" #include "compress.h" #include "clock.h" +#include "data_update.h" #include "debug.h" #include "disk_groups.h" #include "ec.h" @@ -25,15 +27,28 @@ #include "journal.h" #include "keylist.h" #include "move.h" +#include "nocow_locking.h" #include "rebalance.h" +#include "subvolume.h" #include "super.h" #include "super-io.h" #include +#include #include +#include #include +const char *bch2_blk_status_to_str(blk_status_t status) +{ + if (status == BLK_STS_REMOVED) + return "device removed"; + return blk_status_to_str(status); +} + +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT + static bool bch2_target_congested(struct bch_fs *c, u16 target) { const struct bch_devs_mask *devs; @@ -46,7 +61,9 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target) return false; rcu_read_lock(); - devs = bch2_target_to_mask(c, target); + devs = bch2_target_to_mask(c, target) ?: + &c->rw_devs[BCH_DATA_user]; + for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { ca = rcu_dereference(c->devs[d]); if (!ca) @@ -109,7 +126,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) * the time: */ if (abs((int) (old - io_latency)) < (old >> 1) && - now & ~(~0 << 5)) + now & ~(~0U << 5)) break; new = ewma_add(old, io_latency, 5); @@ -120,16 +137,25 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); } +#else + +static bool bch2_target_congested(struct bch_fs *c, u16 target) +{ + return false; +} + +#endif + /* Allocate, free from mempool: */ void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) { struct bvec_iter_all iter; - struct bio_vec *bv; + struct bio_vec bv; bio_for_each_segment_all(bv, bio, iter) - if (bv->bv_page != ZERO_PAGE(0)) - mempool_free(bv->bv_page, &c->bio_bounce_pages); + if (bv.bv_page != ZERO_PAGE(0)) + mempool_free(bv.bv_page, &c->bio_bounce_pages); bio->bi_vcnt = 0; } @@ -160,7 +186,7 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, while (size) { struct page *page = __bio_alloc_page_pool(c, &using_mempool); - unsigned len = min(PAGE_SIZE, size); + unsigned len = min_t(size_t, PAGE_SIZE, size); BUG_ON(!bio_add_page(bio, page, len, 0)); size -= len; @@ -172,255 +198,444 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, /* Extent update path: */ -static int sum_sector_overwrites(struct btree_trans *trans, - struct btree_iter *extent_iter, - struct bkey_i *new, - bool may_allocate, - bool *maybe_extending, - s64 *delta) +int bch2_sum_sector_overwrites(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i *new, + bool *usage_increasing, + s64 *i_sectors_delta, + s64 *disk_sectors_delta) { - struct btree_iter *iter; + struct bch_fs *c = trans->c; + struct btree_iter iter; struct bkey_s_c old; + unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); + bool new_compressed = 
bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); int ret = 0; - *maybe_extending = true; - *delta = 0; + *usage_increasing = false; + *i_sectors_delta = 0; + *disk_sectors_delta = 0; - iter = bch2_trans_copy_iter(trans, extent_iter); - if (IS_ERR(iter)) - return PTR_ERR(iter); + bch2_trans_copy_iter(&iter, extent_iter); - for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { - if (!may_allocate && - bch2_bkey_nr_ptrs_fully_allocated(old) < - bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) { - ret = -ENOSPC; - break; - } + for_each_btree_key_upto_continue_norestart(iter, + new->k.p, BTREE_ITER_SLOTS, old, ret) { + s64 sectors = min(new->k.p.offset, old.k->p.offset) - + max(bkey_start_offset(&new->k), + bkey_start_offset(old.k)); - *delta += (min(new->k.p.offset, - old.k->p.offset) - - max(bkey_start_offset(&new->k), - bkey_start_offset(old.k))) * + *i_sectors_delta += sectors * (bkey_extent_is_allocation(&new->k) - bkey_extent_is_allocation(old.k)); - if (bkey_cmp(old.k->p, new->k.p) >= 0) { - /* - * Check if there's already data above where we're - * going to be writing to - this means we're definitely - * not extending the file: - * - * Note that it's not sufficient to check if there's - * data up to the sector offset we're going to be - * writing to, because i_size could be up to one block - * less: - */ - if (!bkey_cmp(old.k->p, new->k.p)) - old = bch2_btree_iter_next(iter); + *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)); + *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot + ? sectors * bch2_bkey_nr_ptrs_fully_allocated(old) + : 0; - if (old.k && !bkey_err(old) && - old.k->p.inode == extent_iter->pos.inode && - bkey_extent_is_data(old.k)) - *maybe_extending = false; + if (!*usage_increasing && + (new->k.p.snapshot != old.k->p.snapshot || + new_replicas > bch2_bkey_replicas(c, old) || + (!new_compressed && bch2_bkey_sectors_compressed(old)))) + *usage_increasing = true; + if (bkey_ge(old.k->p, new->k.p)) break; - } } - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, + struct btree_iter *extent_iter, + u64 new_i_size, + s64 i_sectors_delta) +{ + struct btree_iter iter; + struct bkey_i *k; + struct bkey_i_inode_v3 *inode; + unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, + SPOS(0, + extent_iter->pos.inode, + extent_iter->snapshot), + BTREE_ITER_INTENT|BTREE_ITER_CACHED); + k = bch2_bkey_get_mut(trans, &iter); + ret = PTR_ERR_OR_ZERO(k); + if (unlikely(ret)) + goto err; + + if (unlikely(k->k.type != KEY_TYPE_inode_v3)) { + k = bch2_inode_to_v3(trans, k); + ret = PTR_ERR_OR_ZERO(k); + if (unlikely(ret)) + goto err; + } + + inode = bkey_i_to_inode_v3(k); + + if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) && + new_i_size > le64_to_cpu(inode->v.bi_size)) { + inode->v.bi_size = cpu_to_le64(new_i_size); + inode_update_flags = 0; + } + + if (i_sectors_delta) { + le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta); + inode_update_flags = 0; + } + + if (inode->k.p.snapshot != iter.snapshot) { + inode->k.p.snapshot = iter.snapshot; + inode_update_flags = 0; + } + + ret = bch2_trans_update(trans, &iter, &inode->k_i, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + inode_update_flags); +err: + bch2_trans_iter_exit(trans, &iter); return ret; } int bch2_extent_update(struct btree_trans *trans, + subvol_inum inum, struct btree_iter *iter, struct 
bkey_i *k, struct disk_reservation *disk_res, - u64 *journal_seq, u64 new_i_size, - s64 *i_sectors_delta) + s64 *i_sectors_delta_total, + bool check_enospc) { - /* this must live until after bch2_trans_commit(): */ - struct bkey_inode_buf inode_p; - bool extending = false; - s64 delta = 0; + struct bpos next_pos; + bool usage_increasing; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; int ret; - ret = bch2_extent_trim_atomic(k, iter); + /* + * This traverses us the iterator without changing iter->path->pos to + * search_key() (which is pos + 1 for extents): we want there to be a + * path already traversed at iter->pos because + * bch2_trans_extent_update() will use it to attempt extent merging + */ + ret = __bch2_btree_iter_traverse(iter); if (ret) return ret; - ret = sum_sector_overwrites(trans, iter, k, - disk_res && disk_res->sectors != 0, - &extending, &delta); + ret = bch2_extent_trim_atomic(trans, iter, k); if (ret) return ret; - new_i_size = extending - ? min(k->k.p.offset << 9, new_i_size) - : 0; + next_pos = k->k.p; + + ret = bch2_sum_sector_overwrites(trans, iter, k, + &usage_increasing, + &i_sectors_delta, + &disk_sectors_delta); + if (ret) + return ret; - if (delta || new_i_size) { - struct btree_iter *inode_iter; - struct bch_inode_unpacked inode_u; + if (disk_res && + disk_sectors_delta > (s64) disk_res->sectors) { + ret = bch2_disk_reservation_add(trans->c, disk_res, + disk_sectors_delta - disk_res->sectors, + !check_enospc || !usage_increasing + ? BCH_DISK_RESERVATION_NOFAIL : 0); + if (ret) + return ret; + } + + /* + * Note: + * We always have to do an inode update - even when i_size/i_sectors + * aren't changing - for fsync to work properly; fsync relies on + * inode->bi_journal_seq which is updated by the trigger code: + */ + ret = bch2_extent_update_i_size_sectors(trans, iter, + min(k->k.p.offset << 9, new_i_size), + i_sectors_delta) ?: + bch2_trans_update(trans, iter, k, 0) ?: + bch2_trans_commit(trans, disk_res, NULL, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL); + if (unlikely(ret)) + return ret; + + if (i_sectors_delta_total) + *i_sectors_delta_total += i_sectors_delta; + bch2_btree_iter_set_pos(iter, next_pos); + return 0; +} + +/* Overwrites whatever was present with zeroes: */ +int bch2_extent_fallocate(struct btree_trans *trans, + subvol_inum inum, + struct btree_iter *iter, + unsigned sectors, + struct bch_io_opts opts, + s64 *i_sectors_delta, + struct write_point_specifier write_point) +{ + struct bch_fs *c = trans->c; + struct disk_reservation disk_res = { 0 }; + struct closure cl; + struct open_buckets open_buckets; + struct bkey_s_c k; + struct bkey_buf old, new; + unsigned sectors_allocated; + bool have_reservation = false; + bool unwritten = opts.nocow && + c->sb.version >= bcachefs_metadata_version_unwritten_extents; + int ret; - inode_iter = bch2_inode_peek(trans, &inode_u, - k->k.p.inode, BTREE_ITER_INTENT); - if (IS_ERR(inode_iter)) - return PTR_ERR(inode_iter); + bch2_bkey_buf_init(&old); + bch2_bkey_buf_init(&new); + closure_init_stack(&cl); + open_buckets.nr = 0; +retry: + sectors_allocated = 0; + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + return ret; + + sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset); + + if (!have_reservation) { + unsigned new_replicas = + max(0, (int) opts.data_replicas - + (int) bch2_bkey_nr_ptrs_fully_allocated(k)); /* - * XXX: - * writeback can race a bit with truncate, because truncate - * first updates the inode then truncates the pagecache. 
This is - * ugly, but lets us preserve the invariant that the in memory - * i_size is always >= the on disk i_size. - * - BUG_ON(new_i_size > inode_u.bi_size && - (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)); + * Get a disk reservation before (in the nocow case) calling + * into the allocator: */ - BUG_ON(new_i_size > inode_u.bi_size && !extending); + ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); + if (unlikely(ret)) + goto out; - if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - new_i_size > inode_u.bi_size) - inode_u.bi_size = new_i_size; - else - new_i_size = 0; + bch2_bkey_buf_reassemble(&old, c, k); + } + + if (have_reservation) { + if (!bch2_extents_match(k, bkey_i_to_s_c(old.k))) + goto out; - inode_u.bi_sectors += delta; + bch2_key_resize(&new.k->k, sectors); + } else if (!unwritten) { + struct bkey_i_reservation *reservation; - if (delta || new_i_size) { - bch2_inode_pack(&inode_p, &inode_u); - bch2_trans_update(trans, inode_iter, - &inode_p.inode.k_i, 0); + bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64)); + reservation = bkey_reservation_init(new.k); + reservation->k.p = iter->pos; + bch2_key_resize(&reservation->k, sectors); + reservation->v.nr_replicas = opts.data_replicas; + } else { + struct bkey_i_extent *e; + struct bch_devs_list devs_have; + struct write_point *wp; + struct bch_extent_ptr *ptr; + + devs_have.nr = 0; + + bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX); + + e = bkey_extent_init(new.k); + e->k.p = iter->pos; + + ret = bch2_alloc_sectors_start_trans(trans, + opts.foreground_target, + false, + write_point, + &devs_have, + opts.data_replicas, + opts.data_replicas, + RESERVE_none, 0, &cl, &wp); + if (ret) { + bch2_trans_unlock(trans); + closure_sync(&cl); + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) + goto retry; + return ret; } - bch2_trans_iter_put(trans, inode_iter); + sectors = min(sectors, wp->sectors_free); + sectors_allocated = sectors; + + bch2_key_resize(&e->k, sectors); + + bch2_open_bucket_get(c, wp, &open_buckets); + bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); + bch2_alloc_sectors_done(c, wp); + + extent_for_each_ptr(extent_i_to_s(e), ptr) + ptr->unwritten = true; } - bch2_trans_update(trans, iter, k, 0); + have_reservation = true; - ret = bch2_trans_commit(trans, disk_res, journal_seq, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE); - if (!ret && i_sectors_delta) - *i_sectors_delta += delta; + ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, + 0, i_sectors_delta, true); +out: + if ((atomic_read(&cl.remaining) & CLOSURE_REMAINING_MASK) != 1) { + bch2_trans_unlock(trans); + closure_sync(&cl); + } + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + bch2_trans_begin(trans); + goto retry; + } + + if (!ret && sectors_allocated) + bch2_increment_clock(c, sectors_allocated, WRITE); + + bch2_open_buckets_put(c, &open_buckets); + bch2_disk_reservation_put(c, &disk_res); + bch2_bkey_buf_exit(&new, c); + bch2_bkey_buf_exit(&old, c); return ret; } +/* + * Returns -BCH_ERR_transacton_restart if we had to drop locks: + */ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, - struct bpos end, u64 *journal_seq, + subvol_inum inum, u64 end, s64 *i_sectors_delta) { struct bch_fs *c = trans->c; unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + struct bpos end_pos = POS(inum.inum, end); struct bkey_s_c k; int ret = 0, ret2 = 0; + u32 snapshot; - while ((k = bch2_btree_iter_peek(iter)).k && - 
bkey_cmp(iter->pos, end) < 0) { + while (!ret || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); struct bkey_i delete; + if (ret) + ret2 = ret; + bch2_trans_begin(trans); + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(iter, snapshot); + + /* + * peek_upto() doesn't have ideal semantics for extents: + */ + k = bch2_btree_iter_peek_upto(iter, end_pos); + if (!k.k) + break; + ret = bkey_err(k); if (ret) - goto btree_err; + continue; bkey_init(&delete.k); delete.k.p = iter->pos; /* create the biggest key we can */ bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end, &delete); + bch2_cut_back(end_pos, &delete); - ret = bch2_extent_update(trans, iter, &delete, - &disk_res, journal_seq, - 0, i_sectors_delta); + ret = bch2_extent_update(trans, inum, iter, &delete, + &disk_res, 0, i_sectors_delta, false); bch2_disk_reservation_put(c, &disk_res); -btree_err: - if (ret == -EINTR) { - ret2 = ret; - ret = 0; - } - if (ret) - break; - } - - if (bkey_cmp(iter->pos, end) > 0) { - bch2_btree_iter_set_pos(iter, end); - ret = bch2_btree_iter_traverse(iter); } return ret ?: ret2; } -int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, - u64 *journal_seq, s64 *i_sectors_delta) +int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, + s64 *i_sectors_delta) { struct btree_trans trans; - struct btree_iter *iter; - int ret = 0; + struct btree_iter iter; + int ret; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - POS(inum, start), - BTREE_ITER_INTENT); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + POS(inum.inum, start), + BTREE_ITER_INTENT); - ret = bch2_fpunch_at(&trans, iter, POS(inum, end), - journal_seq, i_sectors_delta); + ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta); + + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ret = 0; return ret; } -int bch2_write_index_default(struct bch_write_op *op) +static int bch2_write_index_default(struct bch_write_op *op) { struct bch_fs *c = op->c; - struct bkey_on_stack sk; + struct bkey_buf sk; struct keylist *keys = &op->insert_keys; struct bkey_i *k = bch2_keylist_front(keys); struct btree_trans trans; - struct btree_iter *iter; + struct btree_iter iter; + subvol_inum inum = { + .subvol = op->subvol, + .inum = k->k.p.inode, + }; int ret; - bkey_on_stack_init(&sk); - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + BUG_ON(!inum.subvol); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - bkey_start_pos(&k->k), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); do { bch2_trans_begin(&trans); k = bch2_keylist_front(keys); + bch2_bkey_buf_copy(&sk, c, k); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, + &sk.k->k.p.snapshot); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bkey_start_pos(&sk.k->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - bkey_on_stack_realloc(&sk, c, k->k.u64s); - bkey_copy(sk.k, k); - bch2_cut_front(iter->pos, sk.k); + ret = bch2_extent_update(&trans, inum, &iter, sk.k, + &op->res, + op->new_i_size, &op->i_sectors_delta, + op->flags & BCH_WRITE_CHECK_ENOSPC); + bch2_trans_iter_exit(&trans, &iter); - ret = 
bch2_extent_update(&trans, iter, sk.k, - &op->res, op_journal_seq(op), - op->new_i_size, &op->i_sectors_delta); - if (ret == -EINTR) + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; - if (bkey_cmp(iter->pos, k->k.p) >= 0) - bch2_keylist_pop_front(keys); + if (bkey_ge(iter.pos, k->k.p)) + bch2_keylist_pop_front(&op->insert_keys); + else + bch2_cut_front(iter.pos, k); } while (!bch2_keylist_empty(keys)); bch2_trans_exit(&trans); - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); return ret; } @@ -429,7 +644,8 @@ int bch2_write_index_default(struct bch_write_op *op) void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, enum bch_data_type type, - const struct bkey_i *k) + const struct bkey_i *k, + bool nocow) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); const struct bch_extent_ptr *ptr; @@ -445,8 +661,8 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, ca = bch_dev_bkey_exists(c, ptr->dev); if (to_entry(ptr + 1) < ptrs.end) { - n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO, - &ca->replica_set)); + n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, + GFP_NOIO, &ca->replica_set)); n->bio.bi_end_io = wbio->bio.bi_end_io; n->bio.bi_private = wbio->bio.bi_private; @@ -463,18 +679,24 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, n->c = c; n->dev = ptr->dev; - n->have_ioref = bch2_dev_get_ioref(ca, WRITE); + n->have_ioref = nocow || bch2_dev_get_ioref(ca, + type == BCH_DATA_btree ? READ : WRITE); + n->nocow = nocow; n->submit_time = local_clock(); + n->inode_offset = bkey_start_offset(&k->k); n->bio.bi_iter.bi_sector = ptr->offset; - if (!journal_flushes_device(ca)) - n->bio.bi_opf |= REQ_FUA; - if (likely(n->have_ioref)) { this_cpu_add(ca->io_done->sectors[WRITE][type], bio_sectors(&n->bio)); bio_set_dev(&n->bio, ca->disk_sb.bdev); + + if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) { + bio_endio(&n->bio); + continue; + } + submit_bio(&n->bio); } else { n->bio.bi_status = BLK_STS_REMOVED; @@ -483,43 +705,31 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, } } -static void __bch2_write(struct closure *); +static void __bch2_write(struct bch_write_op *); static void bch2_write_done(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; - if (!op->error && (op->flags & BCH_WRITE_FLUSH)) - op->error = bch2_journal_error(&c->journal); - - if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) - bch2_disk_reservation_put(c, &op->res); - percpu_ref_put(&c->writes); + bch2_disk_reservation_put(c, &op->res); + if (!(op->flags & BCH_WRITE_MOVE)) + bch2_write_ref_put(c, BCH_WRITE_REF_write); bch2_keylist_free(&op->insert_keys, op->inline_keys); bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); - if (op->end_io) { - EBUG_ON(cl->parent); - closure_debug_destroy(cl); + EBUG_ON(cl->parent); + closure_debug_destroy(cl); + if (op->end_io) op->end_io(op); - } else { - closure_return(cl); - } } -/** - * bch_write_index - after a write, update index to point to new data - */ -static void __bch2_write_index(struct bch_write_op *op) +static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) { - struct bch_fs *c = op->c; struct keylist *keys = &op->insert_keys; struct bch_extent_ptr *ptr; - struct bkey_i *src, *dst = keys->keys, *n, *k; - unsigned dev; - int ret; + struct bkey_i *src, *dst = keys->keys, *n; for (src = keys->keys; src != 
keys->top; src = n) { n = bkey_next(src); @@ -528,45 +738,67 @@ static void __bch2_write_index(struct bch_write_op *op) bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, test_bit(ptr->dev, op->failed.d)); - if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) { - ret = -EIO; - goto err; - } + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) + return -EIO; } if (dst != src) - memmove_u64s_down(dst, src, src->u64s); + memmove_u64s_down(dst, src, src->k.u64s); dst = bkey_next(dst); } keys->top = dst; + return 0; +} + +/** + * bch_write_index - after a write, update index to point to new data + */ +static void __bch2_write_index(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct keylist *keys = &op->insert_keys; + struct bkey_i *k; + unsigned dev; + int ret = 0; + + if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { + ret = bch2_write_drop_io_error_ptrs(op); + if (ret) + goto err; + } /* * probably not the ideal place to hook this in, but I don't * particularly want to plumb io_opts all the way through the btree * update stack right now */ - for_each_keylist_key(keys, k) { + for_each_keylist_key(keys, k) bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); - if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k))) - bch2_check_set_feature(op->c, BCH_FEATURE_incompressible); - - } - if (!bch2_keylist_empty(keys)) { u64 sectors_start = keylist_sectors(keys); - int ret = op->index_update_fn(op); - BUG_ON(ret == -EINTR); + ret = !(op->flags & BCH_WRITE_MOVE) + ? bch2_write_index_default(op) + : bch2_data_update_index_update(op); + + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); BUG_ON(keylist_sectors(keys) && !ret); op->written += sectors_start - keylist_sectors(keys); - if (ret) { - __bcache_io_error(c, "btree IO error %i", ret); - op->error = ret; + if (ret && !bch2_err_matches(ret, EROFS)) { + struct bkey_i *k = bch2_keylist_front(&op->insert_keys); + + bch_err_inum_offset_ratelimited(c, + k->k.p.inode, k->k.p.offset << 9, + "write error while doing btree update: %s", + bch2_err_str(ret)); } + + if (ret) + goto err; } out: /* If some a bucket wasn't written, we can't erasure code it: */ @@ -578,23 +810,90 @@ out: err: keys->top = keys->keys; op->error = ret; + op->flags |= BCH_WRITE_DONE; goto out; } +static inline void __wp_update_state(struct write_point *wp, enum write_point_state state) +{ + if (state != wp->state) { + u64 now = ktime_get_ns(); + + if (wp->last_state_change && + time_after64(now, wp->last_state_change)) + wp->time[wp->state] += now - wp->last_state_change; + wp->state = state; + wp->last_state_change = now; + } +} + +static inline void wp_update_state(struct write_point *wp, bool running) +{ + enum write_point_state state; + + state = running ? WRITE_POINT_running : + !list_empty(&wp->writes) ? 
WRITE_POINT_waiting_io + : WRITE_POINT_stopped; + + __wp_update_state(wp, state); +} + static void bch2_write_index(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bch_fs *c = op->c; + struct write_point *wp = op->wp; + struct workqueue_struct *wq = index_update_wq(op); + unsigned long flags; - __bch2_write_index(op); + if ((op->flags & BCH_WRITE_DONE) && + (op->flags & BCH_WRITE_MOVE)) + bch2_bio_free_pages_pool(op->c, &op->wbio.bio); - if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { - bch2_journal_flush_seq_async(&c->journal, - *op_journal_seq(op), - cl); - continue_at(cl, bch2_write_done, index_update_wq(op)); - } else { - continue_at_nobarrier(cl, bch2_write_done, NULL); + spin_lock_irqsave(&wp->writes_lock, flags); + if (wp->state == WRITE_POINT_waiting_io) + __wp_update_state(wp, WRITE_POINT_waiting_work); + list_add_tail(&op->wp_list, &wp->writes); + spin_unlock_irqrestore (&wp->writes_lock, flags); + + queue_work(wq, &wp->index_update_work); +} + +static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp) +{ + op->wp = wp; + + if (wp->state == WRITE_POINT_stopped) { + spin_lock_irq(&wp->writes_lock); + __wp_update_state(wp, WRITE_POINT_waiting_io); + spin_unlock_irq(&wp->writes_lock); + } +} + +void bch2_write_point_do_index_updates(struct work_struct *work) +{ + struct write_point *wp = + container_of(work, struct write_point, index_update_work); + struct bch_write_op *op; + + while (1) { + spin_lock_irq(&wp->writes_lock); + op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list); + if (op) + list_del(&op->wp_list); + wp_update_state(wp, op != NULL); + spin_unlock_irq(&wp->writes_lock); + + if (!op) + break; + + op->flags |= BCH_WRITE_IN_WORKER; + + __bch2_write_index(op); + + if (!(op->flags & BCH_WRITE_DONE)) + __bch2_write(op); + else + bch2_write_done(&op->cl); } } @@ -607,8 +906,17 @@ static void bch2_write_endio(struct bio *bio) struct bch_fs *c = wbio->c; struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); - if (bch2_dev_io_err_on(bio->bi_status, ca, "data write")) + if (bch2_dev_inum_io_err_on(bio->bi_status, ca, + op->pos.inode, + wbio->inode_offset << 9, + "data write error: %s", + bch2_blk_status_to_str(bio->bi_status))) { set_bit(wbio->dev, op->failed.d); + op->flags |= BCH_WRITE_IO_ERROR; + } + + if (wbio->nocow) + set_bit(wbio->dev, op->devs_need_flush->d); if (wbio->have_ioref) { bch2_latency_acct(ca, wbio->submit_time, WRITE); @@ -623,10 +931,8 @@ static void bch2_write_endio(struct bio *bio) if (parent) bio_endio(&parent->bio); - else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT)) - closure_put(cl); else - continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op)); + closure_put(cl); } static void init_append_extent(struct bch_write_op *op, @@ -634,13 +940,8 @@ static void init_append_extent(struct bch_write_op *op, struct bversion version, struct bch_extent_crc_unpacked crc) { - struct bch_fs *c = op->c; struct bkey_i_extent *e; - struct open_bucket *ob; - unsigned i; - BUG_ON(crc.compressed_size > wp->sectors_free); - wp->sectors_free -= crc.compressed_size; op->pos.offset += crc.uncompressed_size; e = bkey_extent_init(op->insert_keys.top); @@ -653,22 +954,8 @@ static void init_append_extent(struct bch_write_op *op, crc.nonce) bch2_extent_crc_append(&e->k_i, crc); - open_bucket_for_each(c, &wp->ptrs, ob, i) { - struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); - union bch_extent_entry *end = - bkey_val_end(bkey_i_to_s(&e->k_i)); - - end->ptr = 
ob->ptr; - end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; - end->ptr.cached = !ca->mi.durability || - (op->flags & BCH_WRITE_CACHED) != 0; - end->ptr.offset += ca->mi.bucket_size - ob->sectors_free; - - e->k.u64s++; - - BUG_ON(crc.compressed_size > ob->sectors_free); - ob->sectors_free -= crc.compressed_size; - } + bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, + op->flags & BCH_WRITE_CACHED); bch2_keylist_push(&op->insert_keys); } @@ -688,7 +975,10 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, ? ((unsigned long) buf & (PAGE_SIZE - 1)) : 0), PAGE_SIZE); - bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write); + pages = min(pages, BIO_MAX_VECS); + + bio = bio_alloc_bioset(NULL, pages, 0, + GFP_NOIO, &c->bio_write); wbio = wbio_init(bio); wbio->put_bio = true; /* copy WRITE_SYNC flag */ @@ -707,7 +997,7 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c, */ bch2_bio_alloc_pages_pool(c, bio, min_t(unsigned, output_available, - c->sb.encoded_extent_max << 9)); + c->opts.encoded_extent_max)); if (bio->bi_iter.bi_size < output_available) *page_alloc_failed = @@ -751,6 +1041,7 @@ static int bch2_write_decrypt(struct bch_write_op *op) struct bch_fs *c = op->c; struct nonce nonce = extent_nonce(op->version, op->crc); struct bch_csum csum; + int ret; if (!bch2_csum_type_is_encryption(op->crc.csum_type)) return 0; @@ -765,10 +1056,10 @@ static int bch2_write_decrypt(struct bch_write_op *op) if (bch2_crc_cmp(op->crc.csum, csum)) return -EIO; - bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); + ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); op->crc.csum_type = 0; op->crc.csum = (struct bch_csum) { 0, 0 }; - return 0; + return ret; } static enum prep_encoded_ret { @@ -853,7 +1144,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, struct bio *src = &op->wbio.bio, *dst = src; struct bvec_iter saved_iter; void *ec_buf; - struct bpos ec_pos = op->pos; unsigned total_output = 0, total_input = 0; bool bounce = false; bool page_alloc_failed = false; @@ -870,7 +1160,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, ret = -EIO; goto err; case PREP_ENCODED_CHECKSUM_ERR: - BUG(); goto csum_err; case PREP_ENCODED_DO_WRITE: /* XXX look for bug here */ @@ -900,14 +1189,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, saved_iter = dst->bi_iter; do { - struct bch_extent_crc_unpacked crc = - (struct bch_extent_crc_unpacked) { 0 }; + struct bch_extent_crc_unpacked crc = { 0 }; struct bversion version = op->version; size_t dst_len, src_len; if (page_alloc_failed && - bio_sectors(dst) < wp->sectors_free && - bio_sectors(dst) < c->sb.encoded_extent_max) + dst->bi_iter.bi_size < (wp->sectors_free << 9) && + dst->bi_iter.bi_size < c->opts.encoded_extent_max) break; BUG_ON(op->compression_type && @@ -927,7 +1215,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, if (op->csum_type) dst_len = min_t(unsigned, dst_len, - c->sb.encoded_extent_max << 9); + c->opts.encoded_extent_max); if (bounce) { swap(dst->bi_iter.bi_size, dst_len); @@ -953,6 +1241,8 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, !crc_is_compressed(crc) && bch2_csum_type_is_encryption(op->crc.csum_type) == bch2_csum_type_is_encryption(op->csum_type)) { + u8 compression_type = crc.compression_type; + u16 nonce = crc.nonce; /* * Note: when we're using rechecksum(), we need to be * checksumming @src because it has all 
the data our @@ -971,6 +1261,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, bio_sectors(src) - (src_len >> 9), op->csum_type)) goto csum_err; + /* + * rchecksum_bio sets compression_type on crc from op->crc, + * this isn't always correct as sometimes we're changing + * an extent from uncompressed to incompressible. + */ + crc.compression_type = compression_type; + crc.nonce = nonce; } else { if ((op->flags & BCH_WRITE_DATA_ENCODED) && bch2_rechecksum_bio(c, src, version, op->crc, @@ -985,8 +1282,11 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, crc.live_size = src_len >> 9; swap(dst->bi_iter.bi_size, dst_len); - bch2_encrypt_bio(c, op->csum_type, - extent_nonce(version, crc), dst); + ret = bch2_encrypt_bio(c, op->csum_type, + extent_nonce(version, crc), dst); + if (ret) + goto err; + crc.csum = bch2_checksum_bio(c, op->csum_type, extent_nonce(version, crc), dst); crc.csum_type = op->csum_type; @@ -1022,34 +1322,350 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, dst->bi_opf = src->bi_opf; } - dst->bi_iter.bi_size = total_output; -do_write: - /* might have done a realloc... */ - bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9); + dst->bi_iter.bi_size = total_output; +do_write: + *_dst = dst; + return more; +csum_err: + bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)"); + ret = -EIO; +err: + if (to_wbio(dst)->bounce) + bch2_bio_free_pages_pool(c, dst); + if (to_wbio(dst)->put_bio) + bio_put(dst); + + return ret; +} + +static bool bch2_extent_is_writeable(struct bch_write_op *op, + struct bkey_s_c k) +{ + struct bch_fs *c = op->c; + struct bkey_s_c_extent e; + struct extent_ptr_decoded p; + const union bch_extent_entry *entry; + unsigned replicas = 0; + + if (k.k->type != KEY_TYPE_extent) + return false; + + e = bkey_s_c_to_extent(k); + extent_for_each_ptr_decode(e, p, entry) { + if (p.crc.csum_type || + crc_is_compressed(p.crc) || + p.has_ec) + return false; + + replicas += bch2_extent_ptr_durability(c, &p); + } + + return replicas >= op->opts.data_replicas; +} + +static inline void bch2_nocow_write_unlock(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + const struct bch_extent_ptr *ptr; + struct bkey_i *k; + + for_each_keylist_key(&op->insert_keys, k) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); + + bkey_for_each_ptr(ptrs, ptr) + bch2_bucket_nocow_unlock(&c->nocow_locks, + PTR_BUCKET_POS(c, ptr), + BUCKET_NOCOW_LOCK_UPDATE); + } +} + +static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *orig, + struct bkey_s_c k, + u64 new_i_size) +{ + struct bkey_i *new; + struct bkey_ptrs ptrs; + struct bch_extent_ptr *ptr; + int ret; + + if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) { + /* trace this */ + return 0; + } + + new = bch2_bkey_make_mut(trans, k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + bch2_cut_front(bkey_start_pos(&orig->k), new); + bch2_cut_back(orig->k.p, new); + + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_for_each_ptr(ptrs, ptr) + ptr->unwritten = 0; + + /* + * Note that we're not calling bch2_subvol_get_snapshot() in this path - + * that was done when we kicked off the write, and here it's important + * that we update the extent that we wrote to - even if a snapshot has + * since been created. The write is still outstanding, so we're ok + * w.r.t. 
snapshot atomicity: + */ + return bch2_extent_update_i_size_sectors(trans, iter, + min(new->k.p.offset << 9, new_i_size), 0) ?: + bch2_trans_update(trans, iter, new, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +} + +static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_i *orig; + struct bkey_s_c k; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_keylist_key(&op->insert_keys, orig) { + ret = for_each_btree_key_upto_commit(&trans, iter, BTREE_ID_extents, + bkey_start_pos(&orig->k), orig->k.p, + BTREE_ITER_INTENT, k, + NULL, NULL, BTREE_INSERT_NOFAIL, ({ + bch2_nocow_write_convert_one_unwritten(&trans, &iter, orig, k, op->new_i_size); + })); + + if (ret && !bch2_err_matches(ret, EROFS)) { + struct bkey_i *k = bch2_keylist_front(&op->insert_keys); + + bch_err_inum_offset_ratelimited(c, + k->k.p.inode, k->k.p.offset << 9, + "write error while doing btree update: %s", + bch2_err_str(ret)); + } + + if (ret) { + op->error = ret; + break; + } + } + + bch2_trans_exit(&trans); +} + +static void __bch2_nocow_write_done(struct bch_write_op *op) +{ + bch2_nocow_write_unlock(op); + + if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { + op->error = -EIO; + } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) + bch2_nocow_write_convert_unwritten(op); +} + +static void bch2_nocow_write_done(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + + __bch2_nocow_write_done(op); + bch2_write_done(cl); +} + +static void bch2_nocow_write(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_ptrs_c ptrs; + const struct bch_extent_ptr *ptr; + struct { + struct bpos b; + unsigned gen; + struct nocow_lock_bucket *l; + } buckets[BCH_REPLICAS_MAX]; + unsigned nr_buckets = 0; + u32 snapshot; + int ret, i; + + if (op->flags & BCH_WRITE_MOVE) + return; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, op->subvol, &snapshot); + if (unlikely(ret)) + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(op->pos.inode, op->pos.offset, snapshot), + BTREE_ITER_SLOTS); + while (1) { + struct bio *bio = &op->wbio.bio; + + nr_buckets = 0; + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + break; + + /* fall back to normal cow write path? 
*/ + if (unlikely(k.k->p.snapshot != snapshot || + !bch2_extent_is_writeable(op, k))) + break; + + if (bch2_keylist_realloc(&op->insert_keys, + op->inline_keys, + ARRAY_SIZE(op->inline_keys), + k.k->u64s)) + break; + + /* Get iorefs before dropping btree locks: */ + ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr(ptrs, ptr) { + buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr); + buckets[nr_buckets].gen = ptr->gen; + buckets[nr_buckets].l = + bucket_nocow_lock(&c->nocow_locks, + bucket_to_u64(buckets[nr_buckets].b)); + + prefetch(buckets[nr_buckets].l); + + if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE))) + goto err_get_ioref; + + nr_buckets++; + + if (ptr->unwritten) + op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; + } + + /* Unlock before taking nocow locks, doing IO: */ + bkey_reassemble(op->insert_keys.top, k); + bch2_trans_unlock(&trans); + + bch2_cut_front(op->pos, op->insert_keys.top); + if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) + bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); + + for (i = 0; i < nr_buckets; i++) { + struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode); + struct nocow_lock_bucket *l = buckets[i].l; + bool stale; + + __bch2_bucket_nocow_lock(&c->nocow_locks, l, + bucket_to_u64(buckets[i].b), + BUCKET_NOCOW_LOCK_UPDATE); + + rcu_read_lock(); + stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen); + rcu_read_unlock(); + + if (unlikely(stale)) + goto err_bucket_stale; + } + + bio = &op->wbio.bio; + if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) { + bio = bio_split(bio, k.k->p.offset - op->pos.offset, + GFP_KERNEL, &c->bio_write); + wbio_init(bio)->put_bio = true; + bio->bi_opf = op->wbio.bio.bi_opf; + } else { + op->flags |= BCH_WRITE_DONE; + } + + op->pos.offset += bio_sectors(bio); + op->written += bio_sectors(bio); + + bio->bi_end_io = bch2_write_endio; + bio->bi_private = &op->cl; + bio->bi_opf |= REQ_OP_WRITE; + closure_get(&op->cl); + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, + op->insert_keys.top, true); + + bch2_keylist_push(&op->insert_keys); + if (op->flags & BCH_WRITE_DONE) + break; + bch2_btree_iter_advance(&iter); + } +out: + bch2_trans_iter_exit(&trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + if (ret) { + bch_err_inum_offset_ratelimited(c, + op->pos.inode, + op->pos.offset << 9, + "%s: btree lookup error %s", + __func__, bch2_err_str(ret)); + op->error = ret; + op->flags |= BCH_WRITE_DONE; + } + + bch2_trans_exit(&trans); - *_dst = dst; - return more; -csum_err: - bch_err(c, "error verifying existing checksum while " - "rewriting existing data (memory corruption?)"); - ret = -EIO; -err: - if (to_wbio(dst)->bounce) - bch2_bio_free_pages_pool(c, dst); - if (to_wbio(dst)->put_bio) - bio_put(dst); + /* fallback to cow write path? 
*/ + if (!(op->flags & BCH_WRITE_DONE)) { + closure_sync(&op->cl); + __bch2_nocow_write_done(op); + op->insert_keys.top = op->insert_keys.keys; + } else if (op->flags & BCH_WRITE_SYNC) { + closure_sync(&op->cl); + bch2_nocow_write_done(&op->cl); + } else { + /* + * XXX + * needs to run out of process context because ei_quota_lock is + * a mutex + */ + continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op)); + } + return; +err_get_ioref: + for (i = 0; i < nr_buckets; i++) + percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); - return ret; + /* Fall back to COW path: */ + goto out; +err_bucket_stale: + while (--i >= 0) + bch2_bucket_nocow_unlock(&c->nocow_locks, + buckets[i].b, + BUCKET_NOCOW_LOCK_UPDATE); + for (i = 0; i < nr_buckets; i++) + percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); + + /* We can retry this: */ + ret = BCH_ERR_transaction_restart; + goto out; } -static void __bch2_write(struct closure *cl) +static void __bch2_write(struct bch_write_op *op) { - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; - struct write_point *wp; - struct bio *bio; - bool skip_put = true; + struct write_point *wp = NULL; + struct bio *bio = NULL; + unsigned nofs_flags; int ret; + + nofs_flags = memalloc_nofs_save(); + + if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { + bch2_nocow_write(op); + if (op->flags & BCH_WRITE_DONE) + goto out_nofs_restore; + } again: memset(&op->failed, 0, sizeof(op->failed)); @@ -1061,99 +1677,103 @@ again: /* +1 for possible cache device: */ if (op->open_buckets.nr + op->nr_replicas + 1 > ARRAY_SIZE(op->open_buckets.v)) - goto flush_io; + break; if (bch2_keylist_realloc(&op->insert_keys, op->inline_keys, ARRAY_SIZE(op->inline_keys), BKEY_EXTENT_U64s_MAX)) - goto flush_io; + break; + + /* + * The copygc thread is now global, which means it's no longer + * freeing up space on specific disks, which means that + * allocations for specific disks may hang arbitrarily long: + */ + ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_alloc_sectors_start_trans(&trans, + op->target, + op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), + op->write_point, + &op->devs_have, + op->nr_replicas, + op->nr_replicas_required, + op->alloc_reserve, + op->flags, + (op->flags & (BCH_WRITE_ALLOC_NOWAIT| + BCH_WRITE_ONLY_SPECIFIED_DEVS)) + ? NULL : &op->cl, &wp)); + if (unlikely(ret)) { + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) + break; - if ((op->flags & BCH_WRITE_FROM_INTERNAL) && - percpu_ref_is_dying(&c->writes)) { - ret = -EROFS; goto err; } - wp = bch2_alloc_sectors_start(c, - op->target, - op->opts.erasure_code, - op->write_point, - &op->devs_have, - op->nr_replicas, - op->nr_replicas_required, - op->alloc_reserve, - op->flags, - (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? 
NULL : cl); EBUG_ON(!wp); - if (unlikely(IS_ERR(wp))) { - if (unlikely(PTR_ERR(wp) != -EAGAIN)) { - ret = PTR_ERR(wp); - goto err; - } - - goto flush_io; - } - bch2_open_bucket_get(c, wp, &op->open_buckets); ret = bch2_write_extent(op, wp, &bio); - bch2_alloc_sectors_done(c, wp); - if (ret < 0) - goto err; + bch2_alloc_sectors_done_inlined(c, wp); +err: + if (ret <= 0) { + op->flags |= BCH_WRITE_DONE; - if (ret) - skip_put = false; + if (ret < 0) { + op->error = ret; + break; + } + } bio->bi_end_io = bch2_write_endio; bio->bi_private = &op->cl; bio->bi_opf |= REQ_OP_WRITE; - if (!skip_put) - closure_get(bio->bi_private); - else - op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT; + closure_get(bio->bi_private); key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); - bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER, - key_to_write); + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, + key_to_write, false); } while (ret); - if (!skip_put) - continue_at(cl, bch2_write_index, index_update_wq(op)); - return; -err: - op->error = ret; - - continue_at(cl, bch2_write_index, index_update_wq(op)); - return; -flush_io: - closure_sync(cl); - - if (!bch2_keylist_empty(&op->insert_keys)) { + /* + * Sync or no? + * + * If we're running asynchronously, wne may still want to block + * synchronously here if we weren't able to submit all of the IO at + * once, as that signals backpressure to the caller. + */ + if ((op->flags & BCH_WRITE_SYNC) || + (!(op->flags & BCH_WRITE_DONE) && + !(op->flags & BCH_WRITE_IN_WORKER))) { + closure_sync(&op->cl); __bch2_write_index(op); - if (op->error) { - continue_at_nobarrier(cl, bch2_write_done, NULL); - return; - } + if (!(op->flags & BCH_WRITE_DONE)) + goto again; + bch2_write_done(&op->cl); + } else { + bch2_write_queue(op, wp); + continue_at(&op->cl, bch2_write_index, NULL); } - - goto again; +out_nofs_restore: + memalloc_nofs_restore(nofs_flags); } static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) { - struct closure *cl = &op->cl; struct bio *bio = &op->wbio.bio; struct bvec_iter iter; struct bkey_i_inline_data *id; unsigned sectors; int ret; + op->flags |= BCH_WRITE_WROTE_DATA_INLINE; + op->flags |= BCH_WRITE_DONE; + bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, @@ -1181,9 +1801,7 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) set_bkey_val_bytes(&id->k, data_len); bch2_keylist_push(&op->insert_keys); - op->flags |= BCH_WRITE_WROTE_DATA_INLINE; - continue_at_nobarrier(cl, bch2_write_index, NULL); - return; + __bch2_write_index(op); err: bch2_write_done(&op->cl); } @@ -1211,28 +1829,36 @@ void bch2_write(struct closure *cl) struct bch_fs *c = op->c; unsigned data_len; + EBUG_ON(op->cl.parent); BUG_ON(!op->nr_replicas); BUG_ON(!op->write_point.v); - BUG_ON(!bkey_cmp(op->pos, POS_MAX)); + BUG_ON(bkey_eq(op->pos, POS_MAX)); op->start_time = local_clock(); bch2_keylist_init(&op->insert_keys, op->inline_keys); wbio_init(bio)->put_bio = false; - if (bio_sectors(bio) & (c->opts.block_size - 1)) { - __bcache_io_error(c, "misaligned write"); + if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) { + bch_err_inum_offset_ratelimited(c, + op->pos.inode, + op->pos.offset << 9, + "misaligned write"); op->error = -EIO; goto err; } - if (c->opts.nochanges || - !percpu_ref_tryget(&c->writes)) { - if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) - __bcache_io_error(c, "read only"); - op->error = -EROFS; + if (c->opts.nochanges) { + 
op->error = -BCH_ERR_erofs_no_writes; + goto err; + } + + if (!(op->flags & BCH_WRITE_MOVE) && + !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { + op->error = -BCH_ERR_erofs_no_writes; goto err; } + this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); bch2_increment_clock(c, bio_sectors(bio), WRITE); data_len = min_t(u64, bio->bi_iter.bi_size, @@ -1244,32 +1870,54 @@ void bch2_write(struct closure *cl) return; } - continue_at_nobarrier(cl, __bch2_write, NULL); + __bch2_write(op); return; err: - if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION)) - bch2_disk_reservation_put(c, &op->res); + bch2_disk_reservation_put(c, &op->res); - if (op->end_io) { - EBUG_ON(cl->parent); - closure_debug_destroy(cl); + closure_debug_destroy(&op->cl); + if (op->end_io) op->end_io(op); - } else { - closure_return(cl); - } +} + +const char * const bch2_write_flags[] = { +#define x(f) #f, + BCH_WRITE_FLAGS() +#undef x + NULL +}; + +void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) +{ + prt_str(out, "pos: "); + bch2_bpos_to_text(out, op->pos); + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_str(out, "started: "); + bch2_pr_time_units(out, local_clock() - op->start_time); + prt_newline(out); + + prt_str(out, "flags: "); + prt_bitflags(out, bch2_write_flags, op->flags); + prt_newline(out); + + prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl)); + prt_newline(out); + + printbuf_indent_sub(out, 2); } /* Cache promotion on read */ struct promote_op { - struct closure cl; struct rcu_head rcu; u64 start_time; struct rhash_head hash; struct bpos pos; - struct migrate_write write; + struct data_update write; struct bio_vec bi_inline_vecs[0]; /* must be last */ }; @@ -1293,6 +1941,9 @@ static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, if (bch2_bkey_has_target(c, k, opts.promote_target)) return false; + if (bkey_extent_is_unwritten(k)) + return false; + if (bch2_target_congested(c, opts.promote_target)) { /* XXX trace this */ return false; @@ -1309,33 +1960,31 @@ static void promote_free(struct bch_fs *c, struct promote_op *op) { int ret; + bch2_data_update_exit(&op->write); + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, bch_promote_params); BUG_ON(ret); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_promote); kfree_rcu(op, rcu); } -static void promote_done(struct closure *cl) +static void promote_done(struct bch_write_op *wop) { struct promote_op *op = - container_of(cl, struct promote_op, cl); + container_of(wop, struct promote_op, write.op); struct bch_fs *c = op->write.op.c; bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time); - - bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio); promote_free(c, op); } static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) { - struct bch_fs *c = rbio->c; - struct closure *cl = &op->cl; struct bio *bio = &op->write.op.wbio.bio; - trace_promote(&rbio->bio); + trace_and_count(op->write.op.c, read_promote, &rbio->bio); /* we now own pages: */ BUG_ON(!rbio->bounce); @@ -1345,14 +1994,10 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) sizeof(struct bio_vec) * rbio->bio.bi_vcnt); swap(bio->bi_vcnt, rbio->bio.bi_vcnt); - bch2_migrate_read_done(&op->write, rbio); - - closure_init(cl, NULL); - closure_call(&op->write.op.cl, bch2_write, c->wq, cl); - closure_return_with_destructor(cl, promote_done); + bch2_data_update_read_done(&op->write, rbio->pick.crc); } -static struct promote_op *__promote_alloc(struct bch_fs 
*c, +static struct promote_op *__promote_alloc(struct btree_trans *trans, enum btree_id btree_id, struct bkey_s_c k, struct bpos pos, @@ -1361,12 +2006,13 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, unsigned sectors, struct bch_read_bio **rbio) { + struct bch_fs *c = trans->c; struct promote_op *op = NULL; struct bio *bio; unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); int ret; - if (!percpu_ref_tryget(&c->writes)) + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) return NULL; op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO); @@ -1387,7 +2033,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, goto err; rbio_init(&(*rbio)->bio, opts); - bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, pages); + bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_NOIO)) @@ -1402,17 +2048,26 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, goto err; bio = &op->write.op.wbio.bio; - bio_init(bio, bio->bi_inline_vecs, pages); + bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); - ret = bch2_migrate_write_init(c, &op->write, + ret = bch2_data_update_init(trans, NULL, &op->write, writepoint_hashed((unsigned long) current), opts, - DATA_PROMOTE, - (struct data_opts) { - .target = opts.promote_target + (struct data_update_opts) { + .target = opts.promote_target, + .extra_replicas = 1, + .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED, }, btree_id, k); + if (ret == -BCH_ERR_nocow_lock_blocked) { + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); + goto err; + } + BUG_ON(ret); + op->write.op.end_io = promote_done; return op; err: @@ -1421,21 +2076,22 @@ err: kfree(*rbio); *rbio = NULL; kfree(op); - percpu_ref_put(&c->writes); + bch2_write_ref_put(c, BCH_WRITE_REF_promote); return NULL; } noinline -static struct promote_op *promote_alloc(struct bch_fs *c, - struct bvec_iter iter, - struct bkey_s_c k, - struct extent_ptr_decoded *pick, - struct bch_io_opts opts, - unsigned flags, - struct bch_read_bio **rbio, - bool *bounce, - bool *read_full) +static struct promote_op *promote_alloc(struct btree_trans *trans, + struct bvec_iter iter, + struct bkey_s_c k, + struct extent_ptr_decoded *pick, + struct bch_io_opts opts, + unsigned flags, + struct bch_read_bio **rbio, + bool *bounce, + bool *read_full) { + struct bch_fs *c = trans->c; bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); /* data might have to be decompressed in the write path: */ unsigned sectors = promote_full @@ -1449,10 +2105,10 @@ static struct promote_op *promote_alloc(struct bch_fs *c, if (!should_promote(c, k, pos, opts, flags)) return NULL; - promote = __promote_alloc(c, + promote = __promote_alloc(trans, k.k->type == KEY_TYPE_reflink_v - ? BTREE_ID_REFLINK - : BTREE_ID_EXTENTS, + ? 
BTREE_ID_reflink + : BTREE_ID_extents, k, pos, pick, opts, sectors, rbio); if (!promote) return NULL; @@ -1532,133 +2188,63 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) } static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, + struct bvec_iter bvec_iter, struct bch_io_failures *failed, unsigned flags) { struct btree_trans trans; - struct btree_iter *iter; - struct bkey_on_stack sk; + struct btree_iter iter; + struct bkey_buf sk; struct bkey_s_c k; int ret; flags &= ~BCH_READ_LAST_FRAGMENT; flags |= BCH_READ_MUST_CLONE; - bkey_on_stack_init(&sk); + bch2_bkey_buf_init(&sk); bch2_trans_init(&trans, c, 0, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - rbio->pos, BTREE_ITER_SLOTS); + bch2_trans_iter_init(&trans, &iter, rbio->data_btree, + rbio->read_pos, BTREE_ITER_SLOTS); retry: rbio->bio.bi_status = 0; - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(&iter); if (bkey_err(k)) goto err; - bkey_on_stack_reassemble(&sk, c, k); + bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); bch2_trans_unlock(&trans); if (!bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, - rbio->pos.offset - + rbio->data_pos.offset - rbio->pick.crc.offset)) { /* extent we wanted to read no longer exists: */ rbio->hole = true; goto out; } - ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags); + ret = __bch2_read_extent(&trans, rbio, bvec_iter, + rbio->read_pos, + rbio->data_btree, + k, 0, failed, flags); if (ret == READ_RETRY) goto retry; if (ret) goto err; out: bch2_rbio_done(rbio); + bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); - bkey_on_stack_exit(&sk, c); + bch2_bkey_buf_exit(&sk, c); return; err: rbio->bio.bi_status = BLK_STS_IOERR; goto out; } -static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, - struct bch_io_failures *failed, unsigned flags) -{ - struct btree_trans trans; - struct btree_iter *iter; - struct bkey_on_stack sk; - struct bkey_s_c k; - int ret; - - flags &= ~BCH_READ_LAST_FRAGMENT; - flags |= BCH_READ_MUST_CLONE; - - bkey_on_stack_init(&sk); - bch2_trans_init(&trans, c, 0, 0); -retry: - bch2_trans_begin(&trans); - - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, - POS(inode, bvec_iter.bi_sector), - BTREE_ITER_SLOTS, k, ret) { - unsigned bytes, sectors, offset_into_extent; - - bkey_on_stack_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - - offset_into_extent = iter->pos.offset - - bkey_start_offset(k.k); - sectors = k.k->size - offset_into_extent; - - ret = bch2_read_indirect_extent(&trans, - &offset_into_extent, sk.k); - if (ret) - break; - - sectors = min(sectors, k.k->size - offset_into_extent); - - bch2_trans_unlock(&trans); - - bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; - swap(bvec_iter.bi_size, bytes); - - ret = __bch2_read_extent(c, rbio, bvec_iter, k, - offset_into_extent, failed, flags); - switch (ret) { - case READ_RETRY: - goto retry; - case READ_ERR: - goto err; - }; - - if (bytes == bvec_iter.bi_size) - goto out; - - swap(bvec_iter.bi_size, bytes); - bio_advance_iter(&rbio->bio, &bvec_iter, bytes); - } - - if (ret == -EINTR) - goto retry; - /* - * If we get here, it better have been because there was an error - * reading a btree node - */ - BUG_ON(!ret); - __bcache_io_error(c, "btree IO error: %i", ret); -err: - rbio->bio.bi_status = BLK_STS_IOERR; -out: - bch2_trans_exit(&trans); - bkey_on_stack_exit(&sk, c); - bch2_rbio_done(rbio); -} - static void 
bch2_rbio_retry(struct work_struct *work) { struct bch_read_bio *rbio = @@ -1666,10 +2252,13 @@ static void bch2_rbio_retry(struct work_struct *work) struct bch_fs *c = rbio->c; struct bvec_iter iter = rbio->bvec_iter; unsigned flags = rbio->flags; - u64 inode = rbio->pos.inode; + subvol_inum inum = { + .subvol = rbio->subvol, + .inum = rbio->read_pos.inode, + }; struct bch_io_failures failed = { .nr = 0 }; - trace_read_retry(&rbio->bio); + trace_and_count(c, read_retry, &rbio->bio); if (rbio->retry == READ_RETRY_AVOID) bch2_mark_io_failure(&failed, &rbio->pick); @@ -1681,10 +2270,14 @@ static void bch2_rbio_retry(struct work_struct *work) flags |= BCH_READ_IN_RETRY; flags &= ~BCH_READ_MAY_PROMOTE; - if (flags & BCH_READ_NODECODE) - bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags); - else - bch2_read_retry(c, rbio, iter, inode, &failed, flags); + if (flags & BCH_READ_NODECODE) { + bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); + } else { + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; + + __bch2_read(c, rbio, iter, inum, &failed, flags); + } } static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, @@ -1710,9 +2303,9 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, struct bch_read_bio *rbio) { struct bch_fs *c = rbio->c; - u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; + u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; struct bch_extent_crc_unpacked new_crc; - struct btree_iter *iter = NULL; + struct btree_iter iter; struct bkey_i *new; struct bkey_s_c k; int ret = 0; @@ -1720,26 +2313,12 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, if (crc_is_compressed(rbio->pick.crc)) return 0; - iter = bch2_trans_get_iter(trans, BTREE_ID_EXTENTS, rbio->pos, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if ((ret = PTR_ERR_OR_ZERO(iter))) - goto out; - - k = bch2_btree_iter_peek_slot(iter); + bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->data_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); if ((ret = bkey_err(k))) goto out; - /* - * going to be temporarily appending another checksum entry: - */ - new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + - BKEY_EXTENT_U64s_MAX * 8); - if ((ret = PTR_ERR_OR_ZERO(new))) - goto out; - - bkey_reassemble(new, k); - k = bkey_i_to_s_c(new); - if (bversion_cmp(k.k->version, rbio->version) || !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) goto out; @@ -1758,12 +2337,23 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, goto out; } + /* + * going to be temporarily appending another checksum entry: + */ + new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + + sizeof(struct bch_extent_crc128)); + if ((ret = PTR_ERR_OR_ZERO(new))) + goto out; + + bkey_reassemble(new, k); + if (!bch2_bkey_narrow_crcs(new, new_crc)) goto out; - bch2_trans_update(trans, iter, new, 0); + ret = bch2_trans_update(trans, &iter, new, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); out: - bch2_trans_iter_put(trans, iter); + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1785,7 +2375,11 @@ static void __bch2_read_endio(struct work_struct *work) struct bvec_iter dst_iter = rbio->bvec_iter; struct bch_extent_crc_unpacked crc = rbio->pick.crc; struct nonce nonce = extent_nonce(rbio->version, crc); + unsigned nofs_flags; struct bch_csum csum; + int ret; + + nofs_flags = memalloc_nofs_save(); /* Reset iterator for checksumming and copying bounced data: */ if (rbio->bounce) { @@ -1797,9 +2391,16 @@ static void 
 	}
 
 	csum = bch2_checksum_bio(c, crc.csum_type, nonce, src);
-	if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
+	if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io)
 		goto csum_err;
 
+	/*
+	 * XXX
+	 * We need to rework the narrow_crcs path to deliver the read completion
+	 * first, and then punt to a different workqueue, otherwise we're
+	 * holding up reads while doing btree updates which is bad for memory
+	 * reclaim.
+	 */
 	if (unlikely(rbio->narrow_crcs))
 		bch2_rbio_narrow_crcs(rbio);
 
@@ -1811,7 +2412,10 @@ static void __bch2_read_endio(struct work_struct *work)
 	crc.live_size = bvec_iter_sectors(rbio->bvec_iter);
 
 	if (crc_is_compressed(crc)) {
-		bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+		if (ret)
+			goto decrypt_err;
+
 		if (bch2_bio_uncompress(c, src, dst, dst_iter, crc))
 			goto decompression_err;
 	} else {
@@ -1822,7 +2426,9 @@ static void __bch2_read_endio(struct work_struct *work)
 		BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size);
 		src->bi_iter.bi_size = dst_iter.bi_size;
 
-		bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+		if (ret)
+			goto decrypt_err;
 
 		if (rbio->bounce) {
 			struct bvec_iter src_iter = src->bi_iter;
@@ -1835,7 +2441,10 @@ static void __bch2_read_endio(struct work_struct *work)
 		 * Re encrypt data we decrypted, so it's consistent with
 		 * rbio->crc:
 		 */
-		bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+		ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src);
+		if (ret)
+			goto decrypt_err;
+
 		promote_start(rbio->promote, rbio);
 		rbio->promote = NULL;
 	}
@@ -1844,6 +2453,8 @@ nodecode:
 		rbio = bch2_rbio_free(rbio);
 		bch2_rbio_done(rbio);
 	}
+out:
+	memalloc_nofs_restore(nofs_flags);
 	return;
 csum_err:
 	/*
@@ -1854,22 +2465,30 @@ csum_err:
 	if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
 		rbio->flags |= BCH_READ_MUST_BOUNCE;
 		bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
-		return;
+		goto out;
 	}
 
-	bch2_dev_io_error(ca,
-		"data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)",
-		rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector,
+	bch_err_inum_offset_ratelimited(ca,
+		rbio->read_pos.inode,
+		rbio->read_pos.offset << 9,
+		"data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)",
 		rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
-		csum.hi, csum.lo, crc.csum_type);
+		csum.hi, csum.lo, bch2_csum_types[crc.csum_type]);
+	bch2_io_error(ca);
 	bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
-	return;
+	goto out;
 decompression_err:
-	__bcache_io_error(c, "decompression error, inode %llu offset %llu",
-			  rbio->pos.inode,
-			  (u64) rbio->bvec_iter.bi_sector);
+	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
+					rbio->read_pos.offset << 9,
+					"decompression error");
 	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
-	return;
+	goto out;
+decrypt_err:
+	bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode,
+					rbio->read_pos.offset << 9,
+					"decrypt error");
+	bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
+	goto out;
 }
 
 static void bch2_read_endio(struct bio *bio)
@@ -1889,15 +2508,18 @@ static void bch2_read_endio(struct bio *bio)
 	if (!rbio->split)
 		rbio->bio.bi_end_io = rbio->end_io;
 
-	if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) {
+	if (bch2_dev_inum_io_err_on(bio->bi_status, ca,
+				    rbio->read_pos.inode,
+				    rbio->read_pos.offset,
+				    "data read error: %s",
+				    bch2_blk_status_to_str(bio->bi_status))) {
 		bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
 		return;
 	}
 
-	if (rbio->pick.ptr.cached &&
-	    (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
-	     ptr_stale(ca, &rbio->pick.ptr))) {
-		atomic_long_inc(&c->read_realloc_races);
+	if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) ||
+	    ptr_stale(ca, &rbio->pick.ptr)) {
+		trace_and_count(c, read_reuse_race, &rbio->bio);
 
 		if (rbio->flags & BCH_READ_RETRY_IF_STALE)
 			bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN);
@@ -1907,6 +2529,7 @@ static void bch2_read_endio(struct bio *bio)
 	}
 
 	if (rbio->narrow_crcs ||
+	    rbio->promote ||
 	    crc_is_compressed(rbio->pick.crc) ||
 	    bch2_csum_type_is_encryption(rbio->pick.crc.csum_type))
 		context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq;
@@ -1918,68 +2541,107 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
 				unsigned *offset_into_extent,
-				struct bkey_i *orig_k)
+				struct bkey_buf *orig_k)
 {
-	struct btree_iter *iter;
+	struct btree_iter iter;
 	struct bkey_s_c k;
 	u64 reflink_offset;
 	int ret;
 
-	reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) +
+	reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
 		*offset_into_extent;
 
-	iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK,
-				   POS(0, reflink_offset),
-				   BTREE_ITER_SLOTS);
-	ret = PTR_ERR_OR_ZERO(iter);
-	if (ret)
-		return ret;
-
-	k = bch2_btree_iter_peek_slot(iter);
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink,
+			     POS(0, reflink_offset),
+			     BTREE_ITER_SLOTS);
+	k = bch2_btree_iter_peek_slot(&iter);
 	ret = bkey_err(k);
 	if (ret)
 		goto err;
 
-	if (k.k->type != KEY_TYPE_reflink_v) {
-		__bcache_io_error(trans->c,
-				"pointer to nonexistent indirect extent");
+	if (k.k->type != KEY_TYPE_reflink_v &&
+	    k.k->type != KEY_TYPE_indirect_inline_data) {
+		bch_err_inum_offset_ratelimited(trans->c,
+			orig_k->k->k.p.inode,
+			orig_k->k->k.p.offset << 9,
+			"%llu len %u points to nonexistent indirect extent %llu",
+			orig_k->k->k.p.offset,
+			orig_k->k->k.size,
+			reflink_offset);
+		bch2_inconsistent_error(trans->c);
 		ret = -EIO;
 		goto err;
 	}
 
-	*offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
-	bkey_reassemble(orig_k, k);
+	*offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
+	bch2_bkey_buf_reassemble(orig_k, trans->c, k);
 err:
-	bch2_trans_iter_put(trans, iter);
+	bch2_trans_iter_exit(trans, &iter);
 	return ret;
 }
 
-int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
-		       struct bvec_iter iter, struct bkey_s_c k,
+static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
+						   struct bkey_s_c k,
+						   struct bch_extent_ptr ptr)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev);
+	struct btree_iter iter;
+	struct printbuf buf = PRINTBUF;
+	int ret;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
+			     PTR_BUCKET_POS(c, &ptr),
+			     BTREE_ITER_CACHED);
+
+	prt_printf(&buf, "Attempting to read from stale dirty pointer:");
+	printbuf_indent_add(&buf, 2);
+	prt_newline(&buf);
+
+	bch2_bkey_val_to_text(&buf, c, k);
+	prt_newline(&buf);
+
+	prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset));
+
+	ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+	if (!ret) {
+		prt_newline(&buf);
+		bch2_bkey_val_to_text(&buf, c, k);
+	}
+
+	bch2_fs_inconsistent(c, "%s", buf.buf);
+
+	bch2_trans_iter_exit(trans, &iter);
+	printbuf_exit(&buf);
+}
+
+int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
+		       struct bvec_iter iter, struct bpos read_pos,
+		       enum btree_id data_btree, struct bkey_s_c k,
 		       unsigned offset_into_extent,
 		       struct bch_io_failures *failed, unsigned flags)
 {
+	struct bch_fs *c = trans->c;
 	struct extent_ptr_decoded pick;
 	struct bch_read_bio *rbio = NULL;
-	struct bch_dev *ca;
+	struct bch_dev *ca = NULL;
 	struct promote_op *promote = NULL;
 	bool bounce = false, read_full = false, narrow_crcs = false;
-	struct bpos pos = bkey_start_pos(k.k);
+	struct bpos data_pos = bkey_start_pos(k.k);
 	int pick_ret;
 
-	if (k.k->type == KEY_TYPE_inline_data) {
-		struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
+	if (bkey_extent_is_inline_data(k.k)) {
 		unsigned bytes = min_t(unsigned, iter.bi_size,
-				       bkey_val_bytes(d.k));
+				       bkey_inline_data_bytes(k.k));
 
 		swap(iter.bi_size, bytes);
-		memcpy_to_bio(&orig->bio, iter, d.v->data);
+		memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k));
 		swap(iter.bi_size, bytes);
 		bio_advance_iter(&orig->bio, &iter, bytes);
 		zero_fill_bio_iter(&orig->bio, iter);
 		goto out_read_done;
 	}
-
+retry_pick:
 	pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
 
 	/* hole or reservation - just zero fill: */
@@ -1987,12 +2649,33 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
 		goto hole;
 
 	if (pick_ret < 0) {
-		__bcache_io_error(c, "no device to read from");
+		bch_err_inum_offset_ratelimited(c,
+				read_pos.inode, read_pos.offset << 9,
+				"no device to read from");
 		goto err;
 	}
 
-	if (pick_ret > 0)
-		ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+	ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+
+	/*
+	 * Stale dirty pointers are treated as IO errors, but @failed isn't
+	 * allocated unless we're in the retry path - so if we're not in the
+	 * retry path, don't check here, it'll be caught in bch2_read_endio()
+	 * and we'll end up in the retry path:
+	 */
+	if ((flags & BCH_READ_IN_RETRY) &&
+	    !pick.ptr.cached &&
+	    unlikely(ptr_stale(ca, &pick.ptr))) {
+		read_from_stale_dirty_pointer(trans, k, pick.ptr);
+		bch2_mark_io_failure(failed, &pick);
+		goto retry_pick;
+	}
+
+	/*
+	 * Unlock the iterator while the btree node's lock is still in
+	 * cache, before doing the IO:
+	 */
+	bch2_trans_unlock(trans);
 
 	if (flags & BCH_READ_NODECODE) {
 		/*
@@ -2019,7 +2702,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
 	EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size);
 
 	if (crc_is_compressed(pick.crc) ||
-	    (pick.crc.csum_type != BCH_CSUM_NONE &&
+	    (pick.crc.csum_type != BCH_CSUM_none &&
 	     (bvec_iter_sectors(iter) != pick.crc.uncompressed_size ||
 	      (bch2_csum_type_is_encryption(pick.crc.csum_type) &&
 	       (flags & BCH_READ_USER_MAPPED)) ||
@@ -2029,7 +2712,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
 	}
 
 	if (orig->opts.promote_target)
-		promote = promote_alloc(c, iter, k, &pick, orig->opts, flags,
+		promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags,
 					&rbio, &bounce, &read_full);
 
 	if (!read_full) {
@@ -2040,7 +2723,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
 		       pick.crc.offset ||
 		       offset_into_extent));
 
-		pos.offset += offset_into_extent;
+		data_pos.offset += offset_into_extent;
 		pick.ptr.offset += pick.crc.offset +
 			offset_into_extent;
 		offset_into_extent = 0;
@@ -2065,8 +2748,10 @@ get_bio:
 	} else if (bounce) {
 		unsigned sectors = pick.crc.compressed_size;
 
-		rbio = rbio_init(bio_alloc_bioset(GFP_NOIO,
+		rbio = rbio_init(bio_alloc_bioset(NULL,
 					DIV_ROUND_UP(sectors, PAGE_SECTORS),
+					0,
+					GFP_NOIO,
 					&c->bio_read_split),
 				 orig->opts);
 
@@ -2082,8 +2767,8 @@ get_bio:
 		 * from the whole bio, in which case we don't want to retry and
 		 * lose the error)
 		 */
-		rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO,
-						&c->bio_read_split),
+		rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOIO,
+						 &c->bio_read_split),
 				 orig->opts);
 		rbio->bio.bi_iter = iter;
 		rbio->split = true;
@@ -2112,7 +2797,10 @@ get_bio:
 	/* XXX: only initialize this if needed */
 	rbio->devs_have = bch2_bkey_devs(k);
 	rbio->pick = pick;
-	rbio->pos = pos;
+	rbio->subvol = orig->subvol;
+	rbio->read_pos = read_pos;
+	rbio->data_btree = data_btree;
+	rbio->data_pos = data_pos;
 	rbio->version = k.k->version;
 	rbio->promote = promote;
 	INIT_WORK(&rbio->work, NULL);
@@ -2122,34 +2810,53 @@ get_bio:
 	rbio->bio.bi_end_io = bch2_read_endio;
 
 	if (rbio->bounce)
-		trace_read_bounce(&rbio->bio);
+		trace_and_count(c, read_bounce, &rbio->bio);
 
+	this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio));
 	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
 
-	rcu_read_lock();
-	bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
-	rcu_read_unlock();
+	/*
+	 * If it's being moved internally, we don't want to flag it as a cache
+	 * hit:
	 */
+	if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
+		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
+			PTR_BUCKET_NR(ca, &pick.ptr), READ);
 
 	if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
 		bio_inc_remaining(&orig->bio);
-		trace_read_split(&orig->bio);
+		trace_and_count(c, read_split, &orig->bio);
 	}
 
 	if (!rbio->pick.idx) {
 		if (!rbio->have_ioref) {
-			__bcache_io_error(c, "no device to read from");
+			bch_err_inum_offset_ratelimited(c,
+					read_pos.inode,
+					read_pos.offset << 9,
+					"no device to read from");
 			bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
 			goto out;
 		}
 
-		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
+		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
 			     bio_sectors(&rbio->bio));
 		bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
 
-		if (likely(!(flags & BCH_READ_IN_RETRY)))
-			submit_bio(&rbio->bio);
-		else
-			submit_bio_wait(&rbio->bio);
+		if (unlikely(c->opts.no_data_io)) {
+			if (likely(!(flags & BCH_READ_IN_RETRY)))
+				bio_endio(&rbio->bio);
+		} else {
+			if (likely(!(flags & BCH_READ_IN_RETRY)))
+				submit_bio(&rbio->bio);
+			else
+				submit_bio_wait(&rbio->bio);
+		}
+
+		/*
+		 * We just submitted IO which may block, we expect relock fail
+		 * events and shouldn't count them:
+		 */
+		trans->notrace_relock_fail = true;
 	} else {
 		/* Attempting reconstruct read: */
 		if (bch2_ec_read_extent(c, rbio)) {
@@ -2177,6 +2884,9 @@ out:
 			ret = READ_RETRY;
 		}
 
+		if (!ret)
+			goto out_read_done;
+
 		return ret;
 	}
 
@@ -2203,54 +2913,64 @@ out_read_done:
 	return 0;
 }
 
-void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
+void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
+		 struct bvec_iter bvec_iter, subvol_inum inum,
+		 struct bch_io_failures *failed, unsigned flags)
 {
 	struct btree_trans trans;
-	struct btree_iter *iter;
-	struct bkey_on_stack sk;
+	struct btree_iter iter;
+	struct bkey_buf sk;
 	struct bkey_s_c k;
-	unsigned flags = BCH_READ_RETRY_IF_STALE|
-		BCH_READ_MAY_PROMOTE|
-		BCH_READ_USER_MAPPED;
+	u32 snapshot;
 	int ret;
 
-	BUG_ON(rbio->_state);
 	BUG_ON(flags & BCH_READ_NODECODE);
-	BUG_ON(flags & BCH_READ_IN_RETRY);
 
-	rbio->c = c;
-	rbio->start_time = local_clock();
-
-	bkey_on_stack_init(&sk);
+	bch2_bkey_buf_init(&sk);
 	bch2_trans_init(&trans, c, 0, 0);
retry:
 	bch2_trans_begin(&trans);
+	iter = (struct btree_iter) { NULL };
+
+	ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+	if (ret)
+		goto err;
 
-	iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
-				   POS(inode, rbio->bio.bi_iter.bi_sector),
-				   BTREE_ITER_SLOTS);
+	bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+			     SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
+			     BTREE_ITER_SLOTS);
 
 	while (1) {
 		unsigned bytes, sectors, offset_into_extent;
+		enum btree_id data_btree = BTREE_ID_extents;
 
-		bch2_btree_iter_set_pos(iter,
-				POS(inode, rbio->bio.bi_iter.bi_sector));
+		/*
+		 * read_extent -> io_time_reset may cause a transaction restart
+		 * without returning an error, we need to check for that here:
+		 */
+		ret = bch2_trans_relock(&trans);
+		if (ret)
+			break;
 
-		k = bch2_btree_iter_peek_slot(iter);
+		bch2_btree_iter_set_pos(&iter,
+				POS(inum.inum, bvec_iter.bi_sector));
+
+		k = bch2_btree_iter_peek_slot(&iter);
 		ret = bkey_err(k);
 		if (ret)
-			goto err;
+			break;
 
-		offset_into_extent = iter->pos.offset -
+		offset_into_extent = iter.pos.offset -
 			bkey_start_offset(k.k);
 		sectors = k.k->size - offset_into_extent;
 
-		bkey_on_stack_reassemble(&sk, c, k);
-		k = bkey_i_to_s_c(sk.k);
+		bch2_bkey_buf_reassemble(&sk, c, k);
 
-		ret = bch2_read_indirect_extent(&trans,
-				&offset_into_extent, sk.k);
+		ret = bch2_read_indirect_extent(&trans, &data_btree,
+					&offset_into_extent, &sk);
 		if (ret)
-			goto err;
+			break;
+
+		k = bkey_i_to_s_c(sk.k);
 
 		/*
 		 * With indirect extents, the amount of data to read is the min
@@ -2258,37 +2978,46 @@ retry:
 		 */
 		sectors = min(sectors, k.k->size - offset_into_extent);
 
-		/*
-		 * Unlock the iterator while the btree node's lock is still in
-		 * cache, before doing the IO:
-		 */
-		bch2_trans_unlock(&trans);
-
-		bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
-		swap(rbio->bio.bi_iter.bi_size, bytes);
+		bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
+		swap(bvec_iter.bi_size, bytes);
 
-		if (rbio->bio.bi_iter.bi_size == bytes)
+		if (bvec_iter.bi_size == bytes)
 			flags |= BCH_READ_LAST_FRAGMENT;
 
-		bch2_read_extent(c, rbio, k, offset_into_extent, flags);
+		ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos,
+					 data_btree, k,
+					 offset_into_extent, failed, flags);
+		if (ret)
+			break;
 
 		if (flags & BCH_READ_LAST_FRAGMENT)
 			break;
 
-		swap(rbio->bio.bi_iter.bi_size, bytes);
-		bio_advance(&rbio->bio, bytes);
+		swap(bvec_iter.bi_size, bytes);
+		bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
+
+		ret = btree_trans_too_many_iters(&trans);
+		if (ret)
+			break;
 	}
-out:
-	bch2_trans_exit(&trans);
-	bkey_on_stack_exit(&sk, c);
-	return;
 err:
-	if (ret == -EINTR)
+	bch2_trans_iter_exit(&trans, &iter);
+
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
+	    ret == READ_RETRY ||
+	    ret == READ_RETRY_AVOID)
 		goto retry;
 
-	bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret);
-	bch2_rbio_done(rbio);
-	goto out;
+	bch2_trans_exit(&trans);
+	bch2_bkey_buf_exit(&sk, c);
+
+	if (ret) {
+		bch_err_inum_offset_ratelimited(c, inum.inum,
+				bvec_iter.bi_sector << 9,
+				"read error %i from btree lookup", ret);
+		rbio->bio.bi_status = BLK_STS_IOERR;
+		bch2_rbio_done(rbio);
+	}
 }
 
 void bch2_fs_io_exit(struct bch_fs *c)
@@ -2304,18 +3033,26 @@ void bch2_fs_io_exit(struct bch_fs *c)
 
 int bch2_fs_io_init(struct bch_fs *c)
 {
 	if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio),
-			BIOSET_NEED_BVECS) ||
-	    bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
-			BIOSET_NEED_BVECS) ||
-	    bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
-			BIOSET_NEED_BVECS) ||
-	    mempool_init_page_pool(&c->bio_bounce_pages,
+			BIOSET_NEED_BVECS))
+		return -BCH_ERR_ENOMEM_bio_read_init;
+
+	if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio),
+			BIOSET_NEED_BVECS))
+		return -BCH_ERR_ENOMEM_bio_read_split_init;
+
+	if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio),
+			BIOSET_NEED_BVECS))
+		return -BCH_ERR_ENOMEM_bio_write_init;
+
+	if (mempool_init_page_pool(&c->bio_bounce_pages,
 				   max_t(unsigned, c->opts.btree_node_size,
-					  c->sb.encoded_extent_max) /
-				   PAGE_SECTORS, 0) ||
-	    rhashtable_init(&c->promote_table, &bch_promote_params))
-		return -ENOMEM;
+					  c->opts.encoded_extent_max) /
+				   PAGE_SIZE, 0))
+		return -BCH_ERR_ENOMEM_bio_bounce_pages_init;
+
+	if (rhashtable_init(&c->promote_table, &bch_promote_params))
+		return -BCH_ERR_ENOMEM_promote_table_init;
 
 	return 0;
 }