X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fio.c;h=0bc72d2a4dd4cf09bc6b11b9e22a8afcb43977f8;hb=49ba8d0ef6133487559bdc73f2afc87fbea85fe0;hp=9b6aece794f2c9cf3ec74fe633a69ac4a385696f;hpb=09021c38d3ffd53c54664a5cbabc3c6a37c46960;p=bcachefs-tools-debian

diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 9b6aece..0bc72d2 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -27,6 +27,7 @@
 #include "keylist.h"
 #include "move.h"
 #include "rebalance.h"
+#include "subvolume.h"
 #include "super.h"
 #include "super-io.h"
 
@@ -192,7 +193,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
                                s64 *disk_sectors_delta)
 {
         struct bch_fs *c = trans->c;
-        struct btree_iter *iter;
+        struct btree_iter iter;
         struct bkey_s_c old;
         unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
         bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
@@ -203,7 +204,7 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
         *i_sectors_delta = 0;
         *disk_sectors_delta = 0;
 
-        iter = bch2_trans_copy_iter(trans, extent_iter);
+        bch2_trans_copy_iter(&iter, extent_iter);
 
         for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) {
                 s64 sectors = min(new->k.p.offset, old.k->p.offset) -
@@ -220,7 +221,8 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
                         : 0;
 
                 if (!*usage_increasing &&
-                    (new_replicas > bch2_bkey_replicas(c, old) ||
+                    (new->k.p.snapshot != old.k->p.snapshot ||
+                     new_replicas > bch2_bkey_replicas(c, old) ||
                      (!new_compressed && bch2_bkey_sectors_compressed(old))))
                         *usage_increasing = true;
 
@@ -235,8 +237,12 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
                  * writing to, because i_size could be up to one block
                  * less:
                  */
-                if (!bkey_cmp(old.k->p, new->k.p))
-                        old = bch2_btree_iter_next(iter);
+                if (!bkey_cmp(old.k->p, new->k.p)) {
+                        old = bch2_btree_iter_next(&iter);
+                        ret = bkey_err(old);
+                        if (ret)
+                                break;
+                }
 
                 if (old.k && !bkey_err(old) &&
                     old.k->p.inode == extent_iter->pos.inode &&
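
A change that recurs through the rest of this patch: btree iterators stop being transaction-allocated pointers (bch2_trans_get_iter()/bch2_trans_iter_put()) and become caller-owned objects on the stack, set up with bch2_trans_iter_init() or, as above, bch2_trans_copy_iter(), and torn down with bch2_trans_iter_exit(). A minimal sketch of the new lifetime, with the btree ID, position, and flags as placeholders:

        struct btree_trans trans;
        struct btree_iter iter;                 /* caller-owned, lives on the stack */
        struct bkey_s_c k;

        bch2_trans_init(&trans, c, 0, 0);
        bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, pos,
                             BTREE_ITER_SLOTS);

        k = bch2_btree_iter_peek_slot(&iter);   /* ... use the iterator ... */

        bch2_trans_iter_exit(&trans, &iter);    /* replaces bch2_trans_iter_put() */
        bch2_trans_exit(&trans);
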
@@ -247,11 +253,12 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans,
                 }
         }
 
-        bch2_trans_iter_put(trans, iter);
+        bch2_trans_iter_exit(trans, &iter);
         return ret;
 }
 
 int bch2_extent_update(struct btree_trans *trans,
+                       subvol_inum inum,
                        struct btree_iter *iter,
                        struct bkey_i *k,
                        struct disk_reservation *disk_res,
@@ -262,11 +269,22 @@ int bch2_extent_update(struct btree_trans *trans,
 {
         /* this must live until after bch2_trans_commit(): */
         struct bkey_inode_buf inode_p;
+        struct bpos next_pos;
         bool extending = false, usage_increasing;
         s64 i_sectors_delta = 0, disk_sectors_delta = 0;
         int ret;
 
-        ret = bch2_extent_trim_atomic(k, iter);
+        /*
+         * This traverses us the iterator without changing iter->path->pos to
+         * search_key() (which is pos + 1 for extents): we want there to be a
+         * path already traversed at iter->pos because
+         * bch2_trans_extent_update() will use it to attempt extent merging
+         */
+        ret = __bch2_btree_iter_traverse(iter);
+        if (ret)
+                return ret;
+
+        ret = bch2_extent_trim_atomic(trans, iter, k);
         if (ret)
                 return ret;
 
@@ -296,13 +314,13 @@ int bch2_extent_update(struct btree_trans *trans,
                 : 0;
 
         if (i_sectors_delta || new_i_size) {
-                struct btree_iter *inode_iter;
+                struct btree_iter inode_iter;
                 struct bch_inode_unpacked inode_u;
 
-                inode_iter = bch2_inode_peek(trans, &inode_u,
-                                k->k.p.inode, BTREE_ITER_INTENT);
-                if (IS_ERR(inode_iter))
-                        return PTR_ERR(inode_iter);
+                ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum,
+                                      BTREE_ITER_INTENT);
+                if (ret)
+                        return ret;
 
                 /*
                  * XXX:
@@ -329,13 +347,18 @@ int bch2_extent_update(struct btree_trans *trans,
                         inode_p.inode.k.p.snapshot = iter->snapshot;
 
-                        bch2_trans_update(trans, inode_iter,
+                        ret = bch2_trans_update(trans, &inode_iter,
                                           &inode_p.inode.k_i, 0);
                 }
 
-                bch2_trans_iter_put(trans, inode_iter);
+                bch2_trans_iter_exit(trans, &inode_iter);
+
+                if (ret)
+                        return ret;
         }
 
+        next_pos = k->k.p;
+
         ret   = bch2_trans_update(trans, iter, k, 0) ?:
                 bch2_trans_commit(trans, disk_res, journal_seq,
                                 BTREE_INSERT_NOCHECK_RW|
@@ -344,28 +367,44 @@ int bch2_extent_update(struct btree_trans *trans,
         if (ret)
                 return ret;
 
+        bch2_btree_iter_set_pos(iter, next_pos);
+
         if (i_sectors_delta_total)
                 *i_sectors_delta_total += i_sectors_delta;
         return 0;
 }
 
+/*
+ * Returns -EINTR if we had to drop locks:
+ */
 int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
-                   struct bpos end, u64 *journal_seq,
-                   s64 *i_sectors_delta)
+                   subvol_inum inum, u64 end,
+                   u64 *journal_seq, s64 *i_sectors_delta)
 {
         struct bch_fs *c = trans->c;
         unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits);
+        struct bpos end_pos = POS(inum.inum, end);
         struct bkey_s_c k;
         int ret = 0, ret2 = 0;
+        u32 snapshot;
 
-        while ((k = bch2_btree_iter_peek(iter)).k &&
-               bkey_cmp(iter->pos, end) < 0) {
+        while (1) {
                 struct disk_reservation disk_res =
                         bch2_disk_reservation_init(c, 0);
                 struct bkey_i delete;
 
                 bch2_trans_begin(trans);
 
+                ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
+                if (ret)
+                        goto btree_err;
+
+                bch2_btree_iter_set_snapshot(iter, snapshot);
+
+                k = bch2_btree_iter_peek(iter);
+                if (bkey_cmp(iter->pos, end_pos) >= 0)
+                        break;
+
                 ret = bkey_err(k);
                 if (ret)
                         goto btree_err;
@@ -375,9 +414,9 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
 
                 /* create the biggest key we can */
                 bch2_key_resize(&delete.k, max_sectors);
-                bch2_cut_back(end, &delete);
+                bch2_cut_back(end_pos, &delete);
 
-                ret = bch2_extent_update(trans, iter, &delete,
+                ret = bch2_extent_update(trans, inum, iter, &delete,
                                 &disk_res, journal_seq,
                                 0, i_sectors_delta, false);
                 bch2_disk_reservation_put(c, &disk_res);
@@ -390,36 +429,31 @@ btree_err:
                         break;
         }
 
-        if (bkey_cmp(iter->pos, end) > 0) {
-                bch2_btree_iter_set_pos(iter, end);
-                ret = bch2_btree_iter_traverse(iter);
-        }
+        if (bkey_cmp(iter->pos, end_pos) > 0)
+                bch2_btree_iter_set_pos(iter, end_pos);
 
         return ret ?: ret2;
 }
 
-int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end,
+int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end,
                 u64 *journal_seq, s64 *i_sectors_delta)
 {
         struct btree_trans trans;
-        struct btree_iter *iter;
-        int ret = 0;
+        struct btree_iter iter;
+        int ret;
 
         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
-        iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
-                                   POS(inum, start),
-                                   BTREE_ITER_INTENT);
+        bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+                             POS(inum.inum, start),
+                             BTREE_ITER_INTENT);
 
-        ret = bch2_fpunch_at(&trans, iter, POS(inum, end),
+        ret = bch2_fpunch_at(&trans, &iter, inum, end,
                              journal_seq, i_sectors_delta);
 
-        bch2_trans_iter_put(&trans, iter);
+        bch2_trans_iter_exit(&trans, &iter);
         bch2_trans_exit(&trans);
 
-        if (ret == -EINTR)
-                ret = 0;
-
-        return ret;
+        return ret == -EINTR ? 0 : ret;
 }
 
 int bch2_write_index_default(struct bch_write_op *op)
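
With files now named by subvol_inum instead of a bare u64 inode number, bch2_fpunch_at() re-resolves the subvolume's snapshot ID at the top of every transaction, and only the caller-facing wrapper swallows -EINTR. A hedged sketch of a caller; the subvolume and inode numbers are hypothetical (BCACHEFS_ROOT_SUBVOL is shown purely for illustration):

        /* sketch: punch sectors [start, end) out of one file */
        subvol_inum inum = {
                .subvol = BCACHEFS_ROOT_SUBVOL,         /* hypothetical */
                .inum   = 4096,                         /* hypothetical */
        };
        u64 journal_seq = 0;
        s64 i_sectors_delta = 0;

        int ret = bch2_fpunch(c, inum, start, end,
                              &journal_seq, &i_sectors_delta);
        /*
         * bch2_fpunch() maps bch2_fpunch_at()'s -EINTR to success;
         * other errors propagate to the caller.
         */
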
@@ -429,41 +463,52 @@ int bch2_write_index_default(struct bch_write_op *op)
         struct keylist *keys = &op->insert_keys;
         struct bkey_i *k = bch2_keylist_front(keys);
         struct btree_trans trans;
-        struct btree_iter *iter;
+        struct btree_iter iter;
+        subvol_inum inum = {
+                .subvol = op->subvol,
+                .inum   = k->k.p.inode,
+        };
         int ret;
 
+        BUG_ON(!inum.subvol);
+
         bch2_bkey_buf_init(&sk);
         bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
 
-        iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
-                                   bkey_start_pos(&k->k),
-                                   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
         do {
                 bch2_trans_begin(&trans);
 
                 k = bch2_keylist_front(keys);
+                bch2_bkey_buf_copy(&sk, c, k);
 
-                k->k.p.snapshot = iter->snapshot;
+                ret = bch2_subvolume_get_snapshot(&trans, inum.subvol,
+                                                  &sk.k->k.p.snapshot);
+                if (ret == -EINTR)
+                        continue;
+                if (ret)
+                        break;
 
-                bch2_bkey_buf_realloc(&sk, c, k->k.u64s);
-                bkey_copy(sk.k, k);
-                bch2_cut_front(iter->pos, sk.k);
+                bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+                                     bkey_start_pos(&sk.k->k),
+                                     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
-                ret = bch2_extent_update(&trans, iter, sk.k,
+                ret = bch2_extent_update(&trans, inum, &iter, sk.k,
                                          &op->res, op_journal_seq(op),
                                          op->new_i_size, &op->i_sectors_delta,
                                          op->flags & BCH_WRITE_CHECK_ENOSPC);
+                bch2_trans_iter_exit(&trans, &iter);
+
                 if (ret == -EINTR)
                         continue;
                 if (ret)
                         break;
 
-                if (bkey_cmp(iter->pos, k->k.p) >= 0)
-                        bch2_keylist_pop_front(keys);
+                if (bkey_cmp(iter.pos, k->k.p) >= 0)
+                        bch2_keylist_pop_front(&op->insert_keys);
+                else
+                        bch2_cut_front(iter.pos, k);
         } while (!bch2_keylist_empty(keys));
 
-        bch2_trans_iter_put(&trans, iter);
         bch2_trans_exit(&trans);
         bch2_bkey_buf_exit(&sk, c);
 
@@ -1439,7 +1484,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio)
         bch2_migrate_read_done(&op->write, rbio);
 
         closure_init(cl, NULL);
-        closure_call(&op->write.op.cl, bch2_write, c->wq, cl);
+        closure_call(&op->write.op.cl, bch2_write, c->btree_update_wq, cl);
         closure_return_with_destructor(cl, promote_done);
 }
 
@@ -1624,12 +1669,12 @@ static void bch2_rbio_done(struct bch_read_bio *rbio)
 }
 
 static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio,
-                                     struct bvec_iter bvec_iter, u64 inode,
+                                     struct bvec_iter bvec_iter,
                                      struct bch_io_failures *failed,
                                      unsigned flags)
 {
         struct btree_trans trans;
-        struct btree_iter *iter;
+        struct btree_iter iter;
         struct bkey_buf sk;
         struct bkey_s_c k;
         int ret;
@@ -1640,12 +1685,12 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio
         bch2_bkey_buf_init(&sk);
         bch2_trans_init(&trans, c, 0, 0);
 
-        iter = bch2_trans_get_iter(&trans, rbio->data_btree,
-                                   rbio->read_pos, BTREE_ITER_SLOTS);
+        bch2_trans_iter_init(&trans, &iter, rbio->data_btree,
+                             rbio->read_pos, BTREE_ITER_SLOTS);
 retry:
         rbio->bio.bi_status = 0;
 
-        k = bch2_btree_iter_peek_slot(iter);
+        k = bch2_btree_iter_peek_slot(&iter);
         if (bkey_err(k))
                 goto err;
 
@@ -1672,7 +1717,7 @@ retry:
                 goto err;
 out:
         bch2_rbio_done(rbio);
-        bch2_trans_iter_put(&trans, iter);
+        bch2_trans_iter_exit(&trans, &iter);
         bch2_trans_exit(&trans);
         bch2_bkey_buf_exit(&sk, c);
         return;
@@ -1688,7 +1733,10 @@ static void bch2_rbio_retry(struct work_struct *work)
         struct bch_fs *c = rbio->c;
         struct bvec_iter iter = rbio->bvec_iter;
         unsigned flags = rbio->flags;
-        u64 inode = rbio->read_pos.inode;
+        subvol_inum inum = {
+                .subvol = rbio->subvol,
+                .inum   = rbio->read_pos.inode,
+        };
         struct bch_io_failures failed = { .nr = 0 };
 
         trace_read_retry(&rbio->bio);
@@ -1704,12 +1752,12 @@ static void bch2_rbio_retry(struct work_struct *work)
         flags &= ~BCH_READ_MAY_PROMOTE;
 
         if (flags & BCH_READ_NODECODE) {
-                bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags);
+                bch2_read_retry_nodecode(c, rbio, iter, &failed, flags);
         } else {
                 flags &= ~BCH_READ_LAST_FRAGMENT;
                 flags |= BCH_READ_MUST_CLONE;
 
-                __bch2_read(c, rbio, iter, inode, &failed, flags);
+                __bch2_read(c, rbio, iter, inum, &failed, flags);
         }
 }
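
bch2_write_index_default() now redoes the subvolume-to-snapshot lookup inside the loop because each pass may be a fresh transaction; after a restart the previously looked-up snapshot ID is stale. The iterator is likewise created and destroyed per pass rather than once per keylist. The control flow, reduced to a skeleton (the extent update between iterator init and exit is elided):

        do {
                bch2_trans_begin(&trans);       /* begin, or restart, the transaction */

                /* snapshot ID may change across restarts: look it up every pass */
                ret = bch2_subvolume_get_snapshot(&trans, inum.subvol,
                                                  &sk.k->k.p.snapshot);
                if (ret == -EINTR)
                        continue;               /* transaction restart: retry */
                if (ret)
                        break;

                /* ... bch2_trans_iter_init(), bch2_extent_update(),
                 *     bch2_trans_iter_exit(), then pop or trim the front key ... */
        } while (!bch2_keylist_empty(keys));
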
@@ -1738,7 +1786,7 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
         struct bch_fs *c = rbio->c;
         u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset;
         struct bch_extent_crc_unpacked new_crc;
-        struct btree_iter *iter = NULL;
+        struct btree_iter iter;
         struct bkey_i *new;
         struct bkey_s_c k;
         int ret = 0;
@@ -1746,9 +1794,9 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
         if (crc_is_compressed(rbio->pick.crc))
                 return 0;
 
-        iter = bch2_trans_get_iter(trans, rbio->data_btree, rbio->data_pos,
-                                   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-        k = bch2_btree_iter_peek_slot(iter);
+        bch2_trans_iter_init(trans, &iter, rbio->data_btree, rbio->data_pos,
+                             BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+        k = bch2_btree_iter_peek_slot(&iter);
         if ((ret = bkey_err(k)))
                 goto out;
 
@@ -1783,9 +1831,10 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
         if (!bch2_bkey_narrow_crcs(new, new_crc))
                 goto out;
 
-        bch2_trans_update(trans, iter, new, 0);
+        ret = bch2_trans_update(trans, &iter, new,
+                                BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
 out:
-        bch2_trans_iter_put(trans, iter);
+        bch2_trans_iter_exit(trans, &iter);
         return ret;
 }
 
@@ -1807,8 +1856,11 @@ static void __bch2_read_endio(struct work_struct *work)
         struct bvec_iter dst_iter = rbio->bvec_iter;
         struct bch_extent_crc_unpacked crc = rbio->pick.crc;
         struct nonce nonce = extent_nonce(rbio->version, crc);
+        unsigned nofs_flags;
         struct bch_csum csum;
 
+        nofs_flags = memalloc_nofs_save();
+
         /* Reset iterator for checksumming and copying bounced data: */
         if (rbio->bounce) {
                 src->bi_iter.bi_size            = crc.compressed_size << 9;
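
memalloc_nofs_save()/memalloc_nofs_restore(), from <linux/sched/mm.h>, bracket the read completion as a no-GFP_FS scope: any allocation made while the read is being completed then cannot recurse into filesystem reclaim and deadlock against the IO path. The pairing the patch establishes, in miniature (the function name is hypothetical):

        #include <linux/sched/mm.h>

        static void read_endio_sketch(void)
        {
                unsigned nofs_flags = memalloc_nofs_save();     /* enter NOFS scope */

                /* checksum check, decrypt, decompress, narrow CRCs ... */

                memalloc_nofs_restore(nofs_flags);      /* must run on every exit path */
        }

This pairing is also why the error paths in the next hunks change return; to goto out;: the restore must never be skipped.
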
@@ -1822,6 +1874,13 @@ static void __bch2_read_endio(struct work_struct *work)
         if (bch2_crc_cmp(csum, rbio->pick.crc.csum))
                 goto csum_err;
 
+        /*
+         * XXX
+         * We need to rework the narrow_crcs path to deliver the read completion
+         * first, and then punt to a different workqueue, otherwise we're
+         * holding up reads while doing btree updates which is bad for memory
+         * reclaim.
+         */
         if (unlikely(rbio->narrow_crcs))
                 bch2_rbio_narrow_crcs(rbio);
 
@@ -1866,6 +1925,8 @@ nodecode:
                 rbio = bch2_rbio_free(rbio);
                 bch2_rbio_done(rbio);
         }
+out:
+        memalloc_nofs_restore(nofs_flags);
         return;
 csum_err:
         /*
@@ -1876,7 +1937,7 @@ csum_err:
         if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) {
                 rbio->flags |= BCH_READ_MUST_BOUNCE;
                 bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR);
-                return;
+                goto out;
         }
 
         bch2_dev_inum_io_error(ca, rbio->read_pos.inode, (u64) rbio->bvec_iter.bi_sector,
@@ -1884,12 +1945,12 @@ csum_err:
                 rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
                 csum.hi, csum.lo, crc.csum_type);
         bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
-        return;
+        goto out;
 decompression_err:
         bch_err_inum_ratelimited(c, rbio->read_pos.inode,
                                  "decompression error");
         bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR);
-        return;
+        goto out;
 }
 
 static void bch2_read_endio(struct bio *bio)
@@ -1944,7 +2005,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
                                 unsigned *offset_into_extent,
                                 struct bkey_buf *orig_k)
 {
-        struct btree_iter *iter;
+        struct btree_iter iter;
         struct bkey_s_c k;
         u64 reflink_offset;
         int ret;
@@ -1952,10 +2013,10 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
         reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) +
                 *offset_into_extent;
 
-        iter = bch2_trans_get_iter(trans, BTREE_ID_reflink,
-                                   POS(0, reflink_offset),
-                                   BTREE_ITER_SLOTS);
-        k = bch2_btree_iter_peek_slot(iter);
+        bch2_trans_iter_init(trans, &iter, BTREE_ID_reflink,
+                             POS(0, reflink_offset),
+                             BTREE_ITER_SLOTS);
+        k = bch2_btree_iter_peek_slot(&iter);
         ret = bkey_err(k);
         if (ret)
                 goto err;
 
@@ -1972,10 +2033,10 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
                 goto err;
         }
 
-        *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
+        *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k);
         bch2_bkey_buf_reassemble(orig_k, trans->c, k);
 err:
-        bch2_trans_iter_put(trans, iter);
+        bch2_trans_iter_exit(trans, &iter);
         return ret;
 }
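
__bch2_read_indirect_extent() resolves a reflink pointer by looking the data up in the reflink btree, after which the caller reads through the indirect extent it got back. A hedged sketch of the call-site shape, following the read loop in __bch2_read below; the surrounding variables belong to that loop, and the exact caller is not part of this patch:

        /* sketch: resolve a reflink pointer before reading the data behind it */
        if (k.k->type == KEY_TYPE_reflink_p) {
                data_btree = BTREE_ID_reflink;
                ret = __bch2_read_indirect_extent(&trans, &offset_into_extent, &sk);
                if (ret)
                        goto err;
                k = bkey_i_to_s_c(sk.k);        /* continue with the indirect extent */
        }
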
@@ -2139,6 +2200,7 @@ get_bio:
         /* XXX: only initialize this if needed */
         rbio->devs_have         = bch2_bkey_devs(k);
         rbio->pick              = pick;
+        rbio->subvol            = orig->subvol;
         rbio->read_pos          = read_pos;
         rbio->data_btree        = data_btree;
         rbio->data_pos          = data_pos;
@@ -2241,13 +2303,14 @@ out_read_done:
 }
 
 void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
-                 struct bvec_iter bvec_iter, u64 inode,
+                 struct bvec_iter bvec_iter, subvol_inum inum,
                  struct bch_io_failures *failed, unsigned flags)
 {
         struct btree_trans trans;
-        struct btree_iter *iter;
+        struct btree_iter iter;
         struct bkey_buf sk;
         struct bkey_s_c k;
+        u32 snapshot;
         int ret;
 
         BUG_ON(flags & BCH_READ_NODECODE);
@@ -2256,23 +2319,37 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio,
         bch2_trans_init(&trans, c, 0, 0);
 retry:
         bch2_trans_begin(&trans);
+        iter = (struct btree_iter) { NULL };
+
+        ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot);
+        if (ret)
+                goto err;
 
-        iter = bch2_trans_get_iter(&trans, BTREE_ID_extents,
-                                   POS(inode, bvec_iter.bi_sector),
-                                   BTREE_ITER_SLOTS);
+        bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents,
+                             SPOS(inum.inum, bvec_iter.bi_sector, snapshot),
+                             BTREE_ITER_SLOTS|BTREE_ITER_FILTER_SNAPSHOTS);
+
         while (1) {
                 unsigned bytes, sectors, offset_into_extent;
                 enum btree_id data_btree = BTREE_ID_extents;
 
-                bch2_btree_iter_set_pos(iter,
-                                POS(inode, bvec_iter.bi_sector));
+                /*
+                 * read_extent -> io_time_reset may cause a transaction restart
+                 * without returning an error, we need to check for that here:
+                 */
+                if (!bch2_trans_relock(&trans)) {
+                        ret = -EINTR;
+                        break;
+                }
 
-                k = bch2_btree_iter_peek_slot(iter);
+                bch2_btree_iter_set_pos(&iter,
+                                POS(inum.inum, bvec_iter.bi_sector));
+
+                k = bch2_btree_iter_peek_slot(&iter);
                 ret = bkey_err(k);
                 if (ret)
                         break;
 
-                offset_into_extent = iter->pos.offset -
+                offset_into_extent = iter.pos.offset -
                         bkey_start_offset(k.k);
                 sectors = k.k->size - offset_into_extent;
@@ -2303,7 +2380,7 @@ retry:
                 if (bvec_iter.bi_size == bytes)
                         flags |= BCH_READ_LAST_FRAGMENT;
 
-                ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter->pos,
+                ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos,
                                          data_btree, k,
                                          offset_into_extent, failed, flags);
                 if (ret)
@@ -2315,19 +2392,21 @@ retry:
                 swap(bvec_iter.bi_size, bytes);
                 bio_advance_iter(&rbio->bio, &bvec_iter, bytes);
         }
-        bch2_trans_iter_put(&trans, iter);
+err:
+        bch2_trans_iter_exit(&trans, &iter);
 
         if (ret == -EINTR || ret == READ_RETRY || ret == READ_RETRY_AVOID)
                 goto retry;
 
+        bch2_trans_exit(&trans);
+        bch2_bkey_buf_exit(&sk, c);
+
         if (ret) {
-                bch_err_inum_ratelimited(c, inode,
+                bch_err_inum_ratelimited(c, inum.inum,
                                          "read error %i from btree lookup", ret);
                 rbio->bio.bi_status = BLK_STS_IOERR;
                 bch2_rbio_done(rbio);
         }
-
-        bch2_trans_exit(&trans);
-        bch2_bkey_buf_exit(&sk, c);
 }
 
 void bch2_fs_io_exit(struct bch_fs *c)
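
Taken together, the read side now resolves inum.subvol to a snapshot ID at the top of each transaction, positions its iterator with a snapshot-qualified SPOS() plus BTREE_ITER_FILTER_SNAPSHOTS, and saves rbio->subvol at submission so bch2_rbio_retry() can repeat the lookup. A hedged sketch of a top-level submission; the fields feeding subvol_inum are assumptions, not part of this patch:

        /* sketch: submitting a read for one file; field names are assumptions */
        subvol_inum inum = {
                .subvol = inode->ei_subvol,     /* hypothetical, e.g. from the VFS inode */
                .inum   = inode->v.i_ino,       /* hypothetical */
        };
        struct bch_io_failures failed = { .nr = 0 };

        __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, flags);
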