X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fio.c;h=58cc90b198f593e947f02b025f6b7e105f4d27c2;hb=37270fc79cbe4ab62001893eebd16b6fde4b621b;hp=c309080c9de0f530069652513fe40921f7b3b2f9;hpb=05408b6f8fea54bf53e68a4ef24291214970f6d0;p=bcachefs-tools-debian

diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index c309080..58cc90b 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -7,6 +7,7 @@
  */
 
 #include "bcachefs.h"
+#include "alloc_background.h"
 #include "alloc_foreground.h"
 #include "bkey_on_stack.h"
 #include "bset.h"
@@ -31,9 +32,17 @@
 
 #include <linux/blkdev.h>
 #include <linux/random.h>
+#include <linux/sched/mm.h>
 
 #include <trace/events/bcachefs.h>
 
+const char *bch2_blk_status_to_str(blk_status_t status)
+{
+	if (status == BLK_STS_REMOVED)
+		return "device removed";
+	return blk_status_to_str(status);
+}
+
 static bool bch2_target_congested(struct bch_fs *c, u16 target)
 {
 	const struct bch_devs_mask *devs;
@@ -46,7 +55,9 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target)
 		return false;
 
 	rcu_read_lock();
-	devs = bch2_target_to_mask(c, target);
+	devs = bch2_target_to_mask(c, target) ?:
+		&c->rw_devs[BCH_DATA_user];
+
 	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
 		ca = rcu_dereference(c->devs[d]);
 		if (!ca)
@@ -463,7 +474,8 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
 
 		n->c			= c;
 		n->dev			= ptr->dev;
-		n->have_ioref		= bch2_dev_get_ioref(ca, WRITE);
+		n->have_ioref		= bch2_dev_get_ioref(ca,
+					type == BCH_DATA_btree ? READ : WRITE);
 		n->submit_time		= local_clock();
 		n->bio.bi_iter.bi_sector = ptr->offset;
 
@@ -493,8 +505,7 @@ static void bch2_write_done(struct closure *cl)
 	if (!op->error && (op->flags & BCH_WRITE_FLUSH))
 		op->error = bch2_journal_error(&c->journal);
 
-	if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
-		bch2_disk_reservation_put(c, &op->res);
+	bch2_disk_reservation_put(c, &op->res);
 	percpu_ref_put(&c->writes);
 	bch2_keylist_free(&op->insert_keys, op->inline_keys);
 
@@ -612,7 +623,8 @@ static void bch2_write_endio(struct bio *bio)
 	struct bch_fs *c		= wbio->c;
 	struct bch_dev *ca		= bch_dev_bkey_exists(c, wbio->dev);
 
-	if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
+	if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s",
+			       bch2_blk_status_to_str(bio->bi_status)))
 		set_bit(wbio->dev, op->failed.d);
 
 	if (wbio->have_ioref) {
@@ -1054,7 +1066,10 @@ static void __bch2_write(struct closure *cl)
 	struct write_point *wp;
 	struct bio *bio;
 	bool skip_put = true;
+	unsigned nofs_flags;
 	int ret;
+
+	nofs_flags = memalloc_nofs_save();
 again:
 	memset(&op->failed, 0, sizeof(op->failed));
 
@@ -1080,6 +1095,11 @@ again:
 			goto err;
 		}
 
+		/*
+		 * The copygc thread is now global, which means it's no longer
+		 * freeing up space on specific disks, which means that
+		 * allocations for specific disks may hang arbitrarily long:
+		 */
 		wp = bch2_alloc_sectors_start(c,
 			op->target,
 			op->opts.erasure_code,
@@ -1089,7 +1109,8 @@ again:
 			op->nr_replicas_required,
 			op->alloc_reserve,
 			op->flags,
-			(op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
+			(op->flags & (BCH_WRITE_ALLOC_NOWAIT|
+				      BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : cl);
 		EBUG_ON(!wp);
 
 		if (unlikely(IS_ERR(wp))) {
@@ -1101,6 +1122,16 @@ again:
 			goto flush_io;
 		}
 
+		/*
+		 * It's possible for the allocator to fail, put us on the
+		 * freelist waitlist, and then succeed in one of various retry
+		 * paths: if that happens, we need to disable the skip_put
+		 * optimization because otherwise there won't necessarily be a
+		 * barrier before we free the bch_write_op:
+		 */
+		if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
+			skip_put = false;
+
 		bch2_open_bucket_get(c, wp, &op->open_buckets);
 		ret = bch2_write_extent(op, wp, &bio);
 		bch2_alloc_sectors_done(c, wp);
@@ -1130,19 +1161,21 @@ again:
 		key_to_write = (void *) (op->insert_keys.keys_p +
 					 key_to_write_offset);
 
-		bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER,
+		bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
 					  key_to_write);
 	} while (ret);
 
 	if (!skip_put)
 		continue_at(cl, bch2_write_index, index_update_wq(op));
+out:
+	memalloc_nofs_restore(nofs_flags);
 	return;
 err:
 	op->error = ret;
 	op->flags |= BCH_WRITE_DONE;
 
 	continue_at(cl, bch2_write_index, index_update_wq(op));
-	return;
+	goto out;
 flush_io:
 	/*
 	 * If the write can't all be submitted at once, we generally want to
@@ -1153,7 +1186,7 @@ flush_io:
 	 */
 	if (current->flags & PF_WQ_WORKER) {
 		continue_at(cl, bch2_write_index, index_update_wq(op));
-		return;
+		goto out;
 	}
 
 	closure_sync(cl);
@@ -1164,7 +1197,7 @@ flush_io:
 		if (op->error) {
 			op->flags |= BCH_WRITE_DONE;
 			continue_at_nobarrier(cl, bch2_write_done, NULL);
-			return;
+			goto out;
 		}
 	}
 
@@ -1281,8 +1314,7 @@ void bch2_write(struct closure *cl)
 	continue_at_nobarrier(cl, __bch2_write, NULL);
 	return;
 err:
-	if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
-		bch2_disk_reservation_put(c, &op->res);
+	bch2_disk_reservation_put(c, &op->res);
 
 	if (op->end_io) {
 		EBUG_ON(cl->parent);
@@ -1443,7 +1475,8 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
 			opts,
 			DATA_PROMOTE,
 			(struct data_opts) {
-				.target		= opts.promote_target
+				.target		= opts.promote_target,
+				.nr_replicas	= 1,
 			},
 			btree_id, k);
 	BUG_ON(ret);
@@ -1604,7 +1637,7 @@ retry:
 		goto out;
 	}
 
-	ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags);
+	ret = __bch2_read_extent(&trans, rbio, bvec_iter, k, 0, failed, flags);
 	if (ret == READ_RETRY)
 		goto retry;
 	if (ret)
@@ -1661,7 +1694,7 @@ retry:
 		bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
 		swap(bvec_iter.bi_size, bytes);
 
-		ret = __bch2_read_extent(c, rbio, bvec_iter, k,
+		ret = __bch2_read_extent(&trans, rbio, bvec_iter, k,
 				offset_into_extent, failed, flags);
 		switch (ret) {
 		case READ_RETRY:
@@ -1923,7 +1956,8 @@ static void bch2_read_endio(struct bio *bio)
 	if (!rbio->split)
 		rbio->bio.bi_end_io = rbio->end_io;
 
-	if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) {
+	if (bch2_dev_io_err_on(bio->bi_status, ca, "data read; %s",
+			       bch2_blk_status_to_str(bio->bi_status))) {
 		bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
 		return;
 	}
@@ -1988,11 +2022,12 @@ err:
 	return ret;
 }
 
-int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
+int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
 		       struct bvec_iter iter, struct bkey_s_c k,
 		       unsigned offset_into_extent,
 		       struct bch_io_failures *failed, unsigned flags)
 {
+	struct bch_fs *c = trans->c;
 	struct extent_ptr_decoded pick;
 	struct bch_read_bio *rbio = NULL;
 	struct bch_dev *ca;
@@ -2160,9 +2195,9 @@ get_bio:
 
 	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
 
-	rcu_read_lock();
-	bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
-	rcu_read_unlock();
+	if (pick.ptr.cached)
+		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
+			PTR_BUCKET_NR(ca, &pick.ptr), READ);
 
 	if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
 		bio_inc_remaining(&orig->bio);
@@ -2176,7 +2211,7 @@
 		goto out;
 	}
 
-	this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
+	this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
 		     bio_sectors(&rbio->bio));
 
 	bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
@@ -2304,7 +2339,7 @@ retry:
 		if (rbio->bio.bi_iter.bi_size == bytes)
 			flags |= BCH_READ_LAST_FRAGMENT;
 
-		bch2_read_extent(c, rbio, k, offset_into_extent, flags);
+		bch2_read_extent(&trans, rbio, k, offset_into_extent, flags);
 
 		if (flags & BCH_READ_LAST_FRAGMENT)
 			break;