*/
#include "bcachefs.h"
+#include "alloc_background.h"
#include "alloc_foreground.h"
#include "bkey_on_stack.h"
#include "bset.h"
#include <linux/blkdev.h>
#include <linux/random.h>
+#include <linux/sched/mm.h>
#include <trace/events/bcachefs.h>
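+/*
+ * blk_status_to_str() has no entry for BLK_STS_REMOVED, so translate it
+ * here before falling back to the generic strings:
+ */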
+const char *bch2_blk_status_to_str(blk_status_t status)
+{
+ if (status == BLK_STS_REMOVED)
+ return "device removed";
+ return blk_status_to_str(status);
+}
+
static bool bch2_target_congested(struct bch_fs *c, u16 target)
{
const struct bch_devs_mask *devs;
struct bch_dev *ca;
unsigned d;

if (!target)
	return false;

rcu_read_lock();
- devs = bch2_target_to_mask(c, target);
+ devs = bch2_target_to_mask(c, target) ?:
+ &c->rw_devs[BCH_DATA_user];
+
for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
ca = rcu_dereference(c->devs[d]);
if (!ca)
n->c = c;
n->dev = ptr->dev;
- n->have_ioref = bch2_dev_get_ioref(ca, WRITE);
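+ /*
+ * Btree node writes take a READ ioref: likely so btree nodes can
+ * still be flushed while a device is transitioning to read-only.
+ */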
+ n->have_ioref = bch2_dev_get_ioref(ca,
+ type == BCH_DATA_btree ? READ : WRITE);
n->submit_time = local_clock();
n->bio.bi_iter.bi_sector = ptr->offset;
if (!op->error && (op->flags & BCH_WRITE_FLUSH))
op->error = bch2_journal_error(&c->journal);
- if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
- bch2_disk_reservation_put(c, &op->res);
+ bch2_disk_reservation_put(c, &op->res);
percpu_ref_put(&c->writes);
bch2_keylist_free(&op->insert_keys, op->inline_keys);
struct bch_fs *c = wbio->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
- if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s",
+ bch2_blk_status_to_str(bio->bi_status)))
set_bit(wbio->dev, op->failed.d);
if (wbio->have_ioref) {
struct write_point *wp;
struct bio *bio;
bool skip_put = true;
+ unsigned nofs_flags;
int ret;
+
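+ /*
+ * Allocations in the write path mustn't recurse back into the
+ * filesystem via reclaim, so run the whole thing in an implicit
+ * GFP_NOFS section:
+ */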
+ nofs_flags = memalloc_nofs_save();
again:
memset(&op->failed, 0, sizeof(op->failed));
goto err;
}
+ /*
+ * The copygc thread is now global, which means it's no longer
+ * freeing up space on specific disks, which means that
+ * allocations for specific disks may hang arbitrarily long:
+ */
wp = bch2_alloc_sectors_start(c,
op->target,
op->opts.erasure_code,
op->nr_replicas_required,
op->alloc_reserve,
op->flags,
- (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
+ (op->flags & (BCH_WRITE_ALLOC_NOWAIT|
+ BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : cl);
EBUG_ON(!wp);
if (unlikely(IS_ERR(wp))) {
goto flush_io;
}
+ /*
+ * It's possible for the allocator to fail, put us on the
+ * freelist waitlist, and then succeed in one of various retry
+ * paths: if that happens, we need to disable the skip_put
+ * optimization because otherwise there won't necessarily be a
+ * barrier before we free the bch_write_op:
+ */
+ if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
+ skip_put = false;
+
bch2_open_bucket_get(c, wp, &op->open_buckets);
ret = bch2_write_extent(op, wp, &bio);
bch2_alloc_sectors_done(c, wp);
key_to_write = (void *) (op->insert_keys.keys_p +
key_to_write_offset);
- bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER,
+ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
key_to_write);
} while (ret);
if (!skip_put)
continue_at(cl, bch2_write_index, index_update_wq(op));
+out:
+ memalloc_nofs_restore(nofs_flags);
return;
err:
op->error = ret;
op->flags |= BCH_WRITE_DONE;
continue_at(cl, bch2_write_index, index_update_wq(op));
- return;
+ goto out;
flush_io:
/*
* If the write can't all be submitted at once, we generally want to
*/
if (current->flags & PF_WQ_WORKER) {
continue_at(cl, bch2_write_index, index_update_wq(op));
- return;
+ goto out;
}
closure_sync(cl);
if (op->error) {
op->flags |= BCH_WRITE_DONE;
continue_at_nobarrier(cl, bch2_write_done, NULL);
- return;
+ goto out;
}
}
continue_at_nobarrier(cl, __bch2_write, NULL);
return;
err:
- if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
- bch2_disk_reservation_put(c, &op->res);
+ bch2_disk_reservation_put(c, &op->res);
if (op->end_io) {
EBUG_ON(cl->parent);
opts,
DATA_PROMOTE,
(struct data_opts) {
- .target = opts.promote_target
+ .target = opts.promote_target,
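+ /* promotes write a cached copy; a single replica suffices: */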
+ .nr_replicas = 1,
},
btree_id, k);
BUG_ON(ret);
goto out;
}
- ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags);
+ ret = __bch2_read_extent(&trans, rbio, bvec_iter, k, 0, failed, flags);
if (ret == READ_RETRY)
goto retry;
if (ret)
bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
swap(bvec_iter.bi_size, bytes);
- ret = __bch2_read_extent(c, rbio, bvec_iter, k,
+ ret = __bch2_read_extent(&trans, rbio, bvec_iter, k,
offset_into_extent, failed, flags);
switch (ret) {
case READ_RETRY:
if (!rbio->split)
rbio->bio.bi_end_io = rbio->end_io;
- if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) {
+ if (bch2_dev_io_err_on(bio->bi_status, ca, "data read: %s",
+ bch2_blk_status_to_str(bio->bi_status))) {
bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
return;
}
return ret;
}
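+/*
+ * Takes a btree_trans now: the read path updates bucket IO times via
+ * bch2_bucket_io_time_reset(), which is a btree update and so needs a
+ * transaction:
+ */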
-int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
+int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
struct bvec_iter iter, struct bkey_s_c k,
unsigned offset_into_extent,
struct bch_io_failures *failed, unsigned flags)
{
+ struct bch_fs *c = trans->c;
struct extent_ptr_decoded pick;
struct bch_read_bio *rbio = NULL;
struct bch_dev *ca;
bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
- rcu_read_lock();
- bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
- rcu_read_unlock();
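+ /*
+ * Only cached data is subject to LRU-style invalidation by the
+ * allocator, so only cached reads need to bump the bucket's read
+ * time:
+ */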
+ if (pick.ptr.cached)
+ bch2_bucket_io_time_reset(trans, pick.ptr.dev,
+ PTR_BUCKET_NR(ca, &pick.ptr), READ);
if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
bio_inc_remaining(&orig->bio);
goto out;
}
- this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
bio_sectors(&rbio->bio));
bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
if (rbio->bio.bi_iter.bi_size == bytes)
flags |= BCH_READ_LAST_FRAGMENT;
- bch2_read_extent(c, rbio, k, offset_into_extent, flags);
+ bch2_read_extent(&trans, rbio, k, offset_into_extent, flags);
if (flags & BCH_READ_LAST_FRAGMENT)
break;