X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fio.c;h=58cc90b198f593e947f02b025f6b7e105f4d27c2;hb=37270fc79cbe4ab62001893eebd16b6fde4b621b;hp=c309080c9de0f530069652513fe40921f7b3b2f9;hpb=05408b6f8fea54bf53e68a4ef24291214970f6d0;p=bcachefs-tools-debian

diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index c309080..58cc90b 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -7,6 +7,7 @@
  */
 
 #include "bcachefs.h"
+#include "alloc_background.h"
 #include "alloc_foreground.h"
 #include "bkey_on_stack.h"
 #include "bset.h"
@@ -31,9 +32,17 @@
 
 #include <linux/blkdev.h>
 #include <linux/random.h>
+#include <linux/sched/mm.h>
 
 #include <trace/events/bcachefs.h>
 
+const char *bch2_blk_status_to_str(blk_status_t status)
+{
+	if (status == BLK_STS_REMOVED)
+		return "device removed";
+	return blk_status_to_str(status);
+}
+
 static bool bch2_target_congested(struct bch_fs *c, u16 target)
 {
 	const struct bch_devs_mask *devs;
@@ -46,7 +55,9 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target)
 		return false;
 
 	rcu_read_lock();
-	devs = bch2_target_to_mask(c, target);
+	devs = bch2_target_to_mask(c, target) ?:
+		&c->rw_devs[BCH_DATA_user];
+
 	for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) {
 		ca = rcu_dereference(c->devs[d]);
 		if (!ca)
@@ -463,7 +474,8 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
 
 		n->c			= c;
 		n->dev			= ptr->dev;
-		n->have_ioref		= bch2_dev_get_ioref(ca, WRITE);
+		n->have_ioref		= bch2_dev_get_ioref(ca,
+					type == BCH_DATA_btree ? READ : WRITE);
 		n->submit_time		= local_clock();
 		n->bio.bi_iter.bi_sector = ptr->offset;
 
@@ -493,8 +505,7 @@ static void bch2_write_done(struct closure *cl)
 	if (!op->error && (op->flags & BCH_WRITE_FLUSH))
 		op->error = bch2_journal_error(&c->journal);
 
-	if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
-		bch2_disk_reservation_put(c, &op->res);
+	bch2_disk_reservation_put(c, &op->res);
 	percpu_ref_put(&c->writes);
 	bch2_keylist_free(&op->insert_keys, op->inline_keys);
 
@@ -612,7 +623,8 @@ static void bch2_write_endio(struct bio *bio)
 	struct bch_fs *c		= wbio->c;
 	struct bch_dev *ca		= bch_dev_bkey_exists(c, wbio->dev);
 
-	if (bch2_dev_io_err_on(bio->bi_status, ca, "data write"))
+	if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s",
+			       bch2_blk_status_to_str(bio->bi_status)))
 		set_bit(wbio->dev, op->failed.d);
 
 	if (wbio->have_ioref) {
@@ -1054,7 +1066,10 @@ static void __bch2_write(struct closure *cl)
 	struct write_point *wp;
 	struct bio *bio;
 	bool skip_put = true;
+	unsigned nofs_flags;
 	int ret;
+
+	nofs_flags = memalloc_nofs_save();
 again:
 	memset(&op->failed, 0, sizeof(op->failed));
 
@@ -1080,6 +1095,11 @@ again:
 			goto err;
 		}
 
+		/*
+		 * The copygc thread is now global, which means it's no longer
+		 * freeing up space on specific disks, which means that
+		 * allocations for specific disks may hang arbitrarily long:
+		 */
 		wp = bch2_alloc_sectors_start(c,
 			op->target,
 			op->opts.erasure_code,
@@ -1089,7 +1109,8 @@ again:
 			op->nr_replicas_required,
 			op->alloc_reserve,
 			op->flags,
-			(op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl);
+			(op->flags & (BCH_WRITE_ALLOC_NOWAIT|
+				      BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : cl);
 		EBUG_ON(!wp);
 
 		if (unlikely(IS_ERR(wp))) {
@@ -1101,6 +1122,16 @@ again:
 			goto flush_io;
 		}
 
+		/*
+		 * It's possible for the allocator to fail, put us on the
+		 * freelist waitlist, and then succeed in one of various retry
+		 * paths: if that happens, we need to disable the skip_put
+		 * optimization because otherwise there won't necessarily be a
+		 * barrier before we free the bch_write_op:
+		 */
+		if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
+			skip_put = false;
+
 		bch2_open_bucket_get(c, wp, &op->open_buckets);
 		ret = bch2_write_extent(op, wp, &bio);
 		bch2_alloc_sectors_done(c, wp);
@@ -1130,19 +1161,21 @@ again:
 		key_to_write = (void *) (op->insert_keys.keys_p +
 					 key_to_write_offset);
 
-		bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER,
+		bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
 					  key_to_write);
 	} while (ret);
 
 	if (!skip_put)
 		continue_at(cl, bch2_write_index, index_update_wq(op));
+out:
+	memalloc_nofs_restore(nofs_flags);
 	return;
 err:
 	op->error = ret;
 	op->flags |= BCH_WRITE_DONE;
 
 	continue_at(cl, bch2_write_index, index_update_wq(op));
-	return;
+	goto out;
 flush_io:
 	/*
 	 * If the write can't all be submitted at once, we generally want to
@@ -1153,7 +1186,7 @@ flush_io:
 	 */
 	if (current->flags & PF_WQ_WORKER) {
 		continue_at(cl, bch2_write_index, index_update_wq(op));
-		return;
+		goto out;
 	}
 
 	closure_sync(cl);
@@ -1164,7 +1197,7 @@ flush_io:
 		if (op->error) {
 			op->flags |= BCH_WRITE_DONE;
 			continue_at_nobarrier(cl, bch2_write_done, NULL);
-			return;
+			goto out;
 		}
 	}
 
@@ -1281,8 +1314,7 @@ void bch2_write(struct closure *cl)
 	continue_at_nobarrier(cl, __bch2_write, NULL);
 	return;
 err:
-	if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
-		bch2_disk_reservation_put(c, &op->res);
+	bch2_disk_reservation_put(c, &op->res);
 
 	if (op->end_io) {
 		EBUG_ON(cl->parent);
@@ -1443,7 +1475,8 @@ static struct promote_op *__promote_alloc(struct bch_fs *c,
 			opts,
 			DATA_PROMOTE,
 			(struct data_opts) {
-				.target		= opts.promote_target
+				.target		= opts.promote_target,
+				.nr_replicas	= 1,
 			},
 			btree_id, k);
 	BUG_ON(ret);
@@ -1604,7 +1637,7 @@ retry:
 		goto out;
 	}
 
-	ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags);
+	ret = __bch2_read_extent(&trans, rbio, bvec_iter, k, 0, failed, flags);
 	if (ret == READ_RETRY)
 		goto retry;
 	if (ret)
@@ -1661,7 +1694,7 @@ retry:
 		bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9;
 		swap(bvec_iter.bi_size, bytes);
 
-		ret = __bch2_read_extent(c, rbio, bvec_iter, k,
+		ret = __bch2_read_extent(&trans, rbio, bvec_iter, k,
 				offset_into_extent, failed, flags);
 		switch (ret) {
 		case READ_RETRY:
@@ -1923,7 +1956,8 @@ static void bch2_read_endio(struct bio *bio)
 	if (!rbio->split)
 		rbio->bio.bi_end_io = rbio->end_io;
 
-	if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) {
+	if (bch2_dev_io_err_on(bio->bi_status, ca, "data read; %s",
+			       bch2_blk_status_to_str(bio->bi_status))) {
 		bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
 		return;
 	}
@@ -1988,11 +2022,12 @@ err:
 	return ret;
 }
 
-int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
+int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig,
 		       struct bvec_iter iter, struct bkey_s_c k,
 		       unsigned offset_into_extent,
 		       struct bch_io_failures *failed, unsigned flags)
 {
+	struct bch_fs *c = trans->c;
 	struct extent_ptr_decoded pick;
 	struct bch_read_bio *rbio = NULL;
 	struct bch_dev *ca;
@@ -2160,9 +2195,9 @@ get_bio:
 
 	bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
 
-	rcu_read_lock();
-	bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ);
-	rcu_read_unlock();
+	if (pick.ptr.cached)
+		bch2_bucket_io_time_reset(trans, pick.ptr.dev,
+			PTR_BUCKET_NR(ca, &pick.ptr), READ);
 
 	if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) {
 		bio_inc_remaining(&orig->bio);
@@ -2176,7 +2211,7 @@
 		goto out;
 	}
 
-	this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
+	this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
 		     bio_sectors(&rbio->bio));
 
 	bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
@@ -2304,7 +2339,7 @@ retry:
 		if (rbio->bio.bi_iter.bi_size == bytes)
 			flags |= BCH_READ_LAST_FRAGMENT;
 
-		bch2_read_extent(c, rbio, k, offset_into_extent, flags);
+		bch2_read_extent(&trans, rbio, k, offset_into_extent, flags);
 
 		if (flags & BCH_READ_LAST_FRAGMENT)
 			break;