From ea83a3985d28372d56ec7cea6e73907551869f63 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 13 Dec 2017 16:01:18 -0500 Subject: [PATCH] Update bcachefs sources to e57b5958cf bcachefs: fix for building in userspace --- .bcachefs_revision | 2 +- cmd_migrate.c | 2 +- include/trace/events/bcachefs.h | 226 +---- libbcachefs/alloc.c | 921 ++++++++------------ libbcachefs/alloc.h | 87 +- libbcachefs/alloc_types.h | 25 +- libbcachefs/bcachefs.h | 85 +- libbcachefs/bcachefs_format.h | 14 +- libbcachefs/bkey.c | 8 + libbcachefs/bset.h | 24 +- libbcachefs/btree_gc.c | 193 +++-- libbcachefs/btree_gc.h | 10 +- libbcachefs/btree_io.c | 8 +- libbcachefs/btree_io.h | 1 + libbcachefs/btree_locking.h | 2 +- libbcachefs/btree_types.h | 30 +- libbcachefs/btree_update_interior.c | 96 ++- libbcachefs/btree_update_leaf.c | 5 + libbcachefs/buckets.c | 350 ++++---- libbcachefs/buckets.h | 76 +- libbcachefs/buckets_types.h | 15 +- libbcachefs/checksum.c | 168 +++- libbcachefs/checksum.h | 36 +- libbcachefs/compress.c | 135 ++- libbcachefs/compress.h | 10 +- libbcachefs/extents.c | 539 ++++++------ libbcachefs/extents.h | 339 +++----- libbcachefs/extents_types.h | 27 + libbcachefs/eytzinger.h | 86 +- libbcachefs/fs-io.c | 193 ++++- libbcachefs/fs-io.h | 65 +- libbcachefs/fs.c | 26 +- libbcachefs/io.c | 1209 ++++++++++++++++----------- libbcachefs/io.h | 93 ++- libbcachefs/io_types.h | 53 +- libbcachefs/journal.c | 50 +- libbcachefs/keylist.h | 5 +- libbcachefs/migrate.c | 125 +-- libbcachefs/move.c | 466 ++++++----- libbcachefs/move.h | 80 +- libbcachefs/movinggc.c | 303 +++---- libbcachefs/movinggc.h | 28 +- libbcachefs/super-io.c | 5 + libbcachefs/super.c | 33 +- libbcachefs/super.h | 24 + libbcachefs/super_types.h | 29 + libbcachefs/sysfs.c | 80 +- libbcachefs/tier.c | 126 +-- libbcachefs/util.c | 135 ++- libbcachefs/util.h | 24 + 50 files changed, 3405 insertions(+), 3267 deletions(-) create mode 100644 libbcachefs/extents_types.h diff --git a/.bcachefs_revision b/.bcachefs_revision index 04ebc30..7724716 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -192d759a491f50d92c89c2e842639d2307c815a5 +e57b5958cf4e8530d26f7c36a6e1427fb284cc70 diff --git a/cmd_migrate.c b/cmd_migrate.c index d683a5f..58c0bb9 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -265,7 +265,7 @@ static void write_data(struct bch_fs *c, if (ret) die("error reserving space in new filesystem: %s", strerror(-ret)); - bch2_write_op_init(&op, c, res, NULL, 0, + bch2_write_op_init(&op, c, res, NULL, writepoint_hashed(0), POS(dst_inode->bi_inum, dst_offset >> 9), NULL, 0); closure_call(&op.cl, bch2_write, NULL, &cl); closure_sync(&cl); diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index 0c9f3de..bf187f5 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -98,23 +98,6 @@ DECLARE_EVENT_CLASS(bio, (unsigned long long)__entry->sector, __entry->nr_sector) ); -DECLARE_EVENT_CLASS(page_alloc_fail, - TP_PROTO(struct bch_fs *c, u64 size), - TP_ARGS(c, size), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(u64, size ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->size = size; - ), - - TP_printk("%pU size %llu", __entry->uuid, __entry->size) -); - /* io.c: */ DEFINE_EVENT(bio, read_split, @@ -137,34 +120,6 @@ DEFINE_EVENT(bio, promote, TP_ARGS(bio) ); -TRACE_EVENT(write_throttle, - TP_PROTO(struct bch_fs *c, u64 inode, struct bio *bio, u64 delay), - TP_ARGS(c, inode, bio, delay), - - TP_STRUCT__entry( - 
__array(char, uuid, 16 ) - __field(u64, inode ) - __field(sector_t, sector ) - __field(unsigned int, nr_sector ) - __array(char, rwbs, 6 ) - __field(u64, delay ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->inode = inode; - __entry->sector = bio->bi_iter.bi_sector; - __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); - __entry->delay = delay; - ), - - TP_printk("%pU inode %llu %s %llu + %u delay %llu", - __entry->uuid, __entry->inode, - __entry->rwbs, (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->delay) -); - /* Journal */ DEFINE_EVENT(bch_fs, journal_full, @@ -439,16 +394,6 @@ TRACE_EVENT(alloc_batch, __entry->uuid, __entry->free, __entry->total) ); -DEFINE_EVENT(bch_dev, prio_write_start, - TP_PROTO(struct bch_dev *ca), - TP_ARGS(ca) -); - -DEFINE_EVENT(bch_dev, prio_write_end, - TP_PROTO(struct bch_dev *ca), - TP_ARGS(ca) -); - TRACE_EVENT(invalidate, TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors), TP_ARGS(ca, offset, sectors), @@ -502,174 +447,77 @@ DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, TP_ARGS(ca, reserve) ); -TRACE_EVENT(freelist_empty_fail, - TP_PROTO(struct bch_fs *c, enum alloc_reserve reserve, - struct closure *cl), - TP_ARGS(c, reserve, cl), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(enum alloc_reserve, reserve ) - __field(struct closure *, cl ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->reserve = reserve; - __entry->cl = cl; - ), - - TP_printk("%pU reserve %d cl %p", __entry->uuid, __entry->reserve, - __entry->cl) -); - -DECLARE_EVENT_CLASS(open_bucket_alloc, - TP_PROTO(struct bch_fs *c, struct closure *cl), - TP_ARGS(c, cl), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(struct closure *, cl ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->cl = cl; - ), - - TP_printk("%pU cl %p", - __entry->uuid, __entry->cl) -); - -DEFINE_EVENT(open_bucket_alloc, open_bucket_alloc, - TP_PROTO(struct bch_fs *c, struct closure *cl), - TP_ARGS(c, cl) -); - -DEFINE_EVENT(open_bucket_alloc, open_bucket_alloc_fail, - TP_PROTO(struct bch_fs *c, struct closure *cl), - TP_ARGS(c, cl) +DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail, + TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), + TP_ARGS(ca, reserve) ); /* Moving IO */ -DECLARE_EVENT_CLASS(moving_io, - TP_PROTO(struct bkey *k), - TP_ARGS(k), - - TP_STRUCT__entry( - __field(__u32, inode ) - __field(__u64, offset ) - __field(__u32, sectors ) - ), - - TP_fast_assign( - __entry->inode = k->p.inode; - __entry->offset = k->p.offset; - __entry->sectors = k->size; - ), - - TP_printk("%u:%llu sectors %u", - __entry->inode, __entry->offset, __entry->sectors) -); - -DEFINE_EVENT(moving_io, move_read, - TP_PROTO(struct bkey *k), - TP_ARGS(k) -); - -DEFINE_EVENT(moving_io, move_read_done, - TP_PROTO(struct bkey *k), +DEFINE_EVENT(bkey, move_extent, + TP_PROTO(const struct bkey *k), TP_ARGS(k) ); -DEFINE_EVENT(moving_io, move_write, - TP_PROTO(struct bkey *k), +DEFINE_EVENT(bkey, move_alloc_fail, + TP_PROTO(const struct bkey *k), TP_ARGS(k) ); -DEFINE_EVENT(moving_io, copy_collision, - TP_PROTO(struct bkey *k), +DEFINE_EVENT(bkey, move_race, + TP_PROTO(const struct bkey *k), TP_ARGS(k) ); -/* Copy GC */ - -DEFINE_EVENT(page_alloc_fail, moving_gc_alloc_fail, - TP_PROTO(struct bch_fs *c, u64 size), - TP_ARGS(c, size) -); - -DEFINE_EVENT(bch_dev, moving_gc_start, - TP_PROTO(struct bch_dev *ca), - 
TP_ARGS(ca) -); - -TRACE_EVENT(moving_gc_end, - TP_PROTO(struct bch_dev *ca, u64 sectors_moved, u64 keys_moved, - u64 buckets_moved), - TP_ARGS(ca, sectors_moved, keys_moved, buckets_moved), +TRACE_EVENT(move_data, + TP_PROTO(struct bch_fs *c, u64 sectors_moved, + u64 keys_moved), + TP_ARGS(c, sectors_moved, keys_moved), TP_STRUCT__entry( __array(char, uuid, 16 ) __field(u64, sectors_moved ) __field(u64, keys_moved ) - __field(u64, buckets_moved ) ), TP_fast_assign( - memcpy(__entry->uuid, ca->uuid.b, 16); + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); __entry->sectors_moved = sectors_moved; __entry->keys_moved = keys_moved; - __entry->buckets_moved = buckets_moved; ), - TP_printk("%pU sectors_moved %llu keys_moved %llu buckets_moved %llu", - __entry->uuid, __entry->sectors_moved, __entry->keys_moved, - __entry->buckets_moved) -); - -DEFINE_EVENT(bkey, gc_copy, - TP_PROTO(const struct bkey *k), - TP_ARGS(k) -); - -/* Tiering */ - -DEFINE_EVENT(page_alloc_fail, tiering_alloc_fail, - TP_PROTO(struct bch_fs *c, u64 size), - TP_ARGS(c, size) + TP_printk("%pU sectors_moved %llu keys_moved %llu", + __entry->uuid, __entry->sectors_moved, __entry->keys_moved) ); -DEFINE_EVENT(bch_fs, tiering_start, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -TRACE_EVENT(tiering_end, - TP_PROTO(struct bch_fs *c, u64 sectors_moved, - u64 keys_moved), - TP_ARGS(c, sectors_moved, keys_moved), +TRACE_EVENT(copygc, + TP_PROTO(struct bch_dev *ca, + u64 sectors_moved, u64 sectors_not_moved, + u64 buckets_moved, u64 buckets_not_moved), + TP_ARGS(ca, + sectors_moved, sectors_not_moved, + buckets_moved, buckets_not_moved), TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(u64, sectors_moved ) - __field(u64, keys_moved ) + __array(char, uuid, 16 ) + __field(u64, sectors_moved ) + __field(u64, sectors_not_moved ) + __field(u64, buckets_moved ) + __field(u64, buckets_not_moved ) ), TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->sectors_moved = sectors_moved; - __entry->keys_moved = keys_moved; + memcpy(__entry->uuid, ca->uuid.b, 16); + __entry->sectors_moved = sectors_moved; + __entry->sectors_not_moved = sectors_not_moved; + __entry->buckets_moved = buckets_moved; + __entry->buckets_not_moved = buckets_moved; ), - TP_printk("%pU sectors_moved %llu keys_moved %llu", - __entry->uuid, __entry->sectors_moved, __entry->keys_moved) -); - -DEFINE_EVENT(bkey, tiering_copy, - TP_PROTO(const struct bkey *k), - TP_ARGS(k) + TP_printk("%pU sectors moved %llu remain %llu buckets moved %llu remain %llu", + __entry->uuid, + __entry->sectors_moved, __entry->sectors_not_moved, + __entry->buckets_moved, __entry->buckets_not_moved) ); #endif /* _TRACE_BCACHE_H */ diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c index dc7348f..d29d871 100644 --- a/libbcachefs/alloc.c +++ b/libbcachefs/alloc.c @@ -56,6 +56,7 @@ #include "bcachefs.h" #include "alloc.h" #include "btree_update.h" +#include "btree_gc.h" #include "buckets.h" #include "checksum.h" #include "clock.h" @@ -76,7 +77,7 @@ #include #include -static void bch2_recalc_min_prio(struct bch_dev *, int); +static void bch2_recalc_min_prio(struct bch_fs *, struct bch_dev *, int); /* Ratelimiting/PD controllers */ @@ -92,8 +93,6 @@ static void pd_controllers_update(struct work_struct *work) u64 faster_tiers_size = 0; u64 faster_tiers_dirty = 0; - u64 fastest_tier_size = 0; - u64 fastest_tier_free = 0; u64 copygc_can_free = 0; rcu_read_lock(); @@ -105,7 +104,7 @@ static void pd_controllers_update(struct work_struct *work) -1); for_each_member_device_rcu(ca, 
c, iter, &c->tiers[i].devs) { - struct bch_dev_usage stats = bch2_dev_usage_read(ca); + struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); u64 size = bucket_to_sector(ca, ca->mi.nbuckets - ca->mi.first_bucket) << 9; @@ -125,18 +124,12 @@ static void pd_controllers_update(struct work_struct *work) fragmented = max(0LL, fragmented); - bch2_pd_controller_update(&ca->moving_gc_pd, + bch2_pd_controller_update(&ca->copygc_pd, free, fragmented, -1); faster_tiers_size += size; faster_tiers_dirty += dirty; - if (!c->fastest_tier || - c->fastest_tier == &c->tiers[i]) { - fastest_tier_size += size; - fastest_tier_free += free; - } - copygc_can_free += fragmented; } } @@ -157,14 +150,6 @@ static void pd_controllers_update(struct work_struct *work) if (c->fastest_tier) copygc_can_free = U64_MAX; - bch2_pd_controller_update(&c->foreground_write_pd, - min(copygc_can_free, - div_u64(fastest_tier_size * - c->foreground_target_percent, - 100)), - fastest_tier_free, - -1); - schedule_delayed_work(&c->pd_controllers_update, c->pd_controllers_update_seconds * HZ); } @@ -295,6 +280,8 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) struct journal_replay *r; struct btree_iter iter; struct bkey_s_c k; + struct bch_dev *ca; + unsigned i; int ret; if (!c->btree_roots[BTREE_ID_ALLOC].b) @@ -318,6 +305,11 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) bch2_alloc_read_key(c, bkey_i_to_s_c(k)); } + for_each_member_device(ca, c, i) { + bch2_recalc_min_prio(c, ca, READ); + bch2_recalc_min_prio(c, ca, WRITE); + } + return 0; } @@ -436,7 +428,7 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) if (gc_count != c->gc_count) ca->inc_gen_really_needs_gc = 0; - if ((ssize_t) (dev_buckets_available(ca) - + if ((ssize_t) (dev_buckets_available(c, ca) - ca->inc_gen_really_needs_gc) >= (ssize_t) fifo_free(&ca->free_inc)) break; @@ -451,9 +443,10 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) return ret; } -static void verify_not_on_freelist(struct bch_dev *ca, size_t bucket) +static void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, + size_t bucket) { - if (expensive_debug_checks(ca->fs)) { + if (expensive_debug_checks(c)) { size_t iter; long i; unsigned j; @@ -468,9 +461,8 @@ static void verify_not_on_freelist(struct bch_dev *ca, size_t bucket) /* Bucket heap / gen */ -void bch2_recalc_min_prio(struct bch_dev *ca, int rw) +void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw) { - struct bch_fs *c = ca->fs; struct prio_clock *clock = &c->prio_clock[rw]; struct bucket *g; u16 max_delta = 1; @@ -478,14 +470,14 @@ void bch2_recalc_min_prio(struct bch_dev *ca, int rw) lockdep_assert_held(&c->bucket_lock); - /* Determine min prio for this particular cache */ + /* Determine min prio for this particular device */ for_each_bucket(g, ca) max_delta = max(max_delta, (u16) (clock->hand - g->prio[rw])); ca->min_prio[rw] = clock->hand - max_delta; /* - * This may possibly increase the min prio for the whole cache, check + * This may possibly increase the min prio for the whole device, check * that as well. 
*/ max_delta = 1; @@ -511,7 +503,7 @@ static void bch2_rescale_prios(struct bch_fs *c, int rw) g->prio[rw] = clock->hand - (clock->hand - g->prio[rw]) / 2; - bch2_recalc_min_prio(ca, rw); + bch2_recalc_min_prio(c, ca, rw); } } @@ -588,20 +580,20 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, struct bucket *g, return can_inc_bucket_gen(ca, g); } -static void bch2_invalidate_one_bucket(struct bch_dev *ca, struct bucket *g) +static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, + struct bucket *g) { - struct bch_fs *c = ca->fs; struct bucket_mark m; - spin_lock(&ca->freelist_lock); - if (!bch2_invalidate_bucket(ca, g, &m)) { - spin_unlock(&ca->freelist_lock); + spin_lock(&c->freelist_lock); + if (!bch2_invalidate_bucket(c, ca, g, &m)) { + spin_unlock(&c->freelist_lock); return; } - verify_not_on_freelist(ca, g - ca->buckets); + verify_not_on_freelist(c, ca, g - ca->buckets); BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets)); - spin_unlock(&ca->freelist_lock); + spin_unlock(&c->freelist_lock); g->prio[READ] = c->prio_clock[READ].hand; g->prio[WRITE] = c->prio_clock[WRITE].hand; @@ -641,9 +633,8 @@ static void bch2_invalidate_one_bucket(struct bch_dev *ca, struct bucket *g) * number wraparound. */ -static unsigned long bucket_sort_key(struct bch_dev *ca, - struct bucket *g, - struct bucket_mark m) +static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, + struct bucket *g, struct bucket_mark m) { /* * Time since last read, scaled to [0, 8) where larger value indicates @@ -651,14 +642,14 @@ static unsigned long bucket_sort_key(struct bch_dev *ca, */ unsigned long hotness = (g->prio[READ] - ca->min_prio[READ]) * 7 / - (ca->fs->prio_clock[READ].hand - ca->min_prio[READ]); + (c->prio_clock[READ].hand - ca->min_prio[READ]); /* How much we want to keep the data in this bucket: */ unsigned long data_wantness = (hotness + 1) * bucket_sectors_used(m); unsigned long needs_journal_commit = - bucket_needs_journal_commit(m, ca->fs->journal.last_seq_ondisk); + bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); return (data_wantness << 9) | (needs_journal_commit << 8) | @@ -672,16 +663,16 @@ static inline int bucket_alloc_cmp(alloc_heap *h, return (l.key > r.key) - (l.key < r.key); } -static void invalidate_buckets_lru(struct bch_dev *ca) +static void invalidate_buckets_lru(struct bch_fs *c, struct bch_dev *ca) { struct alloc_heap_entry e; struct bucket *g; ca->alloc_heap.used = 0; - mutex_lock(&ca->fs->bucket_lock); - bch2_recalc_min_prio(ca, READ); - bch2_recalc_min_prio(ca, WRITE); + mutex_lock(&c->bucket_lock); + bch2_recalc_min_prio(c, ca, READ); + bch2_recalc_min_prio(c, ca, WRITE); /* * Find buckets with lowest read priority, by building a maxheap sorted @@ -696,7 +687,7 @@ static void invalidate_buckets_lru(struct bch_dev *ca) e = (struct alloc_heap_entry) { .bucket = g - ca->buckets, - .key = bucket_sort_key(ca, g, m) + .key = bucket_sort_key(c, ca, g, m) }; heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp); @@ -710,12 +701,12 @@ static void invalidate_buckets_lru(struct bch_dev *ca) */ while (!fifo_full(&ca->free_inc) && heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp)) - bch2_invalidate_one_bucket(ca, &ca->buckets[e.bucket]); + bch2_invalidate_one_bucket(c, ca, &ca->buckets[e.bucket]); - mutex_unlock(&ca->fs->bucket_lock); + mutex_unlock(&c->bucket_lock); } -static void invalidate_buckets_fifo(struct bch_dev *ca) +static void invalidate_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) { struct bucket_mark m; struct bucket 
*g; @@ -730,14 +721,14 @@ static void invalidate_buckets_fifo(struct bch_dev *ca) m = READ_ONCE(g->mark); if (bch2_can_invalidate_bucket(ca, g, m)) - bch2_invalidate_one_bucket(ca, g); + bch2_invalidate_one_bucket(c, ca, g); if (++checked >= ca->mi.nbuckets) return; } } -static void invalidate_buckets_random(struct bch_dev *ca) +static void invalidate_buckets_random(struct bch_fs *c, struct bch_dev *ca) { struct bucket_mark m; struct bucket *g; @@ -752,27 +743,27 @@ static void invalidate_buckets_random(struct bch_dev *ca) m = READ_ONCE(g->mark); if (bch2_can_invalidate_bucket(ca, g, m)) - bch2_invalidate_one_bucket(ca, g); + bch2_invalidate_one_bucket(c, ca, g); if (++checked >= ca->mi.nbuckets / 2) return; } } -static void invalidate_buckets(struct bch_dev *ca) +static void invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) { ca->inc_gen_needs_gc = 0; ca->inc_gen_really_needs_gc = 0; switch (ca->mi.replacement) { case CACHE_REPLACEMENT_LRU: - invalidate_buckets_lru(ca); + invalidate_buckets_lru(c, ca); break; case CACHE_REPLACEMENT_FIFO: - invalidate_buckets_fifo(ca); + invalidate_buckets_fifo(c, ca); break; case CACHE_REPLACEMENT_RANDOM: - invalidate_buckets_random(ca); + invalidate_buckets_random(c, ca); break; } } @@ -812,7 +803,8 @@ static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca, * Given an invalidated, ready to use bucket: issue a discard to it if enabled, * then add it to the freelist, waiting until there's room if necessary: */ -static void discard_invalidated_bucket(struct bch_dev *ca, long bucket) +static void discard_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, + long bucket) { if (ca->mi.discard && blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) @@ -830,15 +822,15 @@ static void discard_invalidated_bucket(struct bch_dev *ca, long bucket) * Don't remove from free_inc until after it's added to * freelist, so gc can find it: */ - spin_lock(&ca->freelist_lock); + spin_lock(&c->freelist_lock); for (i = 0; i < RESERVE_NR; i++) if (fifo_push(&ca->free[i], bucket)) { fifo_pop(&ca->free_inc, bucket); - closure_wake_up(&ca->fs->freelist_wait); + closure_wake_up(&c->freelist_wait); pushed = true; break; } - spin_unlock(&ca->freelist_lock); + spin_unlock(&c->freelist_lock); if (pushed) break; @@ -877,7 +869,7 @@ static int bch2_allocator_thread(void *arg) BUG_ON(fifo_empty(&ca->free_inc)); bucket = fifo_peek(&ca->free_inc); - discard_invalidated_bucket(ca, bucket); + discard_invalidated_bucket(c, ca, bucket); if (kthread_should_stop()) return 0; --ca->nr_invalidated; @@ -924,7 +916,7 @@ static int bch2_allocator_thread(void *arg) * another cache tier */ - invalidate_buckets(ca); + invalidate_buckets(c, ca); trace_alloc_batch(ca, fifo_used(&ca->free_inc), ca->free_inc.size); @@ -949,12 +941,12 @@ static int bch2_allocator_thread(void *arg) BUG_ON(ca->free_inc.front); - spin_lock(&ca->freelist_lock); + spin_lock(&c->freelist_lock); sort(ca->free_inc.data, ca->free_inc.back, sizeof(ca->free_inc.data[0]), size_t_cmp, NULL); - spin_unlock(&ca->freelist_lock); + spin_unlock(&c->freelist_lock); /* * free_inc is now full of newly-invalidated buckets: next, @@ -965,6 +957,55 @@ static int bch2_allocator_thread(void *arg) /* Allocation */ +/* + * Open buckets represent a bucket that's currently being allocated from. 
They + * serve two purposes: + * + * - They track buckets that have been partially allocated, allowing for + * sub-bucket sized allocations - they're used by the sector allocator below + * + * - They provide a reference to the buckets they own that mark and sweep GC + * can find, until the new allocation has a pointer to it inserted into the + * btree + * + * When allocating some space with the sector allocator, the allocation comes + * with a reference to an open bucket - the caller is required to put that + * reference _after_ doing the index update that makes its allocation reachable. + */ + +void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) +{ + struct bch_dev *ca = c->devs[ob->ptr.dev]; + + spin_lock(&ob->lock); + bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), false, + gc_pos_alloc(c, ob), 0); + ob->valid = false; + spin_unlock(&ob->lock); + + spin_lock(&c->freelist_lock); + ob->freelist = c->open_buckets_freelist; + c->open_buckets_freelist = ob - c->open_buckets; + c->open_buckets_nr_free++; + spin_unlock(&c->freelist_lock); + + closure_wake_up(&c->open_buckets_wait); +} + +static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) +{ + struct open_bucket *ob; + + BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free); + + ob = c->open_buckets + c->open_buckets_freelist; + c->open_buckets_freelist = ob->freelist; + atomic_set(&ob->pin, 1); + + c->open_buckets_nr_free--; + return ob; +} + /* * XXX: allocation on startup is still sketchy. There is insufficient * synchronization for bch2_bucket_alloc_startup() to work correctly after @@ -994,7 +1035,7 @@ static long bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca) for_each_bucket(g, ca) if (!g->mark.touched_this_mount && is_available_bucket(g->mark) && - bch2_mark_alloc_bucket_startup(ca, g)) { + bch2_mark_alloc_bucket_startup(c, ca, g)) { r = g - ca->buckets; set_bit(r, ca->bucket_dirty); break; @@ -1004,69 +1045,105 @@ out: return r; } +static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) +{ + switch (reserve) { + case RESERVE_ALLOC: + return 0; + case RESERVE_BTREE: + return BTREE_NODE_RESERVE / 2; + default: + return BTREE_NODE_RESERVE; + } +} + /** * bch_bucket_alloc - allocate a single bucket from a specific device * * Returns index of bucket on success, 0 on failure * */ -long bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, - enum alloc_reserve reserve) +int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, + enum alloc_reserve reserve, + bool may_alloc_partial, + struct closure *cl) { - size_t r; + struct open_bucket *ob; + long bucket; + + spin_lock(&c->freelist_lock); + if (may_alloc_partial && + ca->open_buckets_partial_nr) { + int ret = ca->open_buckets_partial[--ca->open_buckets_partial_nr]; + c->open_buckets[ret].on_partial_list = false; + spin_unlock(&c->freelist_lock); + return ret; + } + + if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { + if (cl) + closure_wait(&c->open_buckets_wait, cl); + spin_unlock(&c->freelist_lock); + trace_open_bucket_alloc_fail(ca, reserve); + return OPEN_BUCKETS_EMPTY; + } - spin_lock(&ca->freelist_lock); - if (likely(fifo_pop(&ca->free[RESERVE_NONE], r))) + if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket))) goto out; switch (reserve) { case RESERVE_ALLOC: - if (fifo_pop(&ca->free[RESERVE_BTREE], r)) + if (fifo_pop(&ca->free[RESERVE_BTREE], bucket)) goto out; break; case RESERVE_BTREE: if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >= ca->free[RESERVE_BTREE].size && - 
fifo_pop(&ca->free[RESERVE_BTREE], r)) + fifo_pop(&ca->free[RESERVE_BTREE], bucket)) goto out; break; case RESERVE_MOVINGGC: - if (fifo_pop(&ca->free[RESERVE_MOVINGGC], r)) + if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket)) goto out; break; default: break; } - spin_unlock(&ca->freelist_lock); - if (unlikely(!ca->alloc_thread_started) && (reserve == RESERVE_ALLOC) && - (r = bch2_bucket_alloc_startup(c, ca)) >= 0) { - verify_not_on_freelist(ca, r); - goto out2; - } + (bucket = bch2_bucket_alloc_startup(c, ca)) >= 0) + goto out; + + spin_unlock(&c->freelist_lock); trace_bucket_alloc_fail(ca, reserve); - return -1; + return FREELIST_EMPTY; out: - verify_not_on_freelist(ca, r); - spin_unlock(&ca->freelist_lock); + verify_not_on_freelist(c, ca, bucket); + + ob = bch2_open_bucket_alloc(c); + + spin_lock(&ob->lock); + ob->valid = true; + ob->sectors_free = ca->mi.bucket_size; + ob->ptr = (struct bch_extent_ptr) { + .gen = ca->buckets[bucket].mark.gen, + .offset = bucket_to_sector(ca, bucket), + .dev = ca->dev_idx, + }; + spin_unlock(&ob->lock); + + spin_unlock(&c->freelist_lock); bch2_wake_allocator(ca); -out2: - ca->buckets[r].prio[READ] = c->prio_clock[READ].hand; - ca->buckets[r].prio[WRITE] = c->prio_clock[WRITE].hand; + + ca->buckets[bucket].prio[READ] = c->prio_clock[READ].hand; + ca->buckets[bucket].prio[WRITE] = c->prio_clock[WRITE].hand; trace_bucket_alloc(ca, reserve); - return r; + return ob - c->open_buckets; } -enum bucket_alloc_ret { - ALLOC_SUCCESS, - NO_DEVICES, /* -EROFS */ - FREELIST_EMPTY, /* Allocator thread not keeping up */ -}; - struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c, struct write_point *wp, struct bch_devs_mask *devs) @@ -1091,11 +1168,7 @@ struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c, break; } - memmove(&ret.devs[j + 1], - &ret.devs[j], - sizeof(ret.devs[0]) * (ret.nr - j)); - ret.nr++; - ret.devs[j] = i; + array_insert_item(ret.devs, ret.nr, j, i); } return ret; @@ -1112,63 +1185,46 @@ void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca, static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp, - struct open_bucket *ob, unsigned nr_replicas, enum alloc_reserve reserve, - struct bch_devs_mask *devs) + struct bch_devs_mask *devs, + struct closure *cl) { enum bucket_alloc_ret ret = NO_DEVICES; struct dev_alloc_list devs_sorted; u64 buckets_free; unsigned i; - BUG_ON(nr_replicas > ARRAY_SIZE(ob->ptrs)); + BUG_ON(nr_replicas > ARRAY_SIZE(wp->ptrs)); - if (ob->nr_ptrs >= nr_replicas) + if (wp->nr_ptrs >= nr_replicas) return ALLOC_SUCCESS; rcu_read_lock(); devs_sorted = bch2_wp_alloc_list(c, wp, devs); - spin_lock(&ob->lock); for (i = 0; i < devs_sorted.nr; i++) { struct bch_dev *ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); - struct open_bucket_ptr ptr; + int ob; if (!ca) continue; - if (wp->type == BCH_DATA_USER && - ca->open_buckets_partial_nr) { - ptr = ca->open_buckets_partial[--ca->open_buckets_partial_nr]; - } else { - long bucket = bch2_bucket_alloc(c, ca, reserve); - if (bucket < 0) { - ret = FREELIST_EMPTY; - continue; - } - - ptr = (struct open_bucket_ptr) { - .ptr.gen = ca->buckets[bucket].mark.gen, - .ptr.offset = bucket_to_sector(ca, bucket), - .ptr.dev = ca->dev_idx, - .sectors_free = ca->mi.bucket_size, - }; + ob = bch2_bucket_alloc(c, ca, reserve, + wp->type == BCH_DATA_USER, cl); + if (ob < 0) { + ret = ob; + if (ret == OPEN_BUCKETS_EMPTY) + break; + continue; } - /* - * open_bucket_add_buckets expects new pointers at the head of - * the list: - */ - BUG_ON(ob->nr_ptrs >= 
ARRAY_SIZE(ob->ptrs)); - memmove(&ob->ptrs[1], - &ob->ptrs[0], - ob->nr_ptrs * sizeof(ob->ptrs[0])); - ob->nr_ptrs++; - ob->ptrs[0] = ptr; - - buckets_free = U64_MAX, dev_buckets_free(ca); + BUG_ON(ob <= 0 || ob > U8_MAX); + BUG_ON(wp->nr_ptrs >= ARRAY_SIZE(wp->ptrs)); + wp->ptrs[wp->nr_ptrs++] = c->open_buckets + ob; + + buckets_free = U64_MAX, dev_buckets_free(c, ca); if (buckets_free) wp->next_alloc[ca->dev_idx] += div64_u64(U64_MAX, buckets_free * @@ -1179,20 +1235,21 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c, __clear_bit(ca->dev_idx, devs->d); - if (ob->nr_ptrs == nr_replicas) { + if (wp->nr_ptrs == nr_replicas) { ret = ALLOC_SUCCESS; break; } } - EBUG_ON(ret != ALLOC_SUCCESS && reserve == RESERVE_MOVINGGC); - spin_unlock(&ob->lock); + EBUG_ON(reserve == RESERVE_MOVINGGC && + ret != ALLOC_SUCCESS && + ret != OPEN_BUCKETS_EMPTY); rcu_read_unlock(); return ret; } static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp, - struct open_bucket *ob, unsigned nr_replicas, + unsigned nr_replicas, enum alloc_reserve reserve, struct bch_devs_mask *devs, struct closure *cl) @@ -1200,8 +1257,8 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp, bool waiting = false; while (1) { - switch (__bch2_bucket_alloc_set(c, wp, ob, nr_replicas, - reserve, devs)) { + switch (__bch2_bucket_alloc_set(c, wp, nr_replicas, + reserve, devs, cl)) { case ALLOC_SUCCESS: if (waiting) closure_wake_up(&c->freelist_wait); @@ -1214,10 +1271,6 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp, return -EROFS; case FREELIST_EMPTY: - if (!cl || waiting) - trace_freelist_empty_fail(c, - reserve, cl); - if (!cl) return -ENOSPC; @@ -1228,226 +1281,89 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp, closure_wait(&c->freelist_wait, cl); waiting = true; break; + case OPEN_BUCKETS_EMPTY: + return cl ? -EAGAIN : -ENOSPC; default: BUG(); } } } -/* Open buckets: */ - -/* - * Open buckets represent one or more buckets (on multiple devices) that are - * currently being allocated from. They serve two purposes: - * - * - They track buckets that have been partially allocated, allowing for - * sub-bucket sized allocations - they're used by the sector allocator below - * - * - They provide a reference to the buckets they own that mark and sweep GC - * can find, until the new allocation has a pointer to it inserted into the - * btree - * - * When allocating some space with the sector allocator, the allocation comes - * with a reference to an open bucket - the caller is required to put that - * reference _after_ doing the index update that makes its allocation reachable. 
- */ +/* Sector allocator */ -void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) +static void writepoint_drop_ptrs(struct bch_fs *c, + struct write_point *wp, + struct bch_devs_mask *devs, + unsigned nr_ptrs_dislike) { - const struct open_bucket_ptr *ptr; - u8 new_ob; + int i; - if (!atomic_dec_and_test(&ob->pin)) + if (!nr_ptrs_dislike) return; - down_read(&c->alloc_gc_lock); - spin_lock(&ob->lock); - - open_bucket_for_each_ptr(ob, ptr) { - struct bch_dev *ca = c->devs[ptr->ptr.dev]; + for (i = wp->nr_ptrs - 1; i >= 0; --i) { + struct open_bucket *ob = wp->ptrs[i]; + struct bch_dev *ca = c->devs[ob->ptr.dev]; - if (ptr->sectors_free) { - /* - * This is a ptr to a bucket that still has free space, - * but we don't want to use it - */ + if (nr_ptrs_dislike && !test_bit(ob->ptr.dev, devs->d)) { BUG_ON(ca->open_buckets_partial_nr >= ARRAY_SIZE(ca->open_buckets_partial)); - spin_lock(&ca->freelist_lock); - ca->open_buckets_partial[ca->open_buckets_partial_nr++] - = *ptr; - spin_unlock(&ca->freelist_lock); - } else { - bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), false); - } - } - ob->nr_ptrs = 0; - - spin_unlock(&ob->lock); - up_read(&c->alloc_gc_lock); - - new_ob = ob->new_ob; - ob->new_ob = 0; - - spin_lock(&c->open_buckets_lock); - ob->freelist = c->open_buckets_freelist; - c->open_buckets_freelist = ob - c->open_buckets; - c->open_buckets_nr_free++; - spin_unlock(&c->open_buckets_lock); - - closure_wake_up(&c->open_buckets_wait); - - if (new_ob) - bch2_open_bucket_put(c, c->open_buckets + new_ob); -} - -static struct open_bucket *bch2_open_bucket_get(struct bch_fs *c, - unsigned nr_reserved, - struct closure *cl) -{ - struct open_bucket *ret; - - spin_lock(&c->open_buckets_lock); - - if (c->open_buckets_nr_free > nr_reserved) { - BUG_ON(!c->open_buckets_freelist); - - ret = c->open_buckets + c->open_buckets_freelist; - c->open_buckets_freelist = ret->freelist; - atomic_set(&ret->pin, 1); /* XXX */ + spin_lock(&c->freelist_lock); + ob->on_partial_list = true; + ca->open_buckets_partial[ca->open_buckets_partial_nr++] = + ob - c->open_buckets; + spin_unlock(&c->freelist_lock); - BUG_ON(ret->new_ob); - BUG_ON(ret->nr_ptrs); - - c->open_buckets_nr_free--; - trace_open_bucket_alloc(c, cl); - } else { - trace_open_bucket_alloc_fail(c, cl); - - if (cl) { - closure_wait(&c->open_buckets_wait, cl); - ret = ERR_PTR(-EAGAIN); - } else - ret = ERR_PTR(-ENOSPC); - } - - spin_unlock(&c->open_buckets_lock); - - return ret; -} - -static unsigned open_bucket_sectors_free(struct bch_fs *c, - struct open_bucket *ob, - unsigned nr_replicas) -{ - unsigned sectors_free = UINT_MAX; - struct open_bucket_ptr *ptr; - - open_bucket_for_each_ptr(ob, ptr) - sectors_free = min(sectors_free, ptr->sectors_free); - - return sectors_free != UINT_MAX ? 
sectors_free : 0; -} - -static void open_bucket_move_ptrs(struct bch_fs *c, - struct open_bucket *dst, - struct open_bucket *src, - struct bch_devs_mask *devs, - unsigned nr_ptrs_dislike) -{ - bool moved_ptr = false; - int i; - - down_read(&c->alloc_gc_lock); - - if (dst < src) { - spin_lock(&dst->lock); - spin_lock_nested(&src->lock, 1); - } else { - spin_lock(&src->lock); - spin_lock_nested(&dst->lock, 1); - } + closure_wake_up(&c->open_buckets_wait); + closure_wake_up(&c->freelist_wait); - for (i = src->nr_ptrs - 1; i >= 0; --i) { - if (!src->ptrs[i].sectors_free) { - /* - * Don't do anything: leave the ptr on the old - * open_bucket for gc to find - */ - } else if (nr_ptrs_dislike && - !test_bit(src->ptrs[i].ptr.dev, devs->d)) { - /* - * We don't want this pointer; bch2_open_bucket_put() - * will stick it on ca->open_buckets_partial to be - * reused - */ + array_remove_item(wp->ptrs, wp->nr_ptrs, i); --nr_ptrs_dislike; - } else { - BUG_ON(dst->nr_ptrs >= ARRAY_SIZE(dst->ptrs)); - - dst->ptrs[dst->nr_ptrs++] = src->ptrs[i]; - - src->nr_ptrs--; - memmove(&src->ptrs[i], - &src->ptrs[i + 1], - (src->nr_ptrs - i) * sizeof(src->ptrs[0])); - - moved_ptr = true; } } - - if (moved_ptr) { - BUG_ON(src->new_ob); - - atomic_inc(&dst->pin); - src->new_ob = dst - c->open_buckets; - } - - spin_unlock(&dst->lock); - spin_unlock(&src->lock); - up_read(&c->alloc_gc_lock); } -static void verify_not_stale(struct bch_fs *c, const struct open_bucket *ob) +static void verify_not_stale(struct bch_fs *c, const struct write_point *wp) { #ifdef CONFIG_BCACHEFS_DEBUG - const struct open_bucket_ptr *ptr; + struct open_bucket *ob; + unsigned i; - open_bucket_for_each_ptr(ob, ptr) { - struct bch_dev *ca = c->devs[ptr->ptr.dev]; + writepoint_for_each_ptr(wp, ob, i) { + struct bch_dev *ca = c->devs[ob->ptr.dev]; - BUG_ON(ptr_stale(ca, &ptr->ptr)); + BUG_ON(ptr_stale(ca, &ob->ptr)); } #endif } -/* Sector allocator */ - static int open_bucket_add_buckets(struct bch_fs *c, - struct write_point *wp, struct bch_devs_mask *_devs, - struct open_bucket *ob, + struct write_point *wp, + struct bch_devs_list *devs_have, unsigned nr_replicas, enum alloc_reserve reserve, struct closure *cl) { struct bch_devs_mask devs = c->rw_devs[wp->type]; - struct open_bucket_ptr *ptr; + struct open_bucket *ob; + unsigned i; - if (ob->nr_ptrs >= nr_replicas) + if (wp->nr_ptrs >= nr_replicas) return 0; + /* Don't allocate from devices we already have pointers to: */ + for (i = 0; i < devs_have->nr; i++) + __clear_bit(devs_have->devs[i], devs.d); + + writepoint_for_each_ptr(wp, ob, i) + __clear_bit(ob->ptr.dev, devs.d); + if (_devs) bitmap_and(devs.d, devs.d, _devs->d, BCH_SB_MEMBERS_MAX); - /* Don't allocate from devices we already have pointers to: */ - open_bucket_for_each_ptr(ob, ptr) - if (ptr->sectors_free) - __clear_bit(ptr->ptr.dev, devs.d); - - return bch2_bucket_alloc_set(c, wp, ob, nr_replicas, - reserve, &devs, cl); + return bch2_bucket_alloc_set(c, wp, nr_replicas, reserve, &devs, cl); } static struct write_point *__writepoint_find(struct hlist_head *head, @@ -1455,15 +1371,9 @@ static struct write_point *__writepoint_find(struct hlist_head *head, { struct write_point *wp; - hlist_for_each_entry_rcu(wp, head, node) { - if (wp->write_point == write_point) - continue; - - mutex_lock(&wp->lock); + hlist_for_each_entry_rcu(wp, head, node) if (wp->write_point == write_point) return wp; - mutex_unlock(&wp->lock); - } return NULL; } @@ -1478,47 +1388,49 @@ static struct hlist_head *writepoint_hash(struct bch_fs *c, } static struct 
write_point *writepoint_find(struct bch_fs *c, - enum bch_data_type data_type, unsigned long write_point) { - struct write_point *wp, *oldest = NULL; + struct write_point *wp, *oldest; struct hlist_head *head; - switch (data_type) { - case BCH_DATA_BTREE: - wp = &c->btree_write_point; + if (!(write_point & 1UL)) { + wp = (struct write_point *) write_point; mutex_lock(&wp->lock); return wp; - case BCH_DATA_USER: - break; - default: - BUG(); } head = writepoint_hash(c, write_point); +restart_find: wp = __writepoint_find(head, write_point); - if (wp) - goto out; - - mutex_lock(&c->write_points_hash_lock); - wp = __writepoint_find(head, write_point); - if (wp) - goto out_unlock; + if (wp) { +lock_wp: + mutex_lock(&wp->lock); + if (wp->write_point == write_point) + goto out; + mutex_unlock(&wp->lock); + goto restart_find; + } + oldest = NULL; for (wp = c->write_points; wp < c->write_points + ARRAY_SIZE(c->write_points); wp++) if (!oldest || time_before64(wp->last_used, oldest->last_used)) oldest = wp; - wp = oldest; - BUG_ON(!wp); + mutex_lock(&oldest->lock); + mutex_lock(&c->write_points_hash_lock); + wp = __writepoint_find(head, write_point); + if (wp && wp != oldest) { + mutex_unlock(&c->write_points_hash_lock); + mutex_unlock(&oldest->lock); + goto lock_wp; + } - mutex_lock(&wp->lock); + wp = oldest; hlist_del_rcu(&wp->node); wp->write_point = write_point; hlist_add_head_rcu(&wp->node, head); -out_unlock: mutex_unlock(&c->write_points_hash_lock); out: wp->last_used = sched_clock(); @@ -1529,97 +1441,81 @@ out: * Get us an open_bucket we can allocate from, return with it locked: */ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, - enum bch_data_type data_type, - struct bch_devs_mask *devs, - unsigned long write_point, - unsigned nr_replicas, - unsigned nr_replicas_required, - enum alloc_reserve reserve, - unsigned flags, - struct closure *cl) + struct bch_devs_mask *devs, + struct write_point_specifier write_point, + struct bch_devs_list *devs_have, + unsigned nr_replicas, + unsigned nr_replicas_required, + enum alloc_reserve reserve, + unsigned flags, + struct closure *cl) { - struct open_bucket *ob; struct write_point *wp; - struct open_bucket_ptr *ptr; - unsigned open_buckets_reserved = data_type == BCH_DATA_BTREE - ? 0 : BTREE_NODE_RESERVE; - unsigned nr_ptrs_empty = 0, nr_ptrs_dislike = 0; + struct open_bucket *ob; + unsigned i, nr_ptrs_dislike = 0, nr_ptrs_have = 0; int ret; - BUG_ON(!nr_replicas); + BUG_ON(!nr_replicas || !nr_replicas_required); - wp = writepoint_find(c, data_type, write_point); - BUG_ON(wp->type != data_type); - - wp->last_used = sched_clock(); - - ob = wp->ob; + wp = writepoint_find(c, write_point.v); /* does ob have ptrs we don't need? */ - open_bucket_for_each_ptr(ob, ptr) { - if (!ptr->sectors_free) - nr_ptrs_empty++; - else if (devs && !test_bit(ptr->ptr.dev, devs->d)) + writepoint_for_each_ptr(wp, ob, i) + if (bch2_dev_list_has_dev(*devs_have, ob->ptr.dev)) + nr_ptrs_have++; + else if (devs && !test_bit(ob->ptr.dev, devs->d)) nr_ptrs_dislike++; - } - ret = open_bucket_add_buckets(c, wp, devs, ob, - nr_replicas + nr_ptrs_empty + nr_ptrs_dislike, + ret = open_bucket_add_buckets(c, devs, wp, devs_have, + nr_replicas + nr_ptrs_have + nr_ptrs_dislike, reserve, cl); if (ret && ret != -EROFS) goto err; - if (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) - goto alloc_done; - - /* - * XXX: - * Should this allocation be _forced_ to used the specified device (e.g. - * internal migration), or should we fall back to allocating from all - * devices? 
- */ - ret = open_bucket_add_buckets(c, wp, NULL, ob, - nr_replicas + nr_ptrs_empty, - reserve, cl); - if (ret && ret != -EROFS) - goto err; -alloc_done: - if (ob->nr_ptrs - nr_ptrs_empty - - ((flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) ? nr_ptrs_dislike : 0) - < nr_replicas_required) { + if (wp->nr_ptrs < + nr_ptrs_have + nr_ptrs_dislike + nr_replicas_required) { ret = -EROFS; goto err; } + if ((int) wp->nr_ptrs - nr_ptrs_dislike < nr_replicas) + nr_ptrs_dislike = clamp_t(int, wp->nr_ptrs - nr_replicas, + 0, nr_ptrs_dislike); + + /* Remove pointers we don't want to use: */ + writepoint_drop_ptrs(c, wp, devs, nr_ptrs_dislike); + /* - * If ob->sectors_free == 0, one or more of the buckets ob points to is - * full. We can't drop pointers from an open bucket - garbage collection - * still needs to find them; instead, we must allocate a new open bucket - * and copy any pointers to non-full buckets into the new open bucket. + * Move pointers to devices we already have to end of open bucket + * pointer list - note that removing pointers we don't want to use might + * have changed nr_ptrs_have: */ - BUG_ON(ob->nr_ptrs - nr_ptrs_empty - nr_replicas > nr_ptrs_dislike); - nr_ptrs_dislike = ob->nr_ptrs - nr_ptrs_empty - nr_replicas; - - if (nr_ptrs_empty || nr_ptrs_dislike) { - ob = bch2_open_bucket_get(c, open_buckets_reserved, cl); - if (IS_ERR(ob)) { - ret = PTR_ERR(ob); - goto err; - } + if (nr_ptrs_have) { + i = nr_ptrs_have = 0; + while (i < wp->nr_ptrs - nr_ptrs_have) + if (bch2_dev_list_has_dev(*devs_have, wp->ptrs[i]->ptr.dev)) { + nr_ptrs_have++; + swap(wp->ptrs[i], wp->ptrs[wp->nr_ptrs - nr_ptrs_have]); + } else { + i++; + } + } - /* Remove pointers we don't want to use: */ + wp->nr_ptrs_can_use = + min_t(unsigned, nr_replicas, wp->nr_ptrs - nr_ptrs_have); - open_bucket_move_ptrs(c, ob, wp->ob, devs, nr_ptrs_dislike); - bch2_open_bucket_put(c, wp->ob); - wp->ob = ob; - } + BUG_ON(wp->nr_ptrs_can_use < nr_replicas_required || + wp->nr_ptrs_can_use > wp->nr_ptrs); + + wp->sectors_free = UINT_MAX; - BUG_ON(ob->nr_ptrs < nr_replicas_required); + for (i = 0; i < wp->nr_ptrs_can_use; i++) + wp->sectors_free = min(wp->sectors_free, + wp->ptrs[i]->sectors_free); - wp->sectors_free = open_bucket_sectors_free(c, ob, nr_replicas); + BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); - BUG_ON(!wp->sectors_free); - verify_not_stale(c, ob); + verify_not_stale(c, wp); return wp; err: @@ -1631,31 +1527,27 @@ err: * Append pointers to the space we just allocated to @k, and mark @sectors space * as allocated out of @ob */ -void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e, - unsigned nr_replicas, struct open_bucket *ob, - unsigned sectors) +void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, + struct bkey_i_extent *e, unsigned sectors) { - struct bch_extent_ptr tmp; - struct open_bucket_ptr *ptr; + unsigned i; - /* - * We're keeping any existing pointer k has, and appending new pointers: - * __bch2_write() will only write to the pointers we add here: - */ + BUG_ON(sectors > wp->sectors_free); + wp->sectors_free -= sectors; - for (ptr = ob->ptrs; - ptr < ob->ptrs + min_t(u8, ob->nr_ptrs, nr_replicas); ptr++) { - struct bch_dev *ca = c->devs[ptr->ptr.dev]; + for (i = 0; i < wp->nr_ptrs_can_use; i++) { + struct open_bucket *ob = wp->ptrs[i]; + struct bch_dev *ca = c->devs[ob->ptr.dev]; + struct bch_extent_ptr tmp = ob->ptr; - EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ptr->ptr.dev)); + EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), 
ob->ptr.dev)); - tmp = ptr->ptr; tmp.cached = bkey_extent_is_cached(&e->k); - tmp.offset += ca->mi.bucket_size - ptr->sectors_free; + tmp.offset += ca->mi.bucket_size - ob->sectors_free; extent_ptr_append(e, tmp); - BUG_ON(sectors > ptr->sectors_free); - ptr->sectors_free -= sectors; + BUG_ON(sectors > ob->sectors_free); + ob->sectors_free -= sectors; } } @@ -1665,76 +1557,20 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e, */ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) { - struct open_bucket *ob = wp->ob, *new_ob = NULL; - struct open_bucket_ptr *ptr; - bool empty = false; - - open_bucket_for_each_ptr(ob, ptr) - empty |= !ptr->sectors_free; + int i; - if (empty) - new_ob = bch2_open_bucket_get(c, 0, NULL); + for (i = wp->nr_ptrs - 1; i >= 0; --i) { + struct open_bucket *ob = wp->ptrs[i]; - if (!IS_ERR_OR_NULL(new_ob)) { - /* writepoint's ref becomes our ref: */ - wp->ob = new_ob; - open_bucket_move_ptrs(c, new_ob, ob, 0, 0); - } else { - atomic_inc(&ob->pin); + if (!ob->sectors_free) { + array_remove_item(wp->ptrs, wp->nr_ptrs, i); + bch2_open_bucket_put(c, ob); + } } mutex_unlock(&wp->lock); } -/* - * Allocates some space in the cache to write to, and k to point to the newly - * allocated space, and updates k->size and k->offset (to point to the - * end of the newly allocated space). - * - * May allocate fewer sectors than @sectors, k->size indicates how many - * sectors were actually allocated. - * - * Return codes: - * - -EAGAIN: closure was added to waitlist - * - -ENOSPC: out of space and no closure provided - * - * @c - filesystem. - * @wp - write point to use for allocating sectors. - * @k - key to return the allocated space information. - * @cl - closure to wait for a bucket - */ -struct open_bucket *bch2_alloc_sectors(struct bch_fs *c, - enum bch_data_type data_type, - struct bch_devs_mask *devs, - unsigned long write_point, - struct bkey_i_extent *e, - unsigned nr_replicas, - unsigned nr_replicas_required, - enum alloc_reserve reserve, - unsigned flags, - struct closure *cl) -{ - struct write_point *wp; - struct open_bucket *ob; - - wp = bch2_alloc_sectors_start(c, data_type, devs, write_point, - nr_replicas, nr_replicas_required, - reserve, flags, cl); - if (IS_ERR_OR_NULL(wp)) - return ERR_CAST(wp); - - ob = wp->ob; - - if (e->k.size > wp->sectors_free) - bch2_key_resize(&e->k, wp->sectors_free); - - bch2_alloc_sectors_append_ptrs(c, e, nr_replicas, ob, e->k.size); - - bch2_alloc_sectors_done(c, wp); - - return ob; -} - /* Startup/shutdown (ro/rw): */ void bch2_recalc_capacity(struct bch_fs *c) @@ -1839,46 +1675,15 @@ set_capacity: closure_wake_up(&c->freelist_wait); } -static bool open_bucket_has_device(struct open_bucket *ob, - struct bch_dev *ca) -{ - struct open_bucket_ptr *ptr; - bool ret = false; - - spin_lock(&ob->lock); - open_bucket_for_each_ptr(ob, ptr) - ret |= ptr->ptr.dev == ca->dev_idx; - spin_unlock(&ob->lock); - - return ret; -} - static void bch2_stop_write_point(struct bch_fs *c, struct bch_dev *ca, struct write_point *wp) { - struct open_bucket *ob; - struct closure cl; + struct bch_devs_mask not_self; - closure_init_stack(&cl); -retry: - mutex_lock(&wp->lock); - if (!open_bucket_has_device(wp->ob, ca)) { - mutex_unlock(&wp->lock); - return; - } - - ob = bch2_open_bucket_get(c, 0, &cl); - if (IS_ERR(ob)) { - mutex_unlock(&wp->lock); - closure_sync(&cl); - goto retry; - - } - - open_bucket_move_ptrs(c, ob, wp->ob, &ca->self, ob->nr_ptrs); - bch2_open_bucket_put(c, wp->ob); - wp->ob = ob; + 
bitmap_complement(not_self.d, ca->self.d, BCH_SB_MEMBERS_MAX); + mutex_lock(&wp->lock); + writepoint_drop_ptrs(c, wp, ¬_self, wp->nr_ptrs); mutex_unlock(&wp->lock); } @@ -1889,9 +1694,13 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) for (ob = c->open_buckets; ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); - ob++) - if (atomic_read(&ob->pin)) - ret |= open_bucket_has_device(ob, ca); + ob++) { + spin_lock(&ob->lock); + if (ob->valid && !ob->on_partial_list && + ob->ptr.dev == ca->dev_idx) + ret = true; + spin_unlock(&ob->lock); + } return ret; } @@ -1899,13 +1708,10 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) /* device goes ro: */ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) { - struct closure cl; unsigned i; BUG_ON(ca->alloc_thread); - closure_init_stack(&cl); - /* First, remove device from allocation groups: */ clear_bit(ca->dev_idx, c->tiers[ca->mi.tier].devs.d); @@ -1920,6 +1726,9 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) /* Next, close write points that point to this device... */ for (i = 0; i < ARRAY_SIZE(c->write_points); i++) bch2_stop_write_point(c, ca, &c->write_points[i]); + + bch2_stop_write_point(c, ca, &ca->copygc_write_point); + bch2_stop_write_point(c, ca, &c->tiers[ca->mi.tier].wp); bch2_stop_write_point(c, ca, &c->btree_write_point); mutex_lock(&c->btree_reserve_cache_lock); @@ -1927,7 +1736,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) struct btree_alloc *a = &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - bch2_open_bucket_put(c, a->ob); + bch2_open_bucket_put_refs(c, &a->ob.nr, a->ob.refs); } mutex_unlock(&c->btree_reserve_cache_lock); @@ -1945,16 +1754,8 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) /* Now wait for any in flight writes: */ - while (1) { - closure_wait(&c->open_buckets_wait, &cl); - - if (!bch2_dev_has_open_write_point(c, ca)) { - closure_wake_up(&c->open_buckets_wait); - break; - } - - closure_sync(&cl); - } + closure_wait_event(&c->open_buckets_wait, + !bch2_dev_has_open_write_point(c, ca)); } /* device goes rw: */ @@ -2015,10 +1816,10 @@ void bch2_fs_allocator_init(struct bch_fs *c) { struct open_bucket *ob; struct write_point *wp; + unsigned i; mutex_init(&c->write_points_hash_lock); - init_rwsem(&c->alloc_gc_lock); - spin_lock_init(&c->open_buckets_lock); + spin_lock_init(&c->freelist_lock); bch2_prio_timer_init(c, READ); bch2_prio_timer_init(c, WRITE); @@ -2034,40 +1835,20 @@ void bch2_fs_allocator_init(struct bch_fs *c) c->open_buckets_freelist = ob - c->open_buckets; } - mutex_init(&c->btree_write_point.lock); - c->btree_write_point.type = BCH_DATA_BTREE; - c->btree_write_point.ob = bch2_open_bucket_get(c, 0, NULL); - BUG_ON(IS_ERR(c->btree_write_point.ob)); + writepoint_init(&c->btree_write_point, BCH_DATA_BTREE); + + for (i = 0; i < ARRAY_SIZE(c->tiers); i++) + writepoint_init(&c->tiers[i].wp, BCH_DATA_USER); for (wp = c->write_points; wp < c->write_points + ARRAY_SIZE(c->write_points); wp++) { - mutex_init(&wp->lock); - wp->type = BCH_DATA_USER; - wp->ob = bch2_open_bucket_get(c, 0, NULL); - wp->last_used = sched_clock(); + writepoint_init(wp, BCH_DATA_USER); + wp->last_used = sched_clock(); wp->write_point = (unsigned long) wp; hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); - - BUG_ON(IS_ERR(wp->ob)); } c->pd_controllers_update_seconds = 5; INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); - - 
spin_lock_init(&c->foreground_write_pd_lock); - bch2_pd_controller_init(&c->foreground_write_pd); - /* - * We do not want the write rate to have an effect on the computed - * rate, for two reasons: - * - * We do not call bch2_ratelimit_delay() at all if the write rate - * exceeds 1GB/s. In this case, the PD controller will think we are - * not "keeping up" and not change the rate. - */ - c->foreground_write_pd.backpressure = 0; - init_timer(&c->foreground_write_wakeup); - - c->foreground_write_wakeup.data = (unsigned long) c; - c->foreground_write_wakeup.function = bch2_wake_delayed_writes; } diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h index 1ea747d..8dffb86 100644 --- a/libbcachefs/alloc.h +++ b/libbcachefs/alloc.h @@ -8,7 +8,7 @@ struct bkey; struct bucket; struct bch_dev; struct bch_fs; -struct dev_group; +struct bch_devs_List; struct dev_alloc_list { unsigned nr; @@ -24,33 +24,61 @@ void bch2_wp_rescale(struct bch_fs *, struct bch_dev *, int bch2_alloc_read(struct bch_fs *, struct list_head *); int bch2_alloc_replay_key(struct bch_fs *, struct bpos); -long bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve); +enum bucket_alloc_ret { + ALLOC_SUCCESS = 0, + OPEN_BUCKETS_EMPTY = -1, + FREELIST_EMPTY = -2, /* Allocator thread not keeping up */ + NO_DEVICES = -3, /* -EROFS */ +}; + +int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool, + struct closure *); + +void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); + +static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) +{ + if (atomic_dec_and_test(&ob->pin)) + __bch2_open_bucket_put(c, ob); +} + +static inline void bch2_open_bucket_put_refs(struct bch_fs *c, u8 *nr, u8 *refs) +{ + unsigned i; + + for (i = 0; i < *nr; i++) + bch2_open_bucket_put(c, c->open_buckets + refs[i]); + + *nr = 0; +} + +static inline void bch2_open_bucket_get(struct bch_fs *c, + struct write_point *wp, + u8 *nr, u8 *refs) +{ + unsigned i; -void bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); + for (i = 0; i < wp->nr_ptrs_can_use; i++) { + struct open_bucket *ob = wp->ptrs[i]; + + atomic_inc(&ob->pin); + refs[(*nr)++] = ob - c->open_buckets; + } +} struct write_point *bch2_alloc_sectors_start(struct bch_fs *, - enum bch_data_type, struct bch_devs_mask *, - unsigned long, + struct write_point_specifier, + struct bch_devs_list *, unsigned, unsigned, enum alloc_reserve, unsigned, struct closure *); -void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct bkey_i_extent *, - unsigned, struct open_bucket *, unsigned); +void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, + struct bkey_i_extent *, unsigned); void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); -struct open_bucket *bch2_alloc_sectors(struct bch_fs *, - enum bch_data_type, - struct bch_devs_mask *, - unsigned long, - struct bkey_i_extent *, - unsigned, unsigned, - enum alloc_reserve, - unsigned, - struct closure *); - static inline void bch2_wake_allocator(struct bch_dev *ca) { struct task_struct *p; @@ -61,10 +89,20 @@ static inline void bch2_wake_allocator(struct bch_dev *ca) rcu_read_unlock(); } -#define open_bucket_for_each_ptr(_ob, _ptr) \ - for ((_ptr) = (_ob)->ptrs; \ - (_ptr) < (_ob)->ptrs + (_ob)->nr_ptrs; \ - (_ptr)++) +#define writepoint_for_each_ptr(_wp, _ob, _i) \ + for ((_i) = 0; \ + (_i) < (_wp)->nr_ptrs && ((_ob) = (_wp)->ptrs[_i], true); \ + (_i)++) + +static inline struct write_point_specifier writepoint_hashed(unsigned long v) +{ + 
return (struct write_point_specifier) { .v = v | 1 }; +} + +static inline struct write_point_specifier writepoint_ptr(struct write_point *wp) +{ + return (struct write_point_specifier) { .v = (unsigned long) wp }; +} void bch2_recalc_capacity(struct bch_fs *); @@ -74,6 +112,13 @@ void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_stop(struct bch_dev *); int bch2_dev_allocator_start(struct bch_dev *); +static inline void writepoint_init(struct write_point *wp, + enum bch_data_type type) +{ + mutex_init(&wp->lock); + wp->type = type; +} + void bch2_fs_allocator_init(struct bch_fs *); extern const struct bkey_ops bch2_bkey_alloc_ops; diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h index c48d0aa..90123ff 100644 --- a/libbcachefs/alloc_types.h +++ b/libbcachefs/alloc_types.h @@ -47,19 +47,14 @@ enum alloc_reserve { #define OPEN_BUCKETS_COUNT 256 #define WRITE_POINT_COUNT 32 -struct open_bucket_ptr { - struct bch_extent_ptr ptr; - unsigned sectors_free; -}; - struct open_bucket { spinlock_t lock; atomic_t pin; u8 freelist; - u8 new_ob; - u8 nr_ptrs; - - struct open_bucket_ptr ptrs[BCH_REPLICAS_MAX * 2]; + bool valid; + bool on_partial_list; + unsigned sectors_free; + struct bch_extent_ptr ptr; }; struct write_point { @@ -69,13 +64,23 @@ struct write_point { unsigned long write_point; enum bch_data_type type; + u8 nr_ptrs; + /* + * number of pointers in @ob we can't use, because we already had + * pointers to those devices: + */ + u8 nr_ptrs_can_use; /* calculated based on how many pointers we're actually going to use: */ unsigned sectors_free; - struct open_bucket *ob; + struct open_bucket *ptrs[BCH_REPLICAS_MAX * 2]; u64 next_alloc[BCH_SB_MEMBERS_MAX]; }; +struct write_point_specifier { + unsigned long v; +}; + struct alloc_heap_entry { size_t bucket; unsigned long key; diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 58d4723..b679dd1 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -251,9 +251,6 @@ do { \ BCH_DEBUG_PARAM(debug_check_bkeys, \ "Run bkey_debugcheck (primarily checking GC/allocation "\ "information) when iterating over keys") \ - BCH_DEBUG_PARAM(version_stress_test, \ - "Assigns random version numbers to newly written " \ - "extents, to test overlapping extent cases") \ BCH_DEBUG_PARAM(verify_btree_ondisk, \ "Reread btree nodes at various points to verify the " \ "mergesort in the read path against modifications " \ @@ -310,8 +307,9 @@ struct crypto_blkcipher; struct crypto_ahash; enum gc_phase { - GC_PHASE_SB_METADATA = BTREE_ID_NR + 1, + GC_PHASE_SB = BTREE_ID_NR + 1, GC_PHASE_PENDING_DELETE, + GC_PHASE_ALLOC, GC_PHASE_DONE }; @@ -321,30 +319,6 @@ struct gc_pos { unsigned level; }; -struct bch_member_cpu { - u64 nbuckets; /* device size */ - u16 first_bucket; /* index of first bucket used */ - u16 bucket_size; /* sectors */ - u8 state; - u8 tier; - u8 replacement; - u8 discard; - u8 data_allowed; - u8 valid; -}; - -struct bch_replicas_cpu_entry { - u8 data_type; - u8 devs[BCH_SB_MEMBERS_MAX / 8]; -}; - -struct bch_replicas_cpu { - struct rcu_head rcu; - unsigned nr; - unsigned entry_size; - struct bch_replicas_cpu_entry entries[]; -}; - struct io_count { u64 sectors[2][BCH_DATA_NR]; }; @@ -372,7 +346,7 @@ struct bch_dev { struct bch_devs_mask self; - /* biosets used in cloned bios for replicas and moving_gc */ + /* biosets used in cloned bios for writing multiple replicas */ struct bio_set replica_set; struct task_struct *alloc_thread; @@ -392,7 +366,7 @@ struct bch_dev { unsigned 
nr_invalidated; bool alloc_thread_started; - struct open_bucket_ptr open_buckets_partial[BCH_REPLICAS_MAX * WRITE_POINT_COUNT]; + u8 open_buckets_partial[OPEN_BUCKETS_COUNT]; unsigned open_buckets_partial_nr; size_t fifo_last_bucket; @@ -422,18 +396,20 @@ struct bch_dev { bool allocator_invalidating_data; alloc_heap alloc_heap; - bucket_heap copygc_heap; - /* Moving GC: */ - struct task_struct *moving_gc_read; - - struct bch_pd_controller moving_gc_pd; + /* Copying GC: */ + struct task_struct *copygc_thread; + copygc_heap copygc_heap; + struct bch_pd_controller copygc_pd; + struct write_point copygc_write_point; struct journal_device journal; struct work_struct io_error_work; /* The rest of this all shows up in sysfs */ + atomic_t latency[2]; + struct io_count __percpu *io_done; }; @@ -473,6 +449,7 @@ struct bch_tier { struct bch_pd_controller pd; struct bch_devs_mask devs; + struct write_point wp; }; enum bch_fs_state { @@ -557,10 +534,7 @@ struct bch_fs { * when allocating btree reserves fail halfway through) - instead, we * can stick them here: */ - struct btree_alloc { - struct open_bucket *ob; - BKEY_PADDED(k); - } btree_reserve_cache[BTREE_NODE_RESERVE * 2]; + struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2]; unsigned btree_reserve_cache_nr; struct mutex btree_reserve_cache_lock; @@ -573,15 +547,9 @@ struct bch_fs { struct workqueue_struct *copygc_wq; /* ALLOCATION */ - struct rw_semaphore alloc_gc_lock; - struct bch_pd_controller foreground_write_pd; struct delayed_work pd_controllers_update; unsigned pd_controllers_update_seconds; - spinlock_t foreground_write_pd_lock; - struct bch_write_op *write_wait_head; - struct bch_write_op *write_wait_tail; - struct timer_list foreground_write_wakeup; /* * These contain all r/w devices - i.e. devices we can currently @@ -622,8 +590,8 @@ struct bch_fs { struct io_clock io_clock[2]; - /* SECTOR ALLOCATOR */ - spinlock_t open_buckets_lock; + /* ALLOCATOR */ + spinlock_t freelist_lock; u8 open_buckets_freelist; u8 open_buckets_nr_free; struct closure_waitlist open_buckets_wait; @@ -635,15 +603,6 @@ struct bch_fs { struct hlist_head write_points_hash[WRITE_POINT_COUNT]; struct mutex write_points_hash_lock; - /* - * This write point is used for migrating data off a device - * and can point to any other device. - * We can't use the normal write points because those will - * gang up n replicas, and for migration we want only one new - * replica. 
- */ - struct write_point migration_write_point; - /* GARBAGE COLLECTION */ struct task_struct *gc_thread; atomic_t kick_gc; @@ -688,6 +647,11 @@ struct bch_fs { atomic64_t key_version; + /* VFS IO PATH - fs-io.c */ + struct bio_set writepage_bioset; + struct bio_set dio_write_bioset; + struct bio_set dio_read_bioset; + struct bio_list btree_write_error_list; struct work_struct btree_write_error_work; spinlock_t btree_write_error_lock; @@ -728,19 +692,14 @@ struct bch_fs { /* The rest of this all shows up in sysfs */ atomic_long_t read_realloc_races; + atomic_long_t extent_migrate_done; + atomic_long_t extent_migrate_raced; unsigned btree_gc_periodic:1; - unsigned foreground_write_ratelimit_enabled:1; unsigned copy_gc_enabled:1; unsigned tiering_enabled:1; unsigned tiering_percent; - /* - * foreground writes will be throttled when the number of free - * buckets is below this percentage - */ - unsigned foreground_target_percent; - #define BCH_DEBUG_PARAM(name, description) bool name; BCH_DEBUG_PARAMS_ALL() #undef BCH_DEBUG_PARAM diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 16a1edd..2dc9a7e 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -344,11 +344,13 @@ struct bch_csum { enum bch_csum_type { BCH_CSUM_NONE = 0, - BCH_CSUM_CRC32C = 1, - BCH_CSUM_CRC64 = 2, + BCH_CSUM_CRC32C_NONZERO = 1, + BCH_CSUM_CRC64_NONZERO = 2, BCH_CSUM_CHACHA20_POLY1305_80 = 3, BCH_CSUM_CHACHA20_POLY1305_128 = 4, - BCH_CSUM_NR = 5, + BCH_CSUM_CRC32C = 5, + BCH_CSUM_CRC64 = 6, + BCH_CSUM_NR = 7, }; static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) @@ -550,7 +552,7 @@ BKEY_VAL_TYPE(reservation, BCH_RESERVATION); /* Maximum possible size of an entire extent value: */ /* There's a hack in the keylist code that needs to be fixed.. */ #define BKEY_EXTENT_VAL_U64s_MAX \ - (BKEY_EXTENT_PTR_U64s_MAX * BCH_REPLICAS_MAX) + (BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) /* * Maximum possible size of an entire extent, key + value: */ #define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) @@ -734,11 +736,13 @@ BKEY_VAL_TYPE(alloc, BCH_ALLOC); /* * Version 8: BCH_SB_ENCODED_EXTENT_MAX_BITS * BCH_MEMBER_DATA_ALLOWED + * Version 9: incompatible extent nonce change */ #define BCH_SB_VERSION_MIN 7 #define BCH_SB_VERSION_EXTENT_MAX 8 -#define BCH_SB_VERSION_MAX 8 +#define BCH_SB_VERSION_EXTENT_NONCE_V1 9 +#define BCH_SB_VERSION_MAX 9 #define BCH_SB_SECTOR 8 #define BCH_SB_LABEL_SIZE 32 diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c index d33bc4e..73089a9 100644 --- a/libbcachefs/bkey.c +++ b/libbcachefs/bkey.c @@ -4,6 +4,14 @@ #include "bset.h" #include "util.h" +#undef EBUG_ON + +#ifdef DEBUG_BKEYS +#define EBUG_ON(cond) BUG_ON(cond) +#else +#define EBUG_ON(cond) +#endif + const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h index a1337bf..c195cd9 100644 --- a/libbcachefs/bset.h +++ b/libbcachefs/bset.h @@ -146,6 +146,17 @@ * first key in that range of bytes again. 
*/ +extern bool bch2_expensive_debug_checks; + +static inline bool btree_keys_expensive_checks(const struct btree *b) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + return bch2_expensive_debug_checks || *b->expensive_debug_checks; +#else + return false; +#endif +} + struct btree_node_iter; struct btree_node_iter_set; @@ -188,7 +199,7 @@ bkey_unpack_key_format_checked(const struct btree *b, compiled_unpack_fn unpack_fn = b->aux_data; unpack_fn(&dst, src); - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + if (btree_keys_expensive_checks(b)) { struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); /* @@ -260,17 +271,6 @@ static inline struct bkey_s __bkey_disassemble(struct btree *b, #define for_each_bset(_b, _t) \ for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) -extern bool bch2_expensive_debug_checks; - -static inline bool btree_keys_expensive_checks(struct btree *b) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - return bch2_expensive_debug_checks || *b->expensive_debug_checks; -#else - return false; -#endif -} - static inline bool bset_has_ro_aux_tree(struct bset_tree *t) { return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index b090196..1198fe3 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -111,19 +112,35 @@ u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k) /* * For runtime mark and sweep: */ -static u8 bch2_btree_mark_key(struct bch_fs *c, enum bkey_type type, - struct bkey_s_c k, unsigned flags) +static u8 bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type, + struct bkey_s_c k, unsigned flags) { + struct gc_pos pos = { 0 }; + struct bch_fs_usage *stats; + u8 ret = 0; + + preempt_disable(); + stats = this_cpu_ptr(c->usage_percpu); switch (type) { case BKEY_TYPE_BTREE: - bch2_gc_mark_key(c, k, c->opts.btree_node_size, true, flags); - return 0; + bch2_mark_key(c, k, c->opts.btree_node_size, true, pos, stats, + 0, flags| + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); + break; case BKEY_TYPE_EXTENTS: - bch2_gc_mark_key(c, k, k.k->size, false, flags); - return bch2_btree_key_recalc_oldest_gen(c, k); + bch2_mark_key(c, k, k.k->size, false, pos, stats, + 0, flags| + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); + ret = bch2_btree_key_recalc_oldest_gen(c, k); + break; default: BUG(); } + preempt_enable(); + + return ret; } int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, @@ -182,7 +199,7 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, max_t(u64, k.k->version.lo, atomic64_read(&c->key_version))); - bch2_btree_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC); + bch2_gc_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC); fsck_err: return ret; } @@ -200,7 +217,7 @@ static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b) btree_node_is_extents(b), &unpacked) { bch2_bkey_debugcheck(c, b, k); - stale = max(stale, bch2_btree_mark_key(c, type, k, 0)); + stale = max(stale, bch2_gc_mark_key(c, type, k, 0)); } return stale; @@ -267,123 +284,79 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id) mutex_lock(&c->btree_root_lock); b = c->btree_roots[btree_id].b; - bch2_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0); + bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0); gc_pos_set(c, gc_pos_btree_root(b->btree_id)); mutex_unlock(&c->btree_root_lock); return 0; } -static void 
bch2_mark_allocator_buckets(struct bch_fs *c) -{ - struct bch_dev *ca; - struct open_bucket *ob; - const struct open_bucket_ptr *ptr; - size_t i, j, iter; - unsigned ci; - - down_write(&c->alloc_gc_lock); - - for_each_member_device(ca, c, ci) { - spin_lock(&ca->freelist_lock); - - fifo_for_each_entry(i, &ca->free_inc, iter) - bch2_mark_alloc_bucket(ca, &ca->buckets[i], true); - - for (j = 0; j < RESERVE_NR; j++) - fifo_for_each_entry(i, &ca->free[j], iter) - bch2_mark_alloc_bucket(ca, &ca->buckets[i], true); - - for (ptr = ca->open_buckets_partial; - ptr < ca->open_buckets_partial + ca->open_buckets_partial_nr; - ptr++) - bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), true); - - spin_unlock(&ca->freelist_lock); - } - - for (ob = c->open_buckets; - ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); - ob++) { - spin_lock(&ob->lock); - open_bucket_for_each_ptr(ob, ptr) { - ca = c->devs[ptr->ptr.dev]; - bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), true); - } - spin_unlock(&ob->lock); - } - - up_write(&c->alloc_gc_lock); -} - -static void mark_metadata_sectors(struct bch_dev *ca, u64 start, u64 end, - enum bucket_data_type type) +static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, + u64 start, u64 end, + enum bucket_data_type type, + unsigned flags) { u64 b = sector_to_bucket(ca, start); do { - bch2_mark_metadata_bucket(ca, ca->buckets + b, type, true); + bch2_mark_metadata_bucket(c, ca, ca->buckets + b, type, + gc_phase(GC_PHASE_SB), flags); b++; } while (b < sector_to_bucket(ca, end)); } -static void bch2_dev_mark_superblocks(struct bch_dev *ca) +void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, + unsigned flags) { struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; unsigned i; + u64 b; + + lockdep_assert_held(&c->sb_lock); for (i = 0; i < layout->nr_superblocks; i++) { if (layout->sb_offset[i] == BCH_SB_SECTOR) - mark_metadata_sectors(ca, 0, BCH_SB_SECTOR, - BUCKET_SB); + mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, + BUCKET_SB, flags); - mark_metadata_sectors(ca, + mark_metadata_sectors(c, ca, layout->sb_offset[i], layout->sb_offset[i] + (1 << layout->sb_max_size_bits), - BUCKET_SB); + BUCKET_SB, flags); } -} - -/* - * Mark non btree metadata - prios, journal - */ -void bch2_mark_dev_metadata(struct bch_fs *c, struct bch_dev *ca) -{ - unsigned i; - u64 b; - - lockdep_assert_held(&c->sb_lock); - - bch2_dev_mark_superblocks(ca); spin_lock(&c->journal.lock); for (i = 0; i < ca->journal.nr; i++) { b = ca->journal.buckets[i]; - bch2_mark_metadata_bucket(ca, ca->buckets + b, - BUCKET_JOURNAL, true); + bch2_mark_metadata_bucket(c, ca, ca->buckets + b, + BUCKET_JOURNAL, + gc_phase(GC_PHASE_SB), flags); } spin_unlock(&c->journal.lock); } -static void bch2_mark_metadata(struct bch_fs *c) +static void bch2_mark_superblocks(struct bch_fs *c) { struct bch_dev *ca; unsigned i; mutex_lock(&c->sb_lock); - gc_pos_set(c, gc_phase(GC_PHASE_SB_METADATA)); + gc_pos_set(c, gc_phase(GC_PHASE_SB)); for_each_online_member(ca, c, i) - bch2_mark_dev_metadata(c, ca); + bch2_mark_dev_superblock(c, ca, + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); mutex_unlock(&c->sb_lock); } /* Also see bch2_pending_btree_node_free_insert_done() */ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) { + struct gc_pos pos = { 0 }; struct bch_fs_usage stats = { 0 }; struct btree_update *as; struct pending_btree_node_free *d; @@ -393,10 +366,11 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) 
for_each_pending_btree_node_free(c, as, d) if (d->index_update_done) - __bch2_mark_key(c, bkey_i_to_s_c(&d->key), - c->opts.btree_node_size, true, - &stats, 0, - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); + bch2_mark_key(c, bkey_i_to_s_c(&d->key), + c->opts.btree_node_size, true, pos, + &stats, 0, + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); /* * Don't apply stats - pending deletes aren't tracked in * bch_alloc_stats: @@ -405,6 +379,51 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) mutex_unlock(&c->btree_interior_update_lock); } +static void bch2_mark_allocator_buckets(struct bch_fs *c) +{ + struct bch_dev *ca; + struct open_bucket *ob; + size_t i, j, iter; + unsigned ci; + + spin_lock(&c->freelist_lock); + gc_pos_set(c, gc_pos_alloc(c, NULL)); + + for_each_member_device(ca, c, ci) { + fifo_for_each_entry(i, &ca->free_inc, iter) + bch2_mark_alloc_bucket(c, ca, &ca->buckets[i], true, + gc_pos_alloc(c, NULL), + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); + + + + for (j = 0; j < RESERVE_NR; j++) + fifo_for_each_entry(i, &ca->free[j], iter) + bch2_mark_alloc_bucket(c, ca, &ca->buckets[i], true, + gc_pos_alloc(c, NULL), + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); + } + + spin_unlock(&c->freelist_lock); + + for (ob = c->open_buckets; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); + ob++) { + spin_lock(&ob->lock); + if (ob->valid) { + gc_pos_set(c, gc_pos_alloc(c, ob)); + ca = c->devs[ob->ptr.dev]; + bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), true, + gc_pos_alloc(c, ob), + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); + } + spin_unlock(&ob->lock); + } +} + void bch2_gc_start(struct bch_fs *c) { struct bch_dev *ca; @@ -495,9 +514,6 @@ void bch2_gc(struct bch_fs *c) bch2_gc_start(c); - /* Walk allocator's references: */ - bch2_mark_allocator_buckets(c); - /* Walk btree: */ while (c->gc_pos.phase < (int) BTREE_ID_NR) { int ret = c->btree_roots[c->gc_pos.phase].b @@ -513,8 +529,9 @@ void bch2_gc(struct bch_fs *c) gc_pos_set(c, gc_phase(c->gc_pos.phase + 1)); } - bch2_mark_metadata(c); + bch2_mark_superblocks(c); bch2_mark_pending_btree_node_frees(c); + bch2_mark_allocator_buckets(c); for_each_member_device(ca, c, i) atomic_long_set(&ca->saturated_count, 0); @@ -570,7 +587,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, struct bkey_format new_format; memset(new_nodes, 0, sizeof(new_nodes)); - bch2_keylist_init(&keylist, NULL, 0); + bch2_keylist_init(&keylist, NULL); /* Count keys that are not deleted */ for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++) @@ -1023,8 +1040,6 @@ again: if (ret) return ret; - bch2_mark_metadata(c); - if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) { if (iter++ > 2) { bch_info(c, "Unable to fix bucket gens, looping"); @@ -1043,6 +1058,8 @@ again: if (c->sb.encryption_type) atomic64_add(1 << 16, &c->key_version); + bch2_mark_superblocks(c); + gc_pos_set(c, gc_phase(GC_PHASE_DONE)); set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h index 27dcc06..4d1ab9d 100644 --- a/libbcachefs/btree_gc.h +++ b/libbcachefs/btree_gc.h @@ -13,7 +13,7 @@ int bch2_initial_gc(struct bch_fs *, struct list_head *); u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *, struct bkey_s_c); int bch2_btree_mark_key_initial(struct bch_fs *, enum bkey_type, struct bkey_s_c); -void bch2_mark_dev_metadata(struct bch_fs *, struct bch_dev *); +void bch2_mark_dev_superblock(struct 
bch_fs *, struct bch_dev *, unsigned); /* * For concurrent mark and sweep (with other index updates), we define a total @@ -88,6 +88,14 @@ static inline struct gc_pos gc_pos_btree_root(enum btree_id id) }; } +static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob) +{ + return (struct gc_pos) { + .phase = GC_PHASE_ALLOC, + .pos = POS(ob ? ob - c->open_buckets : 0, 0), + }; +} + static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos) { unsigned seq; diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index d50e9e8..38c373c 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -146,9 +146,7 @@ static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) BUG_ON(iter->data->k > iter->data->end); if (iter->data->k == iter->data->end) - memmove(&iter->data[0], - &iter->data[1], - sizeof(iter->data[0]) * --iter->used); + array_remove_item(iter->data, iter->used, 0); else sort_iter_sift(iter, cmp); } @@ -1307,6 +1305,8 @@ static void btree_node_read_endio(struct bio *bio) struct btree_read_bio *rb = container_of(bio, struct btree_read_bio, bio); + bch2_latency_acct(rb->pick.ca, rb->start_time >> 10, READ); + INIT_WORK(&rb->work, btree_node_read_work); schedule_work(&rb->work); } @@ -1471,6 +1471,8 @@ static void btree_node_write_endio(struct bio *bio) struct bch_fs *c = wbio->c; struct bch_dev *ca = wbio->ca; + bch2_latency_acct(ca, wbio->submit_time_us, WRITE); + if (bch2_dev_io_err_on(bio->bi_error, ca, "btree write") || bch2_meta_write_fault("btree")) set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed); diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index f3290f9..61165a6 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -10,6 +10,7 @@ struct btree_iter; struct btree_read_bio { struct bch_fs *c; + unsigned submit_time_us; u64 start_time; struct extent_pick_ptr pick; struct work_struct work; diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index 0c174e4..c271189 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -91,7 +91,7 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) { int lock_type = btree_node_locked_type(iter, level); - EBUG_ON(iter->flags & BTREE_ITER_UPTODATE); + EBUG_ON(!level && iter->flags & BTREE_ITER_UPTODATE); if (lock_type != BTREE_NODE_UNLOCKED) six_unlock_type(&iter->nodes[level]->lock, lock_type); diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 8b4df03..f1e06a3 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -55,6 +55,16 @@ struct btree_write { struct closure_waitlist wait; }; +struct btree_ob_ref { + u8 nr; + u8 refs[BCH_REPLICAS_MAX]; +}; + +struct btree_alloc { + struct btree_ob_ref ob; + BKEY_PADDED(k); +}; + struct btree { /* Hottest entries first */ struct rhash_head hash; @@ -118,7 +128,7 @@ struct btree { */ struct btree_update *will_make_reachable; - struct open_bucket *ob; + struct btree_ob_ref ob; /* lru list */ struct list_head list; @@ -317,18 +327,6 @@ struct btree_root { struct btree_iter; struct btree_node_iter; -enum extent_insert_hook_ret { - BTREE_HOOK_DO_INSERT, - BTREE_HOOK_NO_INSERT, - BTREE_HOOK_RESTART_TRANS, -}; - -struct extent_insert_hook { - enum extent_insert_hook_ret - (*fn)(struct extent_insert_hook *, struct bpos, struct bpos, - struct bkey_s_c, const struct bkey_i *); -}; - enum btree_insert_ret { BTREE_INSERT_OK, /* extent spanned multiple leaf nodes: have to traverse to next node: */ @@ 
-342,6 +340,12 @@ enum btree_insert_ret { BTREE_INSERT_NEED_GC_LOCK, }; +struct extent_insert_hook { + enum btree_insert_ret + (*fn)(struct extent_insert_hook *, struct bpos, struct bpos, + struct bkey_s_c, const struct bkey_i *); +}; + enum btree_gc_coalesce_fail_reason { BTREE_GC_COALESCE_FAIL_RESERVE_GET, BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 2efb01c..1fe8fff 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -211,7 +211,7 @@ found: -c->opts.btree_node_size, true, b ? gc_pos_btree_node(b) : gc_pos_btree_root(as->btree_id), - &tmp, 0); + &tmp, 0, 0); /* * Don't apply tmp - pending deletes aren't tracked in * bch_alloc_stats: @@ -229,7 +229,7 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b, BUG_ON(btree_node_dirty(b)); BUG_ON(btree_node_need_write(b)); BUG_ON(b == btree_node_root(c, b)); - BUG_ON(b->ob); + BUG_ON(b->ob.nr); BUG_ON(!list_empty(&b->write_blocked)); BUG_ON(b->will_make_reachable); @@ -254,17 +254,17 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b, void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) { - struct open_bucket *ob = b->ob; + struct btree_ob_ref ob = b->ob; btree_update_drop_new_node(c, b); - b->ob = NULL; + b->ob.nr = 0; clear_btree_node_dirty(b); __btree_node_free(c, b, NULL); - bch2_open_bucket_put(c, ob); + bch2_open_bucket_put_refs(c, &ob.nr, ob.refs); } void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, @@ -287,7 +287,7 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c, bch2_mark_key(c, bkey_i_to_s_c(&pending->key), -c->opts.btree_node_size, true, gc_phase(GC_PHASE_PENDING_DELETE), - &stats, 0); + &stats, 0, 0); /* * Don't apply stats - pending deletes aren't tracked in * bch_alloc_stats: @@ -296,8 +296,7 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c, void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *b) { - bch2_open_bucket_put(c, b->ob); - b->ob = NULL; + bch2_open_bucket_put_refs(c, &b->ob.nr, b->ob.refs); } static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, @@ -305,9 +304,12 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, struct closure *cl, unsigned flags) { - BKEY_PADDED(k) tmp; - struct open_bucket *ob; + struct write_point *wp; struct btree *b; + BKEY_PADDED(k) tmp; + struct bkey_i_extent *e; + struct btree_ob_ref ob; + struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; unsigned nr_reserve; enum alloc_reserve alloc_reserve; @@ -335,31 +337,41 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, mutex_unlock(&c->btree_reserve_cache_lock); retry: - /* alloc_sectors is weird, I suppose */ - bkey_extent_init(&tmp.k); - tmp.k.k.size = c->opts.btree_node_size, - - ob = bch2_alloc_sectors(c, BCH_DATA_BTREE, 0, 0, - bkey_i_to_extent(&tmp.k), - res->nr_replicas, - c->opts.metadata_replicas_required, - alloc_reserve, 0, cl); - if (IS_ERR(ob)) - return ERR_CAST(ob); - - if (tmp.k.k.size < c->opts.btree_node_size) { - bch2_open_bucket_put(c, ob); + wp = bch2_alloc_sectors_start(c, NULL, + writepoint_ptr(&c->btree_write_point), + &devs_have, + res->nr_replicas, + c->opts.metadata_replicas_required, + alloc_reserve, 0, cl); + if (IS_ERR(wp)) + return ERR_CAST(wp); + + if (wp->sectors_free < c->opts.btree_node_size) { + struct open_bucket *ob; + unsigned i; + + writepoint_for_each_ptr(wp, ob, i) + if (ob->sectors_free < c->opts.btree_node_size) + ob->sectors_free 
= 0; + + bch2_alloc_sectors_done(c, wp); goto retry; } + + e = bkey_extent_init(&tmp.k); + bch2_alloc_sectors_append_ptrs(c, wp, e, c->opts.btree_node_size); + + ob.nr = 0; + bch2_open_bucket_get(c, wp, &ob.nr, ob.refs); + bch2_alloc_sectors_done(c, wp); mem_alloc: b = bch2_btree_node_mem_alloc(c); /* we hold cannibalize_lock: */ BUG_ON(IS_ERR(b)); - BUG_ON(b->ob); + BUG_ON(b->ob.nr); bkey_copy(&b->key, &tmp.k); - b->key.k.size = 0; b->ob = ob; return b; @@ -466,11 +478,10 @@ static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reser &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; a->ob = b->ob; - b->ob = NULL; + b->ob.nr = 0; bkey_copy(&a->k, &b->key); } else { - bch2_open_bucket_put(c, b->ob); - b->ob = NULL; + bch2_btree_open_bucket_put(c, b); } __btree_node_free(c, b, NULL); @@ -857,10 +868,7 @@ static void __btree_interior_update_drop_new_node(struct btree *b) BUG(); found: - as->nr_new_nodes--; - memmove(&as->new_nodes[i], - &as->new_nodes[i + 1], - sizeof(struct btree *) * (as->nr_new_nodes - i)); + array_remove_item(as->new_nodes, as->nr_new_nodes, i); b->will_make_reachable = NULL; } @@ -1000,8 +1008,7 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id, as->reserve = reserve; INIT_LIST_HEAD(&as->write_blocked_list); - bch2_keylist_init(&as->parent_keys, as->inline_keys, - ARRAY_SIZE(as->inline_keys)); + bch2_keylist_init(&as->parent_keys, as->inline_keys); mutex_lock(&c->btree_interior_update_lock); list_add(&as->list, &c->btree_interior_update_list); @@ -1037,7 +1044,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) bch2_mark_key(c, bkey_i_to_s_c(&b->key), c->opts.btree_node_size, true, gc_pos_btree_root(b->btree_id), - &stats, 0); + &stats, 0, 0); if (old) bch2_btree_node_free_index(as, NULL, @@ -1121,7 +1128,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b if (bkey_extent_is_data(&insert->k)) bch2_mark_key(c, bkey_i_to_s_c(insert), c->opts.btree_node_size, true, - gc_pos_btree_node(b), &stats, 0); + gc_pos_btree_node(b), &stats, 0, 0); while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && !btree_iter_pos_cmp_packed(b, &insert->k.p, k, false)) @@ -1479,6 +1486,13 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, struct closure cl; int ret = 0; + /* + * We already have a disk reservation and open buckets pinned; this + * allocation must not block: + */ + if (iter->btree_id == BTREE_ID_EXTENTS) + btree_reserve_flags |= BTREE_INSERT_USE_RESERVE; + closure_init_stack(&cl); /* Hack, because gc and splitting nodes doesn't mix yet: */ @@ -1519,6 +1533,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, bch2_btree_iter_set_locks_want(iter, 1); out: up_read(&c->gc_lock); + closure_sync(&cl); return ret; } @@ -1904,7 +1919,7 @@ retry: bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i), c->opts.btree_node_size, true, gc_pos_btree_root(b->btree_id), - &stats, 0); + &stats, 0, 0); bch2_btree_node_free_index(as, NULL, bkey_i_to_s_c(&b->key), &stats); @@ -1928,6 +1943,7 @@ out: } bch2_btree_iter_unlock(&iter); up_read(&c->gc_lock); + closure_sync(&cl); return ret; err: if (as) @@ -1965,13 +1981,13 @@ int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id, BTREE_INSERT_USE_RESERVE| BTREE_INSERT_USE_ALLOC_RESERVE, &cl); + closure_sync(&cl); + if (!IS_ERR(as)) break; if (PTR_ERR(as) == -ENOSPC) return PTR_ERR(as); - - closure_sync(&cl); } b = __btree_root_alloc(as, 0); diff --git a/libbcachefs/btree_update_leaf.c 
b/libbcachefs/btree_update_leaf.c index 6c490dd..e62e0d2 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -355,6 +355,11 @@ retry: multi_lock_write(c, trans); + if (race_fault()) { + ret = -EINTR; + goto unlock; + } + u64s = 0; trans_for_each_entry(trans, i) { /* Multiple inserts might go to same leaf: */ diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 6fdbb46..b73002d 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -101,9 +101,41 @@ static void bch2_fs_stats_verify(struct bch_fs *c) stats.online_reserved); } +static void bch2_dev_stats_verify(struct bch_dev *ca) +{ + struct bch_dev_usage stats = + __bch2_dev_usage_read(ca); + u64 n = ca->mi.nbuckets - ca->mi.first_bucket; + + BUG_ON(stats.buckets[S_META] > n); + BUG_ON(stats.buckets[S_DIRTY] > n); + BUG_ON(stats.buckets_cached > n); + BUG_ON(stats.buckets_alloc > n); + BUG_ON(stats.buckets_unavailable > n); +} + +static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) +{ + if (!(flags & BCH_DISK_RESERVATION_NOFAIL)) { + u64 used = __bch2_fs_sectors_used(c); + u64 cached = 0; + u64 avail = atomic64_read(&c->sectors_available); + int cpu; + + for_each_possible_cpu(cpu) + cached += per_cpu_ptr(c->usage_percpu, cpu)->available_cache; + + if (used + avail + cached > c->capacity) + panic("used %llu avail %llu cached %llu capacity %llu\n", + used, avail, cached, c->capacity); + } +} + #else static void bch2_fs_stats_verify(struct bch_fs *c) {} +static void bch2_dev_stats_verify(struct bch_dev *ca) {} +static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) {} #endif @@ -171,11 +203,9 @@ struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca) return bch2_usage_read_raw(ca->usage_percpu); } -struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) +struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca) { - return bch2_usage_read_cached(ca->fs, - ca->usage_cached, - ca->usage_percpu); + return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu); } struct bch_fs_usage @@ -208,6 +238,11 @@ static inline int is_cached_bucket(struct bucket_mark m) !m.dirty_sectors && !!m.cached_sectors; } +static inline int is_unavailable_bucket(struct bucket_mark m) +{ + return !is_available_bucket(m); +} + static inline enum s_alloc bucket_type(struct bucket_mark m) { return is_meta_bucket(m) ? 
S_META : S_DIRTY; @@ -256,12 +291,15 @@ void bch2_fs_usage_apply(struct bch_fs *c, memset(stats, 0, sizeof(*stats)); } -static void bch2_dev_usage_update(struct bch_dev *ca, - struct bucket_mark old, struct bucket_mark new) +static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + struct bucket *g, struct bucket_mark old, + struct bucket_mark new) { - struct bch_fs *c = ca->fs; struct bch_dev_usage *dev_usage; + BUG_ON((g - ca->buckets) < ca->mi.first_bucket || + (g - ca->buckets) >= ca->mi.nbuckets); + bch2_fs_inconsistent_on(old.data_type && new.data_type && old.data_type != new.data_type, c, "different types of metadata in same bucket: %u, %u", @@ -270,38 +308,44 @@ static void bch2_dev_usage_update(struct bch_dev *ca, preempt_disable(); dev_usage = this_cpu_ptr(ca->usage_percpu); - dev_usage->sectors_cached += - (int) new.cached_sectors - (int) old.cached_sectors; - - dev_usage->sectors[bucket_type(old)] -= old.dirty_sectors; - dev_usage->sectors[bucket_type(new)] += new.dirty_sectors; - + dev_usage->buckets[S_META] += + is_meta_bucket(new) - is_meta_bucket(old); + dev_usage->buckets[S_DIRTY] += + is_dirty_bucket(new) - is_dirty_bucket(old); + dev_usage->buckets_cached += + is_cached_bucket(new) - is_cached_bucket(old); dev_usage->buckets_alloc += (int) new.owned_by_allocator - (int) old.owned_by_allocator; + dev_usage->buckets_unavailable += + is_unavailable_bucket(new) - is_unavailable_bucket(old); - dev_usage->buckets[S_META] += is_meta_bucket(new) - is_meta_bucket(old); - dev_usage->buckets[S_DIRTY] += is_dirty_bucket(new) - is_dirty_bucket(old); - dev_usage->buckets_cached += is_cached_bucket(new) - is_cached_bucket(old); + dev_usage->sectors[bucket_type(old)] -= old.dirty_sectors; + dev_usage->sectors[bucket_type(new)] += new.dirty_sectors; + dev_usage->sectors_cached += + (int) new.cached_sectors - (int) old.cached_sectors; preempt_enable(); if (!is_available_bucket(old) && is_available_bucket(new)) bch2_wake_allocator(ca); + + bch2_dev_stats_verify(ca); } -#define bucket_data_cmpxchg(ca, g, new, expr) \ +#define bucket_data_cmpxchg(c, ca, g, new, expr) \ ({ \ struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \ \ - bch2_dev_usage_update(ca, _old, new); \ + bch2_dev_usage_update(c, ca, g, _old, new); \ _old; \ }) -bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g, - struct bucket_mark *old) +bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, + struct bucket *g, struct bucket_mark *old) { struct bucket_mark new; - *old = bucket_data_cmpxchg(ca, g, new, ({ + lg_local_lock(&c->usage_lock); + *old = bucket_data_cmpxchg(c, ca, g, new, ({ if (!is_available_bucket(new)) return false; @@ -312,6 +356,7 @@ bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g, new.dirty_sectors = 0; new.gen++; })); + lg_local_unlock(&c->usage_lock); if (!old->owned_by_allocator && old->cached_sectors) trace_invalidate(ca, bucket_to_sector(ca, g - ca->buckets), @@ -319,11 +364,13 @@ bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g, return true; } -bool bch2_mark_alloc_bucket_startup(struct bch_dev *ca, struct bucket *g) +bool bch2_mark_alloc_bucket_startup(struct bch_fs *c, struct bch_dev *ca, + struct bucket *g) { struct bucket_mark new, old; - old = bucket_data_cmpxchg(ca, g, new, ({ + lg_local_lock(&c->usage_lock); + old = bucket_data_cmpxchg(c, ca, g, new, ({ if (new.touched_this_mount || !is_available_bucket(new)) return false; @@ -331,37 +378,32 @@ bool bch2_mark_alloc_bucket_startup(struct bch_dev *ca, struct bucket *g) 
new.owned_by_allocator = 1; new.touched_this_mount = 1; })); + lg_local_unlock(&c->usage_lock); return true; } -void bch2_mark_free_bucket(struct bch_dev *ca, struct bucket *g) +void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + struct bucket *g, bool owned_by_allocator, + struct gc_pos pos, unsigned flags) { struct bucket_mark old, new; - old = bucket_data_cmpxchg(ca, g, new, ({ - new.touched_this_mount = 1; - new.owned_by_allocator = 0; - new.data_type = 0; - new.cached_sectors = 0; - new.dirty_sectors = 0; - })); - - BUG_ON(bucket_became_unavailable(ca->fs, old, new)); -} - -void bch2_mark_alloc_bucket(struct bch_dev *ca, struct bucket *g, - bool owned_by_allocator) -{ - struct bucket_mark old, new; + lg_local_lock(&c->usage_lock); + if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && + gc_will_visit(c, pos)) { + lg_local_unlock(&c->usage_lock); + return; + } - old = bucket_data_cmpxchg(ca, g, new, ({ + old = bucket_data_cmpxchg(c, ca, g, new, ({ new.touched_this_mount = 1; new.owned_by_allocator = owned_by_allocator; })); + lg_local_unlock(&c->usage_lock); BUG_ON(!owned_by_allocator && !old.owned_by_allocator && - ca->fs->gc_pos.phase == GC_PHASE_DONE); + c->gc_pos.phase == GC_PHASE_DONE); } #define saturated_add(ca, dst, src, max) \ @@ -377,41 +419,49 @@ do { \ } \ } while (0) -void bch2_mark_metadata_bucket(struct bch_dev *ca, struct bucket *g, - enum bucket_data_type type, - bool may_make_unavailable) +void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + struct bucket *g, enum bucket_data_type type, + struct gc_pos pos, unsigned flags) { struct bucket_mark old, new; BUG_ON(!type); - old = bucket_data_cmpxchg(ca, g, new, ({ + lg_local_lock(&c->usage_lock); + if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && + gc_will_visit(c, pos)) { + lg_local_unlock(&c->usage_lock); + return; + } + + old = bucket_data_cmpxchg(c, ca, g, new, ({ saturated_add(ca, new.dirty_sectors, ca->mi.bucket_size, GC_MAX_SECTORS_USED); new.data_type = type; new.touched_this_mount = 1; })); + lg_local_unlock(&c->usage_lock); if (old.data_type != type && (old.data_type || old.cached_sectors || old.dirty_sectors)) - bch_err(ca->fs, "bucket %zu has multiple types of data (%u, %u)", + bch_err(c, "bucket %zu has multiple types of data (%u, %u)", g - ca->buckets, old.data_type, new.data_type); - BUG_ON(!may_make_unavailable && - bucket_became_unavailable(ca->fs, old, new)); + BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && + bucket_became_unavailable(c, old, new)); } /* Reverting this until the copygc + compression issue is fixed: */ -static int __disk_sectors(const union bch_extent_crc *crc, unsigned sectors) +static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors) { if (!sectors) return 0; - return max(1U, DIV_ROUND_UP(sectors * crc_compressed_size(NULL, crc), - crc_uncompressed_size(NULL, crc))); + return max(1U, DIV_ROUND_UP(sectors * crc.compressed_size, + crc.uncompressed_size)); } /* @@ -420,12 +470,12 @@ static int __disk_sectors(const union bch_extent_crc *crc, unsigned sectors) * that with the gc pos seqlock held. 
*/ static void bch2_mark_pointer(struct bch_fs *c, - struct bkey_s_c_extent e, - const union bch_extent_crc *crc, - const struct bch_extent_ptr *ptr, - s64 sectors, enum s_alloc type, - struct bch_fs_usage *stats, - u64 journal_seq, unsigned flags) + struct bkey_s_c_extent e, + const struct bch_extent_ptr *ptr, + struct bch_extent_crc_unpacked crc, + s64 sectors, enum s_alloc type, + struct bch_fs_usage *stats, + u64 journal_seq, unsigned flags) { struct bucket_mark old, new; unsigned saturated; @@ -435,7 +485,7 @@ static void bch2_mark_pointer(struct bch_fs *c, ? BUCKET_BTREE : BUCKET_DATA; u64 v; - if (crc_compression_type(crc)) { + if (crc.compression_type) { unsigned old_sectors, new_sectors; if (sectors > 0) { @@ -512,13 +562,13 @@ static void bch2_mark_pointer(struct bch_fs *c, old.counter, new.counter)) != old.counter); - bch2_dev_usage_update(ca, old, new); + bch2_dev_usage_update(c, ca, g, old, new); if (old.data_type != data_type && (old.data_type || old.cached_sectors || old.dirty_sectors)) - bch_err(ca->fs, "bucket %zu has multiple types of data (%u, %u)", + bch_err(c, "bucket %zu has multiple types of data (%u, %u)", g - ca->buckets, old.data_type, new.data_type); BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && @@ -535,71 +585,12 @@ static void bch2_mark_pointer(struct bch_fs *c, } } -static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c_extent e, - s64 sectors, bool metadata, - struct bch_fs_usage *stats, - u64 journal_seq, unsigned flags) -{ - const struct bch_extent_ptr *ptr; - const union bch_extent_crc *crc; - enum s_alloc type = metadata ? S_META : S_DIRTY; - unsigned replicas = 0; - - BUG_ON(metadata && bkey_extent_is_cached(e.k)); - BUG_ON(!sectors); - - extent_for_each_ptr_crc(e, ptr, crc) { - bch2_mark_pointer(c, e, crc, ptr, sectors, type, - stats, journal_seq, flags); - replicas += !ptr->cached; - } - - BUG_ON(replicas >= BCH_REPLICAS_MAX); - - if (replicas) - stats->s[replicas - 1].data[type] += sectors; -} - -void __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, - s64 sectors, bool metadata, - struct bch_fs_usage *stats, - u64 journal_seq, unsigned flags) -{ - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - bch2_mark_extent(c, bkey_s_c_to_extent(k), sectors, metadata, - stats, journal_seq, flags); - break; - case BCH_RESERVATION: { - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - - if (r.v->nr_replicas) - stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors; - break; - } - } -} - -void bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, - s64 sectors, bool metadata, unsigned flags) -{ - struct bch_fs_usage stats = { 0 }; - - __bch2_mark_key(c, k, sectors, metadata, &stats, 0, - flags|BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); - - preempt_disable(); - bch2_usage_add(this_cpu_ptr(c->usage_percpu), &stats); - preempt_enable(); -} - void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, - s64 sectors, bool metadata, struct gc_pos gc_pos, - struct bch_fs_usage *stats, u64 journal_seq) + s64 sectors, bool metadata, + struct gc_pos pos, + struct bch_fs_usage *stats, + u64 journal_seq, unsigned flags) { - unsigned flags = gc_will_visit(c, gc_pos) - ? BCH_BUCKET_MARK_GC_WILL_VISIT : 0; /* * synchronization w.r.t. GC: * @@ -614,69 +605,104 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, * To know whether we should mark a given reference (GC either isn't * running, or has already marked references at this position) we * construct a total order for everything GC walks. 
Then, we can simply - * compare the position of the reference we're marking - @gc_pos - with + * compare the position of the reference we're marking - @pos - with * GC's current position. If GC is going to mark this reference, GC's - * current position will be less than @gc_pos; if GC's current position - * is greater than @gc_pos GC has either already walked this position, - * or isn't running. + * current position will be less than @pos; if GC's current position is + * greater than @pos GC has either already walked this position, or + * isn't running. * * To avoid racing with GC's position changing, we have to deal with * - GC's position being set to GC_POS_MIN when GC starts: * usage_lock guards against this - * - GC's position overtaking @gc_pos: we guard against this with + * - GC's position overtaking @pos: we guard against this with * whatever lock protects the data structure the reference lives in * (e.g. the btree node lock, or the relevant allocator lock). */ + lg_local_lock(&c->usage_lock); - __bch2_mark_key(c, k, sectors, metadata, stats, journal_seq, flags); - bch2_fs_stats_verify(c); + if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && + gc_will_visit(c, pos)) + flags |= BCH_BUCKET_MARK_GC_WILL_VISIT; + + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + enum s_alloc type = metadata ? S_META : S_DIRTY; + unsigned replicas = 0; + + BUG_ON(metadata && bkey_extent_is_cached(e.k)); + BUG_ON(!sectors); + + extent_for_each_ptr_crc(e, ptr, crc) { + bch2_mark_pointer(c, e, ptr, crc, sectors, type, + stats, journal_seq, flags); + replicas += !ptr->cached; + } + + BUG_ON(replicas >= BCH_REPLICAS_MAX); + + if (replicas) + stats->s[replicas - 1].data[type] += sectors; + break; + } + case BCH_RESERVATION: { + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + + if (r.v->nr_replicas) + stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors; + break; + } + } lg_local_unlock(&c->usage_lock); } -static u64 __recalc_sectors_available(struct bch_fs *c) -{ - return c->capacity - bch2_fs_sectors_used(c); -} +/* Disk reservations: */ -/* Used by gc when it's starting: */ -void bch2_recalc_sectors_available(struct bch_fs *c) +static u64 __recalc_sectors_available(struct bch_fs *c) { + u64 avail; int cpu; - lg_global_lock(&c->usage_lock); - for_each_possible_cpu(cpu) per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0; - atomic64_set(&c->sectors_available, - __recalc_sectors_available(c)); + avail = c->capacity - bch2_fs_sectors_used(c); + avail <<= RESERVE_FACTOR; + avail /= (1 << RESERVE_FACTOR) + 1; + return avail; +} + +/* Used by gc when it's starting: */ +void bch2_recalc_sectors_available(struct bch_fs *c) +{ + lg_global_lock(&c->usage_lock); + atomic64_set(&c->sectors_available, __recalc_sectors_available(c)); lg_global_unlock(&c->usage_lock); } -void bch2_disk_reservation_put(struct bch_fs *c, - struct disk_reservation *res) +void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) { - if (res->sectors) { - lg_local_lock(&c->usage_lock); - this_cpu_sub(c->usage_percpu->online_reserved, - res->sectors); + lg_local_lock(&c->usage_lock); + this_cpu_sub(c->usage_percpu->online_reserved, + res->sectors); - bch2_fs_stats_verify(c); - lg_local_unlock(&c->usage_lock); + bch2_fs_stats_verify(c); + lg_local_unlock(&c->usage_lock); - res->sectors = 0; - } + res->sectors = 0; } #define SECTORS_CACHE 1024 -int 
bch2_disk_reservation_add(struct bch_fs *c, - struct disk_reservation *res, - unsigned sectors, int flags) +int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, + unsigned sectors, int flags) { struct bch_fs_usage *stats; - u64 old, new, v; + u64 old, v, get; s64 sectors_available; int ret; @@ -685,27 +711,29 @@ int bch2_disk_reservation_add(struct bch_fs *c, lg_local_lock(&c->usage_lock); stats = this_cpu_ptr(c->usage_percpu); - if (sectors >= stats->available_cache) + if (sectors <= stats->available_cache) goto out; v = atomic64_read(&c->sectors_available); do { old = v; - if (old < sectors) { + get = min((u64) sectors + SECTORS_CACHE, old); + + if (get < sectors) { lg_local_unlock(&c->usage_lock); goto recalculate; } - - new = max_t(s64, 0, old - sectors - SECTORS_CACHE); } while ((v = atomic64_cmpxchg(&c->sectors_available, - old, new)) != old); + old, old - get)) != old); + + stats->available_cache += get; - stats->available_cache += old - new; out: stats->available_cache -= sectors; stats->online_reserved += sectors; res->sectors += sectors; + bch2_disk_reservations_verify(c, flags); bch2_fs_stats_verify(c); lg_local_unlock(&c->usage_lock); return 0; @@ -738,6 +766,8 @@ recalculate: stats->online_reserved += sectors; res->sectors += sectors; ret = 0; + + bch2_disk_reservations_verify(c, flags); } else { atomic64_set(&c->sectors_available, sectors_available); ret = -ENOSPC; diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 141aa4a..7d2b08c 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -95,37 +95,39 @@ static inline bool bucket_unused(struct bucket_mark mark) /* Per device stats: */ struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *); -struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); +struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *); static inline u64 __dev_buckets_available(struct bch_dev *ca, struct bch_dev_usage stats) { - return max_t(s64, 0, - ca->mi.nbuckets - ca->mi.first_bucket - - stats.buckets[S_META] - - stats.buckets[S_DIRTY] - - stats.buckets_alloc); + u64 total = ca->mi.nbuckets - ca->mi.first_bucket; + + if (WARN_ONCE(stats.buckets_unavailable > total, + "buckets_unavailable overflow\n")) + return 0; + + return total - stats.buckets_unavailable; } /* * Number of reclaimable buckets - only for use by the allocator thread: */ -static inline u64 dev_buckets_available(struct bch_dev *ca) +static inline u64 dev_buckets_available(struct bch_fs *c, struct bch_dev *ca) { - return __dev_buckets_available(ca, bch2_dev_usage_read(ca)); + return __dev_buckets_available(ca, bch2_dev_usage_read(c, ca)); } static inline u64 __dev_buckets_free(struct bch_dev *ca, - struct bch_dev_usage stats) + struct bch_dev_usage stats) { return __dev_buckets_available(ca, stats) + fifo_used(&ca->free[RESERVE_NONE]) + fifo_used(&ca->free_inc); } -static inline u64 dev_buckets_free(struct bch_dev *ca) +static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca) { - return __dev_buckets_free(ca, bch2_dev_usage_read(ca)); + return __dev_buckets_free(ca, bch2_dev_usage_read(c, ca)); } /* Cache set stats: */ @@ -133,7 +135,7 @@ static inline u64 dev_buckets_free(struct bch_dev *ca) struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *); struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *); void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, - struct disk_reservation *, struct gc_pos); + struct disk_reservation *, struct gc_pos); struct fs_usage_sum { u64 data; @@ 
-155,11 +157,18 @@ static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats) return sum; } +#define RESERVE_FACTOR 6 + +static u64 reserve_factor(u64 r) +{ + return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); +} + static inline u64 __bch2_fs_sectors_used(struct bch_fs *c) { struct fs_usage_sum sum = __fs_usage_sum(__bch2_fs_usage_read(c)); - return sum.data + sum.reserved + (sum.reserved >> 7); + return sum.data + reserve_factor(sum.reserved); } static inline u64 bch2_fs_sectors_used(struct bch_fs *c) @@ -184,30 +193,35 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m, void bch2_bucket_seq_cleanup(struct bch_fs *); -bool bch2_invalidate_bucket(struct bch_dev *, struct bucket *, - struct bucket_mark *); -bool bch2_mark_alloc_bucket_startup(struct bch_dev *, struct bucket *); -void bch2_mark_free_bucket(struct bch_dev *, struct bucket *); -void bch2_mark_alloc_bucket(struct bch_dev *, struct bucket *, bool); -void bch2_mark_metadata_bucket(struct bch_dev *, struct bucket *, - enum bucket_data_type, bool); +bool bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, + struct bucket *, struct bucket_mark *); +bool bch2_mark_alloc_bucket_startup(struct bch_fs *, struct bch_dev *, + struct bucket *); +void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, + struct bucket *, bool, + struct gc_pos, unsigned); +void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, + struct bucket *, enum bucket_data_type, + struct gc_pos, unsigned); #define BCH_BUCKET_MARK_NOATOMIC (1 << 0) -#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 1) -#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 2) - -void __bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, - struct bch_fs_usage *, u64, unsigned); +#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 1) +#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 2) +#define BCH_BUCKET_MARK_GC_LOCK_HELD (1 << 3) -void bch2_gc_mark_key(struct bch_fs *, struct bkey_s_c, - s64, bool, unsigned); -void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, - struct gc_pos, struct bch_fs_usage *, u64); +void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, struct gc_pos, + struct bch_fs_usage *, u64, unsigned); void bch2_recalc_sectors_available(struct bch_fs *); -void bch2_disk_reservation_put(struct bch_fs *, - struct disk_reservation *); +void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); + +static inline void bch2_disk_reservation_put(struct bch_fs *c, + struct disk_reservation *res) +{ + if (res->sectors) + __bch2_disk_reservation_put(c, res); +} #define BCH_DISK_RESERVATION_NOFAIL (1 << 0) #define BCH_DISK_RESERVATION_METADATA (1 << 1) diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 63f1b27..0bd8d2d 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -59,6 +59,7 @@ struct bch_dev_usage { u64 buckets[S_ALLOC_NR]; u64 buckets_cached; u64 buckets_alloc; + u64 buckets_unavailable; /* _compressed_ sectors: */ u64 sectors[S_ALLOC_NR]; @@ -79,13 +80,6 @@ struct bch_fs_usage { u64 available_cache; }; -struct bucket_heap_entry { - size_t bucket; - struct bucket_mark mark; -}; - -typedef HEAP(struct bucket_heap_entry) bucket_heap; - /* * A reservation for space on disk: */ @@ -95,4 +89,11 @@ struct disk_reservation { unsigned nr_replicas; }; +struct copygc_heap_entry { + u64 offset; + struct bucket_mark mark; +}; + +typedef HEAP(struct copygc_heap_entry) copygc_heap; + #endif /* _BUCKETS_TYPES_H */ diff --git 
a/libbcachefs/checksum.c b/libbcachefs/checksum.c index 01bdc86..0875585 100644 --- a/libbcachefs/checksum.c +++ b/libbcachefs/checksum.c @@ -141,10 +141,14 @@ static u64 bch2_checksum_init(unsigned type) switch (type) { case BCH_CSUM_NONE: return 0; - case BCH_CSUM_CRC32C: + case BCH_CSUM_CRC32C_NONZERO: return U32_MAX; - case BCH_CSUM_CRC64: + case BCH_CSUM_CRC64_NONZERO: return U64_MAX; + case BCH_CSUM_CRC32C: + return 0; + case BCH_CSUM_CRC64: + return 0; default: BUG(); } @@ -155,10 +159,14 @@ static u64 bch2_checksum_final(unsigned type, u64 crc) switch (type) { case BCH_CSUM_NONE: return 0; - case BCH_CSUM_CRC32C: + case BCH_CSUM_CRC32C_NONZERO: return crc ^ U32_MAX; - case BCH_CSUM_CRC64: + case BCH_CSUM_CRC64_NONZERO: return crc ^ U64_MAX; + case BCH_CSUM_CRC32C: + return crc; + case BCH_CSUM_CRC64: + return crc; default: BUG(); } @@ -169,8 +177,10 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t switch (type) { case BCH_CSUM_NONE: return 0; + case BCH_CSUM_CRC32C_NONZERO: case BCH_CSUM_CRC32C: return crc32c(crc, data, len); + case BCH_CSUM_CRC64_NONZERO: case BCH_CSUM_CRC64: return bch2_crc64_update(crc, data, len); default: @@ -243,6 +253,8 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, { switch (type) { case BCH_CSUM_NONE: + case BCH_CSUM_CRC32C_NONZERO: + case BCH_CSUM_CRC64_NONZERO: case BCH_CSUM_CRC32C: case BCH_CSUM_CRC64: { u64 crc = bch2_checksum_init(type); @@ -250,7 +262,7 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, crc = bch2_checksum_update(type, crc, data, len); crc = bch2_checksum_final(type, crc); - return (struct bch_csum) { .lo = crc }; + return (struct bch_csum) { .lo = cpu_to_le64(crc) }; } case BCH_CSUM_CHACHA20_POLY1305_80: @@ -281,28 +293,36 @@ void bch2_encrypt(struct bch_fs *c, unsigned type, do_encrypt(c->chacha20, nonce, data, len); } -struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, - struct nonce nonce, struct bio *bio) +static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, + struct nonce nonce, struct bio *bio, + struct bvec_iter *iter) { struct bio_vec bv; - struct bvec_iter iter; switch (type) { case BCH_CSUM_NONE: return (struct bch_csum) { 0 }; + case BCH_CSUM_CRC32C_NONZERO: + case BCH_CSUM_CRC64_NONZERO: case BCH_CSUM_CRC32C: case BCH_CSUM_CRC64: { u64 crc = bch2_checksum_init(type); - bio_for_each_contig_segment(bv, bio, iter) { +#ifdef CONFIG_HIGHMEM + __bio_for_each_segment(bv, bio, *iter, *iter) { void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; crc = bch2_checksum_update(type, crc, p, bv.bv_len); kunmap_atomic(p); } - +#else + __bio_for_each_contig_segment(bv, bio, *iter, *iter) + crc = bch2_checksum_update(type, crc, + page_address(bv.bv_page) + bv.bv_offset, + bv.bv_len); +#endif crc = bch2_checksum_final(type, crc); - return (struct bch_csum) { .lo = crc }; + return (struct bch_csum) { .lo = cpu_to_le64(crc) }; } case BCH_CSUM_CHACHA20_POLY1305_80: @@ -313,13 +333,19 @@ struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, gen_poly_key(c, desc, nonce); - bio_for_each_contig_segment(bv, bio, iter) { +#ifdef CONFIG_HIGHMEM + __bio_for_each_segment(bv, bio, *iter, *iter) { void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; crypto_shash_update(desc, p, bv.bv_len); kunmap_atomic(p); } - +#else + __bio_for_each_contig_segment(bv, bio, *iter, *iter) + crypto_shash_update(desc, + page_address(bv.bv_page) + bv.bv_offset, + bv.bv_len); +#endif crypto_shash_final(desc, digest); memcpy(&ret, digest, bch_crc_bytes[type]); 
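/*
 * [Editor's illustrative sketch - not part of the patch.] The hunks above
 * re-seed BCH_CSUM_CRC32C/CRC64 with 0 and drop the final inversion (the old
 * behaviour survives as the *_NONZERO variants), and the hunks below add
 * bch2_checksum_merge(), which combines two such checksums by extending the
 * first over len(B) zero bytes and xoring in the second. The standalone
 * userspace program below demonstrates why that works: with a zero seed and
 * no final xor a CRC is a linear function of its input, so
 * crc(A||B) == crc_extend(crc(A), zeros(len B)) ^ crc(B). The helper
 * crc32c_update() is a minimal bitwise stand-in for the kernel's crc32c();
 * main() and the sample strings are invented purely for the demonstration.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t crc32c_update(uint32_t crc, const void *data, size_t len)
{
	const uint8_t *p = data;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

int main(void)
{
	const char a[] = "extent part A", b[] = "extent part B";
	uint8_t zeros[sizeof(b) - 1] = { 0 };

	uint32_t crc_a = crc32c_update(0, a, sizeof(a) - 1);
	uint32_t crc_b = crc32c_update(0, b, sizeof(b) - 1);

	/* merge as bch2_checksum_merge() does: extend crc_a over zeroes, xor in crc_b */
	uint32_t merged = crc32c_update(crc_a, zeros, sizeof(zeros)) ^ crc_b;

	/* checksum of the concatenation A||B, computed by chaining in one pass */
	uint32_t whole = crc32c_update(crc_a, b, sizeof(b) - 1);

	assert(merged == whole);
	printf("merged %08x == whole %08x\n", (unsigned) merged, (unsigned) whole);
	return 0;
}
/*
 * This linearity is what lets bch2_rechecksum_bio() below verify and split an
 * extent's checksum with a single pass over the data whenever the old and new
 * checksum types match and are mergeable.
 */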
@@ -330,6 +356,14 @@ struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, } } +struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, + struct nonce nonce, struct bio *bio) +{ + struct bvec_iter iter = bio->bi_iter; + + return __bch2_checksum_bio(c, type, nonce, bio, &iter); +} + void bch2_encrypt_bio(struct bch_fs *c, unsigned type, struct nonce nonce, struct bio *bio) { @@ -343,12 +377,12 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type, sg_init_table(sgl, ARRAY_SIZE(sgl)); - bio_for_each_contig_segment(bv, bio, iter) { + bio_for_each_segment(bv, bio, iter) { if (sg == sgl + ARRAY_SIZE(sgl)) { sg_mark_end(sg - 1); do_encrypt_sg(c->chacha20, nonce, sgl, bytes); - le32_add_cpu(nonce.d, bytes / CHACHA20_BLOCK_SIZE); + nonce = nonce_add(nonce, bytes); bytes = 0; sg_init_table(sgl, ARRAY_SIZE(sgl)); @@ -357,13 +391,115 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type, sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset); bytes += bv.bv_len; - } sg_mark_end(sg - 1); do_encrypt_sg(c->chacha20, nonce, sgl, bytes); } +static inline bool bch2_checksum_mergeable(unsigned type) +{ + + switch (type) { + case BCH_CSUM_NONE: + case BCH_CSUM_CRC32C: + case BCH_CSUM_CRC64: + return true; + default: + return false; + } +} + +static struct bch_csum bch2_checksum_merge(unsigned type, + struct bch_csum a, + struct bch_csum b, size_t b_len) +{ + BUG_ON(!bch2_checksum_mergeable(type)); + + while (b_len) { + unsigned b = min(b_len, PAGE_SIZE); + + a.lo = bch2_checksum_update(type, a.lo, + page_address(ZERO_PAGE(0)), b); + b_len -= b; + } + + a.lo ^= b.lo; + a.hi ^= b.hi; + return a; +} + +int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, + struct bversion version, + struct bch_extent_crc_unpacked crc_old, + struct bch_extent_crc_unpacked *crc_a, + struct bch_extent_crc_unpacked *crc_b, + unsigned len_a, unsigned len_b, + unsigned new_csum_type) +{ + struct bvec_iter iter = bio->bi_iter; + struct nonce nonce = extent_nonce(version, crc_old); + struct bch_csum merged = { 0 }; + struct crc_split { + struct bch_extent_crc_unpacked *crc; + unsigned len; + unsigned csum_type; + struct bch_csum csum; + } splits[3] = { + { crc_a, len_a, new_csum_type }, + { crc_b, len_b, new_csum_type }, + { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type }, + }, *i; + bool mergeable = crc_old.csum_type == new_csum_type && + bch2_checksum_mergeable(new_csum_type); + unsigned crc_nonce = crc_old.nonce; + + BUG_ON(len_a + len_b > bio_sectors(bio)); + BUG_ON(crc_old.uncompressed_size != bio_sectors(bio)); + BUG_ON(crc_old.compression_type); + BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) != + bch2_csum_type_is_encryption(new_csum_type)); + + for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { + iter.bi_size = i->len << 9; + if (mergeable || i->crc) + i->csum = __bch2_checksum_bio(c, i->csum_type, + nonce, bio, &iter); + else + bio_advance_iter(bio, &iter, i->len << 9); + nonce = nonce_add(nonce, i->len << 9); + } + + if (mergeable) + for (i = splits; i < splits + ARRAY_SIZE(splits); i++) + merged = bch2_checksum_merge(new_csum_type, merged, + i->csum, i->len << 9); + else + merged = bch2_checksum_bio(c, crc_old.csum_type, + extent_nonce(version, crc_old), bio); + + if (bch2_crc_cmp(merged, crc_old.csum)) + return -EIO; + + for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { + if (i->crc) + *i->crc = (struct bch_extent_crc_unpacked) { + .csum_type = i->csum_type, + .compressed_size = i->len, + .uncompressed_size = i->len, + .offset = 0, + .live_size = i->len, + 
.nonce = crc_nonce, + .csum = i->csum, + }; + + if (bch2_csum_type_is_encryption(new_csum_type)) + crc_nonce += i->len; + } + + return 0; +} + #ifdef __KERNEL__ int bch2_request_key(struct bch_sb *sb, struct bch_key *key) { diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h index e8f6ef4..1a08941 100644 --- a/libbcachefs/checksum.h +++ b/libbcachefs/checksum.h @@ -2,6 +2,7 @@ #define _BCACHEFS_CHECKSUM_H #include "bcachefs.h" +#include "extents_types.h" #include "super-io.h" #include @@ -36,7 +37,14 @@ void bch2_encrypt(struct bch_fs *, unsigned, struct nonce, void *data, size_t); struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, - struct nonce, struct bio *); + struct nonce, struct bio *); + +int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion, + struct bch_extent_crc_unpacked, + struct bch_extent_crc_unpacked *, + struct bch_extent_crc_unpacked *, + unsigned, unsigned, unsigned); + void bch2_encrypt_bio(struct bch_fs *, unsigned, struct nonce, struct bio *); @@ -49,15 +57,16 @@ int bch2_enable_encryption(struct bch_fs *, bool); void bch2_fs_encryption_exit(struct bch_fs *); int bch2_fs_encryption_init(struct bch_fs *); -static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type) +static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, + bool data) { switch (type) { case BCH_CSUM_OPT_NONE: return BCH_CSUM_NONE; case BCH_CSUM_OPT_CRC32C: - return BCH_CSUM_CRC32C; + return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO; case BCH_CSUM_OPT_CRC64: - return BCH_CSUM_CRC64; + return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO; default: BUG(); } @@ -70,7 +79,7 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c) ? BCH_CSUM_CHACHA20_POLY1305_128 : BCH_CSUM_CHACHA20_POLY1305_80; - return bch2_csum_opt_to_type(c->opts.data_checksum); + return bch2_csum_opt_to_type(c->opts.data_checksum, true); } static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) @@ -78,7 +87,7 @@ static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) if (c->sb.encryption_type) return BCH_CSUM_CHACHA20_POLY1305_128; - return bch2_csum_opt_to_type(c->opts.metadata_checksum); + return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); } static inline enum bch_compression_type @@ -134,6 +143,21 @@ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) return nonce; } +static inline struct nonce extent_nonce(struct bversion version, + struct bch_extent_crc_unpacked crc) +{ + unsigned size = crc.compression_type ? 
crc.uncompressed_size : 0; + struct nonce nonce = (struct nonce) {{ + [0] = cpu_to_le32(size << 22), + [1] = cpu_to_le32(version.lo), + [2] = cpu_to_le32(version.lo >> 32), + [3] = cpu_to_le32(version.hi| + (crc.compression_type << 24))^BCH_NONCE_EXTENT, + }}; + + return nonce_add(nonce, crc.nonce << 9); +} + static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key) { return le64_to_cpu(key->magic) != BCH_KEY_MAGIC; diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c index 7b45bb7..6407998 100644 --- a/libbcachefs/compress.c +++ b/libbcachefs/compress.c @@ -1,4 +1,5 @@ #include "bcachefs.h" +#include "checksum.h" #include "compress.h" #include "extents.h" #include "io.h" @@ -145,11 +146,11 @@ static inline void zlib_set_workspace(z_stream *strm, void *workspace) } static int __bio_uncompress(struct bch_fs *c, struct bio *src, - void *dst_data, struct bch_extent_crc128 crc) + void *dst_data, struct bch_extent_crc_unpacked crc) { struct bbuf src_data = { NULL }; size_t src_len = src->bi_iter.bi_size; - size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9; + size_t dst_len = crc.uncompressed_size << 9; int ret; src_data = bio_map_or_bounce(c, src, READ); @@ -212,65 +213,58 @@ err: } int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, - unsigned live_data_sectors, - struct bch_extent_crc128 crc) + struct bch_extent_crc_unpacked *crc) { - struct bbuf dst_data = { NULL }; - size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9; - int ret = -ENOMEM; + struct bbuf data = { NULL }; + size_t dst_len = crc->uncompressed_size << 9; - BUG_ON(DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS) > bio->bi_max_vecs); + /* bio must own its pages: */ + BUG_ON(!bio->bi_vcnt); + BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); - if (crc_uncompressed_size(NULL, &crc) > c->sb.encoded_extent_max || - crc_compressed_size(NULL, &crc) > c->sb.encoded_extent_max) + if (crc->uncompressed_size > c->sb.encoded_extent_max || + crc->compressed_size > c->sb.encoded_extent_max) { + bch_err(c, "error rewriting existing data: extent too big"); return -EIO; + } - dst_data = __bounce_alloc(c, dst_len, WRITE); - - ret = __bio_uncompress(c, bio, dst_data.b, crc); - if (ret) - goto err; - - while (bio->bi_vcnt < DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS)) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; - - bv->bv_page = alloc_page(GFP_NOIO); - if (!bv->bv_page) - goto use_mempool; + data = __bounce_alloc(c, dst_len, WRITE); - bv->bv_len = PAGE_SIZE; - bv->bv_offset = 0; - bio->bi_vcnt++; + if (__bio_uncompress(c, bio, data.b, *crc)) { + bch_err(c, "error rewriting existing data: decompression error"); + bio_unmap_or_unbounce(c, data); + return -EIO; } - bio->bi_iter.bi_size = live_data_sectors << 9; -copy_data: - memcpy_to_bio(bio, bio->bi_iter, dst_data.b + (crc.offset << 9)); -err: - bio_unmap_or_unbounce(c, dst_data); - return ret; -use_mempool: /* - * We already allocated from mempool, we can't allocate from it again - * without freeing the pages we already allocated or else we could - * deadlock: + * might have to free existing pages and retry allocation from mempool - + * do this _after_ decompressing: */ + bch2_bio_alloc_more_pages_pool(c, bio, crc->live_size << 9); + + memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9)); - bch2_bio_free_pages_pool(c, bio); - bch2_bio_alloc_pages_pool(c, bio, live_data_sectors << 9); - goto copy_data; + crc->csum_type = 0; + crc->compression_type = 0; + crc->compressed_size = crc->live_size; + 
crc->uncompressed_size = crc->live_size; + crc->offset = 0; + crc->csum = (struct bch_csum) { 0, 0 }; + + bio_unmap_or_unbounce(c, data); + return 0; } int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, struct bio *dst, struct bvec_iter dst_iter, - struct bch_extent_crc128 crc) + struct bch_extent_crc_unpacked crc) { struct bbuf dst_data = { NULL }; - size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9; + size_t dst_len = crc.uncompressed_size << 9; int ret = -ENOMEM; - if (crc_uncompressed_size(NULL, &crc) > c->sb.encoded_extent_max || - crc_compressed_size(NULL, &crc) > c->sb.encoded_extent_max) + if (crc.uncompressed_size > c->sb.encoded_extent_max || + crc.compressed_size > c->sb.encoded_extent_max) return -EIO; dst_data = dst_len == dst_iter.bi_size @@ -288,21 +282,25 @@ err: return ret; } -static int __bio_compress(struct bch_fs *c, - struct bio *dst, size_t *dst_len, - struct bio *src, size_t *src_len, - unsigned *compression_type) +static unsigned __bio_compress(struct bch_fs *c, + struct bio *dst, size_t *dst_len, + struct bio *src, size_t *src_len, + unsigned compression_type) { struct bbuf src_data = { NULL }, dst_data = { NULL }; unsigned pad; int ret = 0; + /* If it's only one block, don't bother trying to compress: */ + if (bio_sectors(src) <= c->opts.block_size) + goto err; + dst_data = bio_map_or_bounce(c, dst, WRITE); src_data = bio_map_or_bounce(c, src, READ); - switch (*compression_type) { + switch (compression_type) { case BCH_COMPRESSION_LZ4_OLD: - *compression_type = BCH_COMPRESSION_LZ4; + compression_type = BCH_COMPRESSION_LZ4; case BCH_COMPRESSION_LZ4: { void *workspace; @@ -403,19 +401,24 @@ zlib_err: if (dst_data.type != BB_NONE) memcpy_to_bio(dst, dst->bi_iter, dst_data.b); + + BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size); + BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size); + BUG_ON(*dst_len & (block_bytes(c) - 1)); + BUG_ON(*src_len & (block_bytes(c) - 1)); out: bio_unmap_or_unbounce(c, src_data); bio_unmap_or_unbounce(c, dst_data); - return ret; + return compression_type; err: - ret = -1; + compression_type = 0; goto out; } -void bch2_bio_compress(struct bch_fs *c, - struct bio *dst, size_t *dst_len, - struct bio *src, size_t *src_len, - unsigned *compression_type) +unsigned bch2_bio_compress(struct bch_fs *c, + struct bio *dst, size_t *dst_len, + struct bio *src, size_t *src_len, + unsigned compression_type) { unsigned orig_dst = dst->bi_iter.bi_size; unsigned orig_src = src->bi_iter.bi_size; @@ -423,29 +426,15 @@ void bch2_bio_compress(struct bch_fs *c, /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, c->sb.encoded_extent_max << 9); - /* Don't generate a bigger output than input: */ - dst->bi_iter.bi_size = - min(dst->bi_iter.bi_size, src->bi_iter.bi_size); + dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); + + compression_type = + __bio_compress(c, dst, dst_len, src, src_len, compression_type); - /* If it's only one block, don't bother trying to compress: */ - if (*compression_type != BCH_COMPRESSION_NONE && - bio_sectors(src) > c->opts.block_size && - !__bio_compress(c, dst, dst_len, src, src_len, compression_type)) - goto out; - - /* If compressing failed (didn't get smaller), just copy: */ - *compression_type = BCH_COMPRESSION_NONE; - *dst_len = *src_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); - bio_copy_data(dst, src); -out: dst->bi_iter.bi_size = orig_dst; src->bi_iter.bi_size = orig_src; - - BUG_ON(!*dst_len || *dst_len 
> dst->bi_iter.bi_size); - BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size); - BUG_ON(*dst_len & (block_bytes(c) - 1)); - BUG_ON(*src_len & (block_bytes(c) - 1)); + return compression_type; } /* doesn't write superblock: */ diff --git a/libbcachefs/compress.h b/libbcachefs/compress.h index ad1ba25..06fff6a 100644 --- a/libbcachefs/compress.h +++ b/libbcachefs/compress.h @@ -1,12 +1,14 @@ #ifndef _BCACHEFS_COMPRESS_H #define _BCACHEFS_COMPRESS_H +#include "extents_types.h" + int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, - unsigned, struct bch_extent_crc128); + struct bch_extent_crc_unpacked *); int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, - struct bvec_iter, struct bch_extent_crc128); -void bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, - struct bio *, size_t *, unsigned *); + struct bvec_iter, struct bch_extent_crc_unpacked); +unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, + struct bio *, size_t *, unsigned); int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned); void bch2_fs_compress_exit(struct bch_fs *); diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 7d2f5cc..6e79f49 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -19,6 +19,7 @@ #include "inode.h" #include "journal.h" #include "super-io.h" +#include "util.h" #include "xattr.h" #include @@ -155,6 +156,44 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k) return nr_ptrs; } +unsigned bch2_extent_is_compressed(struct bkey_s_c k) +{ + struct bkey_s_c_extent e; + const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + unsigned ret = 0; + + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + e = bkey_s_c_to_extent(k); + + extent_for_each_ptr_crc(e, ptr, crc) + if (!ptr->cached && + crc.compression_type != BCH_COMPRESSION_NONE && + crc.compressed_size < crc.live_size) + ret = max_t(unsigned, ret, crc.compressed_size); + } + + return ret; +} + +bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e, + struct bch_extent_ptr m, u64 offset) +{ + const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + + extent_for_each_ptr_crc(e, ptr, crc) + if (ptr->dev == m.dev && + ptr->gen == m.gen && + (s64) ptr->offset + crc.offset - bkey_start_offset(e.k) == + (s64) m.offset - offset) + return ptr; + + return NULL; +} + /* Doesn't cleanup redundant crcs */ void __bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr) { @@ -186,24 +225,30 @@ found: bch2_extent_drop_ptr(e, ptr); } -/* returns true if equal */ -static bool crc_cmp(union bch_extent_crc *l, union bch_extent_crc *r) +static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, + struct bch_extent_crc_unpacked n) { - return extent_crc_type(l) == extent_crc_type(r) && - !memcmp(l, r, extent_entry_bytes(to_entry(l))); + return !u.compression_type && + u.csum_type && + u.uncompressed_size > u.live_size && + bch2_csum_type_is_encryption(u.csum_type) == + bch2_csum_type_is_encryption(n.csum_type); } -/* Increment pointers after @crc by crc's offset until the next crc entry: */ -void bch2_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_crc *crc) +bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e, + struct bch_extent_crc_unpacked n) { - union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *i; - extent_for_each_entry_from(e, entry, extent_entry_next(to_entry(crc))) { - if (!extent_entry_is_ptr(entry)) - return; 
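Note on the compress.c hunks above: bch2_bio_compress() now returns the compression type it actually used instead of quietly falling back to a copy, a return of 0 (BCH_COMPRESSION_NONE) means the extent is stored uncompressed, and the single-block early-out has moved into __bio_compress(). The same policy as a standalone toy, with zlib standing in for the real LZ4/gzip paths and without the partial-source-consumption handling the real code has (build with -lz):

#include <stdio.h>
#include <string.h>
#include <zlib.h>

#define BLOCK_SIZE 512

/* Try to compress; if the result is not strictly smaller after rounding
 * up to the block size, report 0 so the caller stores the data as is. */
static unsigned try_compress(const unsigned char *src, size_t src_len,
			     unsigned char *dst, size_t *dst_len)
{
	uLongf out_len = *dst_len;

	if (src_len <= BLOCK_SIZE)	/* single block: don't bother */
		return 0;

	if (compress2(dst, &out_len, src, src_len,
		      Z_DEFAULT_COMPRESSION) != Z_OK)
		return 0;

	out_len = (out_len + BLOCK_SIZE - 1) & ~(uLongf) (BLOCK_SIZE - 1);
	if (out_len >= src_len)		/* didn't shrink */
		return 0;

	*dst_len = out_len;
	return 1;			/* "compressed" in this toy scheme */
}

int main(void)
{
	unsigned char src[8192], dst[16384];
	size_t dst_len = sizeof(dst);

	memset(src, 'x', sizeof(src));	/* highly compressible */

	unsigned type = try_compress(src, sizeof(src), dst, &dst_len);

	if (type)
		printf("compressed %zu -> %zu bytes (type %u)\n",
		       sizeof(src), dst_len, type);
	else
		printf("stored uncompressed\n");
	return 0;
}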
+ if (!n.csum_type) + return false; - entry->ptr.offset += crc_offset(crc); - } + extent_for_each_crc(e, crc, i) + if (can_narrow_crc(crc, n)) + return true; + + return false; } /* @@ -214,96 +259,50 @@ void bch2_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_cr * not compressed, we can modify them to point to only the data that is * currently live (so that readers won't have to bounce) while we've got the * checksum we need: - * - * XXX: to guard against data being corrupted while in memory, instead of - * recomputing the checksum here, it would be better in the read path to instead - * of computing the checksum of the entire extent: - * - * | extent | - * - * compute the checksums of the live and dead data separately - * | dead data || live data || dead data | - * - * and then verify that crc_dead1 + crc_live + crc_dead2 == orig_crc, and then - * use crc_live here (that we verified was correct earlier) - * - * note: doesn't work with encryption */ -void bch2_extent_narrow_crcs(struct bkey_s_extent e) +bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, + struct bch_extent_crc_unpacked n) { - union bch_extent_crc *crc; - bool have_wide = false, have_narrow = false; - struct bch_csum csum = { 0 }; - unsigned csum_type = 0; - - extent_for_each_crc(e, crc) { - if (crc_compression_type(crc) || - bch2_csum_type_is_encryption(crc_csum_type(crc))) - continue; - - if (crc_uncompressed_size(e.k, crc) != e.k->size) { - have_wide = true; - } else { - have_narrow = true; - csum = crc_csum(crc); - csum_type = crc_csum_type(crc); - } - } - - if (!have_wide || !have_narrow) - return; - - extent_for_each_crc(e, crc) { - if (crc_compression_type(crc)) - continue; - - if (crc_uncompressed_size(e.k, crc) != e.k->size) { - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: - BUG(); - case BCH_EXTENT_CRC32: - if (bch_crc_bytes[csum_type] > 4) - continue; - - bch2_extent_crc_narrow_pointers(e, crc); - crc->crc32._compressed_size = e.k->size - 1; - crc->crc32._uncompressed_size = e.k->size - 1; - crc->crc32.offset = 0; - crc->crc32.csum_type = csum_type; - crc->crc32.csum = csum.lo; + struct bch_extent_crc_unpacked u; + struct bch_extent_ptr *ptr; + union bch_extent_entry *i; + + /* Find a checksum entry that covers only live data: */ + if (!n.csum_type) + extent_for_each_crc(extent_i_to_s(e), u, i) + if (!u.compression_type && + u.csum_type && + u.live_size == u.uncompressed_size) { + n = u; break; - case BCH_EXTENT_CRC64: - if (bch_crc_bytes[csum_type] > 10) - continue; + } - bch2_extent_crc_narrow_pointers(e, crc); - crc->crc64._compressed_size = e.k->size - 1; - crc->crc64._uncompressed_size = e.k->size - 1; - crc->crc64.offset = 0; - crc->crc64.csum_type = csum_type; - crc->crc64.csum_lo = csum.lo; - crc->crc64.csum_hi = csum.hi; - break; - case BCH_EXTENT_CRC128: - if (bch_crc_bytes[csum_type] > 16) - continue; + if (!bch2_can_narrow_extent_crcs(extent_i_to_s_c(e), n)) + return false; - bch2_extent_crc_narrow_pointers(e, crc); - crc->crc128._compressed_size = e.k->size - 1; - crc->crc128._uncompressed_size = e.k->size - 1; - crc->crc128.offset = 0; - crc->crc128.csum_type = csum_type; - crc->crc128.csum = csum; - break; - } + BUG_ON(n.compression_type); + BUG_ON(n.offset); + BUG_ON(n.live_size != e->k.size); + + bch2_extent_crc_append(e, n); +restart_narrow_pointers: + extent_for_each_ptr_crc(extent_i_to_s(e), ptr, u) + if (can_narrow_crc(u, n)) { + ptr->offset += u.offset; + extent_ptr_append(e, *ptr); + __bch2_extent_drop_ptr(extent_i_to_s(e), ptr); + goto 
restart_narrow_pointers; } - } + + bch2_extent_drop_redundant_crcs(extent_i_to_s(e)); + return true; } void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e) { union bch_extent_entry *entry = e.v->start; union bch_extent_crc *crc, *prev = NULL; + struct bch_extent_crc_unpacked u, prev_u; while (entry != extent_entry_last(e)) { union bch_extent_entry *next = extent_entry_next(entry); @@ -313,6 +312,7 @@ void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e) goto next; crc = entry_to_crc(entry); + u = bch2_extent_crc_unpack(e.k, crc); if (next == extent_entry_last(e)) { /* crc entry with no pointers after it: */ @@ -324,20 +324,28 @@ void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e) goto drop; } - if (prev && crc_cmp(crc, prev)) { + if (prev && !memcmp(&u, &prev_u, sizeof(u))) { /* identical to previous crc entry: */ goto drop; } if (!prev && - !crc_csum_type(crc) && - !crc_compression_type(crc)) { + !u.csum_type && + !u.compression_type) { /* null crc entry: */ - bch2_extent_crc_narrow_pointers(e, crc); + union bch_extent_entry *e2; + + extent_for_each_entry_from(e, e2, extent_entry_next(entry)) { + if (!extent_entry_is_ptr(e2)) + break; + + e2->ptr.offset += u.offset; + } goto drop; } prev = crc; + prev_u = u; next: entry = next; continue; @@ -453,7 +461,7 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf, { char *out = buf, *end = buf + size; const union bch_extent_entry *entry; - const union bch_extent_crc *crc; + struct bch_extent_crc_unpacked crc; const struct bch_extent_ptr *ptr; struct bch_dev *ca; bool first = true; @@ -468,13 +476,14 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf, case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: case BCH_EXTENT_ENTRY_crc128: - crc = entry_to_crc(entry); - - p("crc: c_size %u size %u offset %u csum %u compress %u", - crc_compressed_size(e.k, crc), - crc_uncompressed_size(e.k, crc), - crc_offset(crc), crc_csum_type(crc), - crc_compression_type(crc)); + crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry)); + + p("crc: c_size %u size %u offset %u nonce %u csum %u compress %u", + crc.compressed_size, + crc.uncompressed_size, + crc.offset, crc.nonce, + crc.csum_type, + crc.compression_type); break; case BCH_EXTENT_ENTRY_ptr: ptr = entry_to_ptr(entry); @@ -499,13 +508,24 @@ out: return out - buf; } +static inline bool dev_latency_better(struct bch_dev *dev1, + struct bch_dev *dev2) +{ + unsigned l1 = atomic_read(&dev1->latency[READ]); + unsigned l2 = atomic_read(&dev2->latency[READ]); + + /* Pick at random, biased in favor of the faster device: */ + + return bch2_rand_range(l1 + l2) > l1; +} + static void extent_pick_read_device(struct bch_fs *c, struct bkey_s_c_extent e, struct bch_devs_mask *avoid, struct extent_pick_ptr *pick) { - const union bch_extent_crc *crc; const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; extent_for_each_ptr_crc(e, ptr, crc) { struct bch_dev *ca = c->devs[ptr->dev]; @@ -516,12 +536,18 @@ static void extent_pick_read_device(struct bch_fs *c, if (ca->mi.state == BCH_MEMBER_STATE_FAILED) continue; - if (avoid && test_bit(ca->dev_idx, avoid->d)) - continue; + if (avoid) { + if (test_bit(ca->dev_idx, avoid->d)) + continue; - if (pick->ca && pick->ca->mi.tier < ca->mi.tier) - continue; + if (pick->ca && + test_bit(pick->ca->dev_idx, avoid->d)) + goto use; + } + if (pick->ca && !dev_latency_better(ca, pick->ca)) + continue; +use: if (!percpu_ref_tryget(&ca->io_ref)) continue; @@ -530,11 +556,9 @@ static void extent_pick_read_device(struct bch_fs *c, 
*pick = (struct extent_pick_ptr) { .ptr = *ptr, + .crc = crc, .ca = ca, }; - - if (e.k->size) - pick->crc = crc_to_128(e.k, crc); } } @@ -557,14 +581,17 @@ static const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const union bch_extent_entry *entry; const struct bch_extent_ptr *ptr; - const union bch_extent_crc *crc; const char *reason; - extent_for_each_entry(e, entry) + extent_for_each_entry(e, entry) { if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) return "invalid extent entry type"; - extent_for_each_ptr_crc(e, ptr, crc) { + if (extent_entry_is_crc(entry)) + return "has crc field"; + } + + extent_for_each_ptr(e, ptr) { reason = extent_ptr_invalid(c, e, ptr, c->opts.btree_node_size, true); @@ -572,9 +599,6 @@ static const char *bch2_btree_ptr_invalid(const struct bch_fs *c, return reason; } - if (crc) - return "has crc field"; - return NULL; } @@ -699,28 +723,28 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k) __set_bkey_deleted(k.k); else if (bkey_extent_is_data(k.k)) { struct bkey_s_extent e = bkey_s_to_extent(k); - struct bch_extent_ptr *ptr; - union bch_extent_crc *crc, *prev_crc = NULL; + union bch_extent_entry *entry; + bool seen_crc = false; - extent_for_each_ptr_crc(e, ptr, crc) { - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: - ptr->offset += e.k->size - len; + extent_for_each_entry(e, entry) { + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + if (!seen_crc) + entry->ptr.offset += e.k->size - len; break; - case BCH_EXTENT_CRC32: - if (prev_crc != crc) - crc->crc32.offset += e.k->size - len; + case BCH_EXTENT_ENTRY_crc32: + entry->crc32.offset += e.k->size - len; break; - case BCH_EXTENT_CRC64: - if (prev_crc != crc) - crc->crc64.offset += e.k->size - len; + case BCH_EXTENT_ENTRY_crc64: + entry->crc64.offset += e.k->size - len; break; - case BCH_EXTENT_CRC128: - if (prev_crc != crc) - crc->crc128.offset += e.k->size - len; + case BCH_EXTENT_ENTRY_crc128: + entry->crc128.offset += e.k->size - len; break; } - prev_crc = crc; + + if (extent_entry_is_crc(entry)) + seen_crc = true; } } @@ -989,7 +1013,7 @@ static void bch2_add_sectors(struct extent_insert_state *s, return; bch2_mark_key(c, k, sectors, false, gc_pos_btree_node(b), - &s->stats, s->trans->journal_res.seq); + &s->stats, s->trans->journal_res.seq, 0); } static void bch2_subtract_sectors(struct extent_insert_state *s, @@ -1123,7 +1147,7 @@ static void extent_insert_committed(struct extent_insert_state *s) if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) && bkey_cmp(s->committed, insert->k.p) && - bkey_extent_is_compressed(bkey_i_to_s_c(insert))) { + bch2_extent_is_compressed(bkey_i_to_s_c(insert))) { /* XXX: possibly need to increase our reservation? */ bch2_cut_subtract_back(s, s->committed, bkey_i_to_s(&split.k)); @@ -1152,46 +1176,24 @@ done: s->trans->did_work = true; } -static enum extent_insert_hook_ret +static enum btree_insert_ret __extent_insert_advance_pos(struct extent_insert_state *s, struct bpos next_pos, struct bkey_s_c k) { struct extent_insert_hook *hook = s->trans->hook; - enum extent_insert_hook_ret ret; -#if 0 - /* - * Currently disabled for encryption - broken with fcollapse. 
Will have - * to reenable when versions are exposed for send/receive - versions - * will have to be monotonic then: - */ - if (k.k && k.k->size && - !bversion_zero(s->insert->k->k.version) && - bversion_cmp(k.k->version, s->insert->k->k.version) > 0) { - ret = BTREE_HOOK_NO_INSERT; - } else -#endif + enum btree_insert_ret ret; + if (hook) ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k); else - ret = BTREE_HOOK_DO_INSERT; + ret = BTREE_INSERT_OK; EBUG_ON(bkey_deleted(&s->insert->k->k) || !s->insert->k->k.size); - switch (ret) { - case BTREE_HOOK_DO_INSERT: - break; - case BTREE_HOOK_NO_INSERT: - extent_insert_committed(s); - bch2_cut_subtract_front(s, next_pos, bkey_i_to_s(s->insert->k)); - - bch2_btree_iter_set_pos_same_leaf(s->insert->iter, next_pos); - break; - case BTREE_HOOK_RESTART_TRANS: - return ret; - } + if (ret == BTREE_INSERT_OK) + s->committed = next_pos; - s->committed = next_pos; return ret; } @@ -1199,39 +1201,28 @@ __extent_insert_advance_pos(struct extent_insert_state *s, * Update iter->pos, marking how much of @insert we've processed, and call hook * fn: */ -static enum extent_insert_hook_ret +static enum btree_insert_ret extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k) { struct btree *b = s->insert->iter->nodes[0]; struct bpos next_pos = bpos_min(s->insert->k->k.p, k.k ? k.k->p : b->key.k.p); + enum btree_insert_ret ret; + + if (race_fault()) + return BTREE_INSERT_NEED_TRAVERSE; /* hole? */ if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) { - bool have_uncommitted = bkey_cmp(s->committed, - bkey_start_pos(&s->insert->k->k)) > 0; - - switch (__extent_insert_advance_pos(s, bkey_start_pos(k.k), - bkey_s_c_null)) { - case BTREE_HOOK_DO_INSERT: - break; - case BTREE_HOOK_NO_INSERT: - /* - * we had to split @insert and insert the committed - * part - need to bail out and recheck journal - * reservation/btree node before we advance pos past @k: - */ - if (have_uncommitted) - return BTREE_HOOK_NO_INSERT; - break; - case BTREE_HOOK_RESTART_TRANS: - return BTREE_HOOK_RESTART_TRANS; - } + ret = __extent_insert_advance_pos(s, bkey_start_pos(k.k), + bkey_s_c_null); + if (ret != BTREE_INSERT_OK) + return ret; } /* avoid redundant calls to hook fn: */ if (!bkey_cmp(s->committed, next_pos)) - return BTREE_HOOK_DO_INSERT; + return BTREE_INSERT_OK; return __extent_insert_advance_pos(s, next_pos, k); } @@ -1245,7 +1236,7 @@ extent_insert_check_split_compressed(struct extent_insert_state *s, unsigned sectors; if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && - (sectors = bkey_extent_is_compressed(k))) { + (sectors = bch2_extent_is_compressed(k))) { int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD; if (s->trans->flags & BTREE_INSERT_NOFAIL) @@ -1277,6 +1268,7 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, struct btree_iter *iter = s->insert->iter; struct btree *b = iter->nodes[0]; struct btree_node_iter *node_iter = &iter->node_iters[0]; + enum btree_insert_ret ret; switch (overlap) { case BCH_EXTENT_OVERLAP_FRONT: @@ -1322,9 +1314,9 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, k.k->p = orig_pos; extent_save(b, node_iter, _k, k.k); - if (extent_insert_advance_pos(s, k.s_c) == - BTREE_HOOK_RESTART_TRANS) - return BTREE_INSERT_NEED_TRAVERSE; + ret = extent_insert_advance_pos(s, k.s_c); + if (ret != BTREE_INSERT_OK) + return ret; extent_insert_committed(s); /* @@ -1420,15 +1412,9 @@ bch2_delete_fixup_extent(struct extent_insert_state *s) if (ret != BTREE_INSERT_OK) goto stop; - switch 
(extent_insert_advance_pos(s, k.s_c)) { - case BTREE_HOOK_DO_INSERT: - break; - case BTREE_HOOK_NO_INSERT: - continue; - case BTREE_HOOK_RESTART_TRANS: - ret = BTREE_INSERT_NEED_TRAVERSE; + ret = extent_insert_advance_pos(s, k.s_c); + if (ret) goto stop; - } s->do_journal = true; @@ -1469,10 +1455,9 @@ next: bch2_btree_iter_set_pos_same_leaf(iter, s->committed); } - if (bkey_cmp(s->committed, insert->k.p) < 0 && - ret == BTREE_INSERT_OK && - extent_insert_advance_pos(s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS) - ret = BTREE_INSERT_NEED_TRAVERSE; + if (ret == BTREE_INSERT_OK && + bkey_cmp(s->committed, insert->k.p) < 0) + ret = extent_insert_advance_pos(s, bkey_s_c_null); stop: extent_insert_committed(s); @@ -1594,18 +1579,10 @@ bch2_insert_fixup_extent(struct btree_insert *trans, /* * Only call advance pos & call hook for nonzero size extents: - * If hook returned BTREE_HOOK_NO_INSERT, @insert->k no longer - * overlaps with @k: */ - switch (extent_insert_advance_pos(&s, k.s_c)) { - case BTREE_HOOK_DO_INSERT: - break; - case BTREE_HOOK_NO_INSERT: - continue; - case BTREE_HOOK_RESTART_TRANS: - ret = BTREE_INSERT_NEED_TRAVERSE; + ret = extent_insert_advance_pos(&s, k.s_c); + if (ret != BTREE_INSERT_OK) goto stop; - } if (k.k->size && (k.k->needs_whiteout || bset_written(b, bset(b, t)))) @@ -1623,10 +1600,9 @@ squash: goto stop; } - if (bkey_cmp(s.committed, insert->k->k.p) < 0 && - ret == BTREE_INSERT_OK && - extent_insert_advance_pos(&s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS) - ret = BTREE_INSERT_NEED_TRAVERSE; + if (ret == BTREE_INSERT_OK && + bkey_cmp(s.committed, insert->k->k.p) < 0) + ret = extent_insert_advance_pos(&s, bkey_s_c_null); stop: extent_insert_committed(&s); /* @@ -1669,29 +1645,37 @@ static const char *bch2_extent_invalid(const struct bch_fs *c, case BCH_EXTENT_CACHED: { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const union bch_extent_entry *entry; - const union bch_extent_crc *crc; + struct bch_extent_crc_unpacked crc; const struct bch_extent_ptr *ptr; unsigned size_ondisk = e.k->size; const char *reason; + unsigned nonce = UINT_MAX; extent_for_each_entry(e, entry) { if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) return "invalid extent entry type"; if (extent_entry_is_crc(entry)) { - crc = entry_to_crc(entry); + crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry)); - if (crc_offset(crc) + e.k->size > - crc_uncompressed_size(e.k, crc)) + if (crc.offset + e.k->size > + crc.uncompressed_size) return "checksum offset + key size > uncompressed size"; - size_ondisk = crc_compressed_size(e.k, crc); + size_ondisk = crc.compressed_size; - if (!bch2_checksum_type_valid(c, crc_csum_type(crc))) + if (!bch2_checksum_type_valid(c, crc.csum_type)) return "invalid checksum type"; - if (crc_compression_type(crc) >= BCH_COMPRESSION_NR) + if (crc.compression_type >= BCH_COMPRESSION_NR) return "invalid compression type"; + + if (bch2_csum_type_is_encryption(crc.csum_type)) { + if (nonce == UINT_MAX) + nonce = crc.offset + crc.nonce; + else if (nonce != crc.offset + crc.nonce) + return "incorrect nonce"; + } } else { ptr = entry_to_ptr(entry); @@ -1864,102 +1848,75 @@ static unsigned PTR_TIER(struct bch_fs *c, } static void bch2_extent_crc_init(union bch_extent_crc *crc, - unsigned compressed_size, - unsigned uncompressed_size, - unsigned compression_type, - unsigned nonce, - struct bch_csum csum, unsigned csum_type) -{ - if (bch_crc_bytes[csum_type] <= 4 && - uncompressed_size <= CRC32_SIZE_MAX && - nonce <= CRC32_NONCE_MAX) { + struct bch_extent_crc_unpacked new) +{ 
+#define common_fields(_crc) \ + .csum_type = _crc.csum_type, \ + .compression_type = _crc.compression_type, \ + ._compressed_size = _crc.compressed_size - 1, \ + ._uncompressed_size = _crc.uncompressed_size - 1, \ + .offset = _crc.offset + + if (bch_crc_bytes[new.csum_type] <= 4 && + new.uncompressed_size <= CRC32_SIZE_MAX && + new.nonce <= CRC32_NONCE_MAX) { crc->crc32 = (struct bch_extent_crc32) { .type = 1 << BCH_EXTENT_ENTRY_crc32, - ._compressed_size = compressed_size - 1, - ._uncompressed_size = uncompressed_size - 1, - .offset = 0, - .compression_type = compression_type, - .csum_type = csum_type, - .csum = *((__le32 *) &csum.lo), + common_fields(new), + .csum = *((__le32 *) &new.csum.lo), }; return; } - if (bch_crc_bytes[csum_type] <= 10 && - uncompressed_size <= CRC64_SIZE_MAX && - nonce <= CRC64_NONCE_MAX) { + if (bch_crc_bytes[new.csum_type] <= 10 && + new.uncompressed_size <= CRC64_SIZE_MAX && + new.nonce <= CRC64_NONCE_MAX) { crc->crc64 = (struct bch_extent_crc64) { .type = 1 << BCH_EXTENT_ENTRY_crc64, - ._compressed_size = compressed_size - 1, - ._uncompressed_size = uncompressed_size - 1, - .offset = 0, - .nonce = nonce, - .compression_type = compression_type, - .csum_type = csum_type, - .csum_lo = csum.lo, - .csum_hi = *((__le16 *) &csum.hi), + common_fields(new), + .nonce = new.nonce, + .csum_lo = new.csum.lo, + .csum_hi = *((__le16 *) &new.csum.hi), }; return; } - if (bch_crc_bytes[csum_type] <= 16 && - uncompressed_size <= CRC128_SIZE_MAX && - nonce <= CRC128_NONCE_MAX) { + if (bch_crc_bytes[new.csum_type] <= 16 && + new.uncompressed_size <= CRC128_SIZE_MAX && + new.nonce <= CRC128_NONCE_MAX) { crc->crc128 = (struct bch_extent_crc128) { .type = 1 << BCH_EXTENT_ENTRY_crc128, - ._compressed_size = compressed_size - 1, - ._uncompressed_size = uncompressed_size - 1, - .offset = 0, - .nonce = nonce, - .compression_type = compression_type, - .csum_type = csum_type, - .csum = csum, + common_fields(new), + .nonce = new.nonce, + .csum = new.csum, }; return; } - +#undef common_fields BUG(); } void bch2_extent_crc_append(struct bkey_i_extent *e, - unsigned compressed_size, - unsigned uncompressed_size, - unsigned compression_type, - unsigned nonce, - struct bch_csum csum, unsigned csum_type) + struct bch_extent_crc_unpacked new) { - union bch_extent_crc *crc; + struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *i; - BUG_ON(compressed_size > uncompressed_size); - BUG_ON(uncompressed_size != e->k.size); - BUG_ON(!compressed_size || !uncompressed_size); + BUG_ON(new.compressed_size > new.uncompressed_size); + BUG_ON(new.live_size != e->k.size); + BUG_ON(!new.compressed_size || !new.uncompressed_size); /* * Look up the last crc entry, so we can check if we need to add * another: */ - extent_for_each_crc(extent_i_to_s(e), crc) + extent_for_each_crc(extent_i_to_s(e), crc, i) ; - if (!crc && !csum_type && !compression_type) - return; - - if (crc && - crc_compressed_size(&e->k, crc) == compressed_size && - crc_uncompressed_size(&e->k, crc) == uncompressed_size && - crc_offset(crc) == 0 && - crc_nonce(crc) == nonce && - crc_csum_type(crc) == csum_type && - crc_compression_type(crc) == compression_type && - crc_csum(crc).lo == csum.lo && - crc_csum(crc).hi == csum.hi) + if (!memcmp(&crc, &new, sizeof(crc))) return; - bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), - compressed_size, - uncompressed_size, - compression_type, - nonce, csum, csum_type); + bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new); __extent_entry_push(e); } @@ 
-2011,16 +1968,22 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) } void bch2_extent_mark_replicas_cached(struct bch_fs *c, - struct bkey_s_extent e, - unsigned nr_cached) + struct bkey_s_extent e) { struct bch_extent_ptr *ptr; + unsigned tier = 0, nr_cached = 0, nr_good = 0; bool have_higher_tier; - unsigned tier = 0; - if (!nr_cached) + extent_for_each_ptr(e, ptr) + if (!ptr->cached && + c->devs[ptr->dev]->mi.state != BCH_MEMBER_STATE_FAILED) + nr_good++; + + if (nr_good <= c->opts.data_replicas) return; + nr_cached = nr_good - c->opts.data_replicas; + do { have_higher_tier = false; diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 634159f..1ec2db5 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -3,7 +3,7 @@ #include "bcachefs.h" #include "bkey.h" -#include "io_types.h" +#include "extents_types.h" struct bch_fs; struct journal_res; @@ -38,11 +38,17 @@ bch2_insert_fixup_extent(struct btree_insert *, struct btree_insert_entry *); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); -void bch2_extent_mark_replicas_cached(struct bch_fs *, - struct bkey_s_extent, unsigned); +void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent); + +const struct bch_extent_ptr * +bch2_extent_has_device(struct bkey_s_c_extent, unsigned); unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent); unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c); +unsigned bch2_extent_is_compressed(struct bkey_s_c); + +bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent, + struct bch_extent_ptr, u64); static inline bool bkey_extent_is_data(const struct bkey *k) { @@ -67,6 +73,12 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k) } } +static inline bool bch2_extent_is_fully_allocated(struct bkey_s_c k) +{ + return bkey_extent_is_allocation(k.k) && + !bch2_extent_is_compressed(k); +} + static inline bool bkey_extent_is_cached(const struct bkey *k) { return k->type == BCH_EXTENT_CACHED; @@ -170,6 +182,8 @@ union bch_extent_crc { (struct bch_extent_ptr *) (_entry)); \ }) +/* checksum entries: */ + enum bch_extent_crc_type { BCH_EXTENT_CRC_NONE, BCH_EXTENT_CRC32, @@ -208,6 +222,50 @@ __extent_crc_type(const union bch_extent_crc *crc) : __extent_crc_type((union bch_extent_crc *) _crc); \ }) +static inline struct bch_extent_crc_unpacked +bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) +{ +#define common_fields(_crc) \ + .csum_type = _crc.csum_type, \ + .compression_type = _crc.compression_type, \ + .compressed_size = _crc._compressed_size + 1, \ + .uncompressed_size = _crc._uncompressed_size + 1, \ + .offset = _crc.offset, \ + .live_size = k->size + + switch (extent_crc_type(crc)) { + case BCH_EXTENT_CRC_NONE: + return (struct bch_extent_crc_unpacked) { + .compressed_size = k->size, + .uncompressed_size = k->size, + .live_size = k->size, + }; + case BCH_EXTENT_CRC32: + return (struct bch_extent_crc_unpacked) { + common_fields(crc->crc32), + .csum.lo = crc->crc32.csum, + }; + case BCH_EXTENT_CRC64: + return (struct bch_extent_crc_unpacked) { + common_fields(crc->crc64), + .nonce = crc->crc64.nonce, + .csum.lo = crc->crc64.csum_lo, + .csum.hi = crc->crc64.csum_hi, + }; + case BCH_EXTENT_CRC128: + return (struct bch_extent_crc_unpacked) { + common_fields(crc->crc128), + .nonce = crc->crc128.nonce, + .csum = crc->crc128.csum, + }; + default: + BUG(); + } +#undef common_fields +} + +/* Extent entry iteration: */ + #define extent_entry_next(_entry) \ ((typeof(_entry)) ((void *) (_entry) + 
extent_entry_bytes(_entry))) @@ -226,7 +284,7 @@ __extent_crc_type(const union bch_extent_crc *crc) /* Iterate over crcs only: */ -#define extent_crc_next(_e, _p) \ +#define __extent_crc_next(_e, _p) \ ({ \ typeof(&(_e).v->start[0]) _entry = _p; \ \ @@ -237,25 +295,41 @@ __extent_crc_type(const union bch_extent_crc *crc) entry_to_crc(_entry < extent_entry_last(_e) ? _entry : NULL); \ }) -#define extent_for_each_crc(_e, _crc) \ - for ((_crc) = extent_crc_next(_e, (_e).v->start); \ +#define __extent_for_each_crc(_e, _crc) \ + for ((_crc) = __extent_crc_next(_e, (_e).v->start); \ (_crc); \ - (_crc) = extent_crc_next(_e, extent_entry_next(to_entry(_crc)))) + (_crc) = __extent_crc_next(_e, extent_entry_next(to_entry(_crc)))) + +#define extent_crc_next(_e, _crc, _iter) \ +({ \ + extent_for_each_entry_from(_e, _iter, _iter) \ + if (extent_entry_is_crc(_iter)) { \ + (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_iter));\ + break; \ + } \ + \ + (_iter) < extent_entry_last(_e); \ +}) + +#define extent_for_each_crc(_e, _crc, _iter) \ + for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \ + (_iter) = (_e).v->start; \ + extent_crc_next(_e, _crc, _iter); \ + (_iter) = extent_entry_next(_iter)) /* Iterate over pointers, with crcs: */ -#define extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter) \ +#define extent_ptr_crc_next(_e, _ptr, _crc) \ ({ \ __label__ out; \ typeof(&(_e).v->start[0]) _entry; \ \ extent_for_each_entry_from(_e, _entry, to_entry(_ptr)) \ if (extent_entry_is_crc(_entry)) { \ - (_crc) = entry_to_crc(_entry); \ + (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_entry));\ } else { \ _ptr = entry_to_ptr(_entry); \ - if (_filter) \ - goto out; \ + goto out; \ } \ \ _ptr = NULL; \ @@ -263,35 +337,26 @@ out: \ _ptr; \ }) -#define extent_for_each_ptr_crc_filter(_e, _ptr, _crc, _filter) \ - for ((_crc) = NULL, \ +#define extent_for_each_ptr_crc(_e, _ptr, _crc) \ + for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \ (_ptr) = &(_e).v->start->ptr; \ - ((_ptr) = extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter));\ + ((_ptr) = extent_ptr_crc_next(_e, _ptr, _crc)); \ (_ptr)++) -#define extent_for_each_ptr_crc(_e, _ptr, _crc) \ - extent_for_each_ptr_crc_filter(_e, _ptr, _crc, true) - /* Iterate over pointers only, and from a given position: */ -#define extent_ptr_next_filter(_e, _ptr, _filter) \ +#define extent_ptr_next(_e, _ptr) \ ({ \ - typeof(__entry_to_crc(&(_e).v->start[0])) _crc; \ + struct bch_extent_crc_unpacked _crc; \ \ - extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter); \ + extent_ptr_crc_next(_e, _ptr, _crc); \ }) -#define extent_ptr_next(_e, _ptr) \ - extent_ptr_next_filter(_e, _ptr, true) - -#define extent_for_each_ptr_filter(_e, _ptr, _filter) \ +#define extent_for_each_ptr(_e, _ptr) \ for ((_ptr) = &(_e).v->start->ptr; \ - ((_ptr) = extent_ptr_next_filter(_e, _ptr, _filter)); \ + ((_ptr) = extent_ptr_next(_e, _ptr)); \ (_ptr)++) -#define extent_for_each_ptr(_e, _ptr) \ - extent_for_each_ptr_filter(_e, _ptr, true) - #define extent_ptr_prev(_e, _ptr) \ ({ \ typeof(&(_e).v->start->ptr) _p; \ @@ -315,8 +380,8 @@ out: \ (_ptr); \ (_ptr) = extent_ptr_prev(_e, _ptr)) -void bch2_extent_crc_append(struct bkey_i_extent *, unsigned, unsigned, - unsigned, unsigned, struct bch_csum, unsigned); +void bch2_extent_crc_append(struct bkey_i_extent *, + struct bch_extent_crc_unpacked); static inline void __extent_entry_push(struct bkey_i_extent *e) { @@ -336,226 +401,26 @@ static inline void extent_ptr_append(struct bkey_i_extent *e, __extent_entry_push(e); } -static inline struct 
bch_extent_crc128 crc_to_128(const struct bkey *k, - const union bch_extent_crc *crc) +static inline struct bch_devs_list bch2_extent_devs(struct bkey_s_c_extent e) { - EBUG_ON(!k->size); - - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: - return (struct bch_extent_crc128) { - ._compressed_size = k->size - 1, - ._uncompressed_size = k->size - 1, - }; - case BCH_EXTENT_CRC32: - return (struct bch_extent_crc128) { - .type = 1 << BCH_EXTENT_ENTRY_crc128, - ._compressed_size = crc->crc32._compressed_size, - ._uncompressed_size = crc->crc32._uncompressed_size, - .offset = crc->crc32.offset, - .csum_type = crc->crc32.csum_type, - .compression_type = crc->crc32.compression_type, - .csum.lo = crc->crc32.csum, - }; - case BCH_EXTENT_CRC64: - return (struct bch_extent_crc128) { - .type = 1 << BCH_EXTENT_ENTRY_crc128, - ._compressed_size = crc->crc64._compressed_size, - ._uncompressed_size = crc->crc64._uncompressed_size, - .offset = crc->crc64.offset, - .nonce = crc->crc64.nonce, - .csum_type = crc->crc64.csum_type, - .compression_type = crc->crc64.compression_type, - .csum.lo = crc->crc64.csum_lo, - .csum.hi = crc->crc64.csum_hi, - }; - case BCH_EXTENT_CRC128: - return crc->crc128; - default: - BUG(); - } -} - -#define crc_compressed_size(_k, _crc) \ -({ \ - unsigned _size = 0; \ - \ - switch (extent_crc_type(_crc)) { \ - case BCH_EXTENT_CRC_NONE: \ - _size = ((const struct bkey *) (_k))->size; \ - break; \ - case BCH_EXTENT_CRC32: \ - _size = ((struct bch_extent_crc32 *) _crc) \ - ->_compressed_size + 1; \ - break; \ - case BCH_EXTENT_CRC64: \ - _size = ((struct bch_extent_crc64 *) _crc) \ - ->_compressed_size + 1; \ - break; \ - case BCH_EXTENT_CRC128: \ - _size = ((struct bch_extent_crc128 *) _crc) \ - ->_compressed_size + 1; \ - break; \ - } \ - _size; \ -}) - -#define crc_uncompressed_size(_k, _crc) \ -({ \ - unsigned _size = 0; \ - \ - switch (extent_crc_type(_crc)) { \ - case BCH_EXTENT_CRC_NONE: \ - _size = ((const struct bkey *) (_k))->size; \ - break; \ - case BCH_EXTENT_CRC32: \ - _size = ((struct bch_extent_crc32 *) _crc) \ - ->_uncompressed_size + 1; \ - break; \ - case BCH_EXTENT_CRC64: \ - _size = ((struct bch_extent_crc64 *) _crc) \ - ->_uncompressed_size + 1; \ - break; \ - case BCH_EXTENT_CRC128: \ - _size = ((struct bch_extent_crc128 *) _crc) \ - ->_uncompressed_size + 1; \ - break; \ - } \ - _size; \ -}) - -static inline unsigned crc_offset(const union bch_extent_crc *crc) -{ - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: - return 0; - case BCH_EXTENT_CRC32: - return crc->crc32.offset; - case BCH_EXTENT_CRC64: - return crc->crc64.offset; - case BCH_EXTENT_CRC128: - return crc->crc128.offset; - default: - BUG(); - } -} - -static inline unsigned crc_nonce(const union bch_extent_crc *crc) -{ - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: - case BCH_EXTENT_CRC32: - return 0; - case BCH_EXTENT_CRC64: - return crc->crc64.nonce; - case BCH_EXTENT_CRC128: - return crc->crc128.nonce; - default: - BUG(); - } -} - -static inline unsigned crc_csum_type(const union bch_extent_crc *crc) -{ - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: - return 0; - case BCH_EXTENT_CRC32: - return crc->crc32.csum_type; - case BCH_EXTENT_CRC64: - return crc->crc64.csum_type; - case BCH_EXTENT_CRC128: - return crc->crc128.csum_type; - default: - BUG(); - } -} - -static inline unsigned crc_compression_type(const union bch_extent_crc *crc) -{ - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: - return 0; - case BCH_EXTENT_CRC32: - return 
crc->crc32.compression_type; - case BCH_EXTENT_CRC64: - return crc->crc64.compression_type; - case BCH_EXTENT_CRC128: - return crc->crc128.compression_type; - default: - BUG(); - } -} - -static inline struct bch_csum crc_csum(const union bch_extent_crc *crc) -{ - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: - return (struct bch_csum) { 0 }; - case BCH_EXTENT_CRC32: - return (struct bch_csum) { .lo = crc->crc32.csum }; - case BCH_EXTENT_CRC64: - return (struct bch_csum) { - .lo = crc->crc64.csum_lo, - .hi = crc->crc64.csum_hi, - }; - case BCH_EXTENT_CRC128: - return crc->crc128.csum; - default: - BUG(); - } -} - -static inline unsigned bkey_extent_is_compressed(struct bkey_s_c k) -{ - struct bkey_s_c_extent e; + struct bch_devs_list ret = (struct bch_devs_list) { 0 }; const struct bch_extent_ptr *ptr; - const union bch_extent_crc *crc; - unsigned ret = 0; - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - e = bkey_s_c_to_extent(k); - - extent_for_each_ptr_crc(e, ptr, crc) - if (!ptr->cached && - crc_compression_type(crc) != BCH_COMPRESSION_NONE && - crc_compressed_size(e.k, crc) < k.k->size) - ret = max_t(unsigned, ret, - crc_compressed_size(e.k, crc)); - } + extent_for_each_ptr(e, ptr) + ret.devs[ret.nr++] = ptr->dev; return ret; } -static inline unsigned extent_current_nonce(struct bkey_s_c_extent e) -{ - const union bch_extent_crc *crc; - - extent_for_each_crc(e, crc) - if (bch2_csum_type_is_encryption(crc_csum_type(crc))) - return crc_offset(crc) + crc_nonce(crc); - - return 0; -} - -void bch2_extent_narrow_crcs(struct bkey_s_extent); +bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent, + struct bch_extent_crc_unpacked); +bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked); void bch2_extent_drop_redundant_crcs(struct bkey_s_extent); void __bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *); void bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *); void bch2_extent_drop_ptr_idx(struct bkey_s_extent, unsigned); -const struct bch_extent_ptr * -bch2_extent_has_device(struct bkey_s_c_extent, unsigned); -struct bch_extent_ptr * -bch2_extent_find_ptr(struct bch_fs *, struct bkey_s_extent, - struct bch_extent_ptr); -struct bch_extent_ptr * -bch2_extent_find_matching_ptr(struct bch_fs *, struct bkey_s_extent, - struct bkey_s_c_extent); - bool bch2_cut_front(struct bpos, struct bkey_i *); bool bch2_cut_back(struct bpos, struct bkey *); void bch2_key_resize(struct bkey *, unsigned); diff --git a/libbcachefs/extents_types.h b/libbcachefs/extents_types.h new file mode 100644 index 0000000..15805cd --- /dev/null +++ b/libbcachefs/extents_types.h @@ -0,0 +1,27 @@ +#ifndef _BCACHEFS_EXTENTS_TYPES_H +#define _BCACHEFS_EXTENTS_TYPES_H + +#include "bcachefs_format.h" + +struct bch_extent_crc_unpacked { + u8 csum_type; + u8 compression_type; + + u16 compressed_size; + u16 uncompressed_size; + + u16 offset; + u16 live_size; + + u16 nonce; + + struct bch_csum csum; +}; + +struct extent_pick_ptr { + struct bch_extent_ptr ptr; + struct bch_extent_crc_unpacked crc; + struct bch_dev *ca; +}; + +#endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/libbcachefs/eytzinger.h b/libbcachefs/eytzinger.h index 04dcfc5..66fa227 100644 --- a/libbcachefs/eytzinger.h +++ b/libbcachefs/eytzinger.h @@ -80,7 +80,7 @@ static inline unsigned eytzinger1_prev(unsigned i, unsigned size) EBUG_ON(i >= size); if (eytzinger1_left_child(i) < size) { - i = eytzinger1_left_child(i); + i = eytzinger1_left_child(i) + 1; i <<= __fls(size) - 
__fls(i); i -= 1; @@ -163,38 +163,6 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) (_i) != 0; \ (_i) = eytzinger1_next((_i), (_size))) -#if 0 -void eytzinger0_test(void) -{ - unsigned i, j, size; - - for (size = 2; - size < 65536000; - size++) { - if (!(size % 4096)) - printk(KERN_INFO "tree size %u\n", size); - - assert(eytzinger1_prev(0, size) == eytzinger1_last(size)); - assert(eytzinger1_next(0, size) == eytzinger1_first(size)); - - assert(eytzinger1_prev(eytzinger1_first(size), size) == 0); - assert(eytzinger1_next(eytzinger1_last(size), size) == 0); - - eytzinger1_for_each(j, size) { - assert(from_inorder(i, size) == j); - assert(to_inorder(j, size) == i); - - if (j != eytzinger1_last(size)) { - unsigned next = eytzinger1_next(j, size); - - assert(eytzinger1_prev(next, size) == j); - } - } - } - -} -#endif - /* Zero based indexing version: */ static inline unsigned eytzinger0_child(unsigned i, unsigned child) @@ -214,27 +182,29 @@ static inline unsigned eytzinger0_right_child(unsigned i) return eytzinger0_child(i, 1); } -#if 0 static inline unsigned eytzinger0_first(unsigned size) { + return eytzinger1_first(size + 1) - 1; } static inline unsigned eytzinger0_last(unsigned size) { + return eytzinger1_last(size + 1) - 1; } static inline unsigned eytzinger0_next(unsigned i, unsigned size) { + return eytzinger1_next(i + 1, size + 1) - 1; } static inline unsigned eytzinger0_prev(unsigned i, unsigned size) { + return eytzinger1_prev(i + 1, size + 1) - 1; } -#endif static inline unsigned eytzinger0_extra(unsigned size) { - return (size + 1 - rounddown_pow_of_two(size)) << 1; + return eytzinger1_extra(size + 1); } static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, @@ -259,10 +229,41 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size)); } +#define eytzinger0_for_each(_i, _size) \ + for ((_i) = eytzinger0_first((_size)); \ + (_i) != -1; \ + (_i) = eytzinger0_next((_i), (_size))) + typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); +/* return greatest node <= @search, or -1 if not found */ +static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, + eytzinger_cmp_fn cmp, const void *search) +{ + unsigned i, n = 0; + + if (!nr) + return -1; + + do { + i = n; + n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); + } while (n < nr); + + if (n & 1) { + /* @i was greater than @search, return previous node: */ + + if (i == eytzinger0_first(nr)) + return -1; + + return eytzinger0_prev(i, nr); + } else { + return i; + } +} + static inline size_t eytzinger0_find(void *base, size_t nr, size_t size, - eytzinger_cmp_fn cmp, void *search) + eytzinger_cmp_fn cmp, const void *search) { size_t i = 0; int res; @@ -271,17 +272,6 @@ static inline size_t eytzinger0_find(void *base, size_t nr, size_t size, (res = cmp(search, base + i * size, size))) i = eytzinger0_child(i, res > 0); - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { - bool found1 = i < nr, found2 = false; - size_t j; - - for (j = 0; j < nr; j++) - if (!cmp(base + j * size, search, size)) - found2 = true; - - BUG_ON(found1 != found2); - } - return i; } diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 8b41be8..298e359 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -26,9 +26,67 @@ #include #include -struct bio_set *bch2_writepage_bioset; -struct bio_set *bch2_dio_read_bioset; -struct bio_set *bch2_dio_write_bioset; +struct i_sectors_hook { + 
struct extent_insert_hook hook; + s64 sectors; + struct bch_inode_info *inode; +}; + +struct bchfs_write_op { + struct bch_inode_info *inode; + s64 sectors_added; + bool is_dio; + bool unalloc; + u64 new_i_size; + + /* must be last: */ + struct bch_write_op op; +}; + +static inline void bch2_fswrite_op_init(struct bchfs_write_op *op, + struct bch_inode_info *inode, + bool is_dio) +{ + op->inode = inode; + op->sectors_added = 0; + op->is_dio = is_dio; + op->unalloc = false; + op->new_i_size = U64_MAX; +} + +struct bch_writepage_io { + struct closure cl; + + /* must be last: */ + struct bchfs_write_op op; +}; + +struct dio_write { + struct closure cl; + struct kiocb *req; + struct bch_fs *c; + long written; + long error; + loff_t offset; + + struct disk_reservation res; + + struct iovec *iovec; + struct iovec inline_vecs[UIO_FASTIOV]; + struct iov_iter iter; + + struct task_struct *task; + + /* must be last: */ + struct bchfs_write_op iop; +}; + +struct dio_read { + struct closure cl; + struct kiocb *req; + long ret; + struct bch_read_bio rbio; +}; /* pagecache_block must be held */ static int write_invalidate_inode_pages_range(struct address_space *mapping, @@ -101,7 +159,7 @@ static inline void i_size_dirty_get(struct bch_inode_info *inode) /* i_sectors accounting: */ -static enum extent_insert_hook_ret +static enum btree_insert_ret i_sectors_hook_fn(struct extent_insert_hook *hook, struct bpos committed_pos, struct bpos next_pos, @@ -119,7 +177,7 @@ i_sectors_hook_fn(struct extent_insert_hook *hook, h->sectors += sectors * sign; - return BTREE_HOOK_DO_INSERT; + return BTREE_INSERT_OK; } static int inode_set_i_sectors_dirty(struct bch_inode_info *inode, @@ -208,7 +266,7 @@ struct bchfs_extent_trans_hook { bool need_inode_update; }; -static enum extent_insert_hook_ret +static enum btree_insert_ret bchfs_extent_update_hook(struct extent_insert_hook *hook, struct bpos committed_pos, struct bpos next_pos, @@ -224,6 +282,10 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook, u64 offset = min(next_pos.offset << 9, h->op->new_i_size); bool do_pack = false; + if (h->op->unalloc && + !bch2_extent_is_fully_allocated(k)) + return BTREE_INSERT_ENOSPC; + BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE)); /* XXX: inode->i_size locking */ @@ -232,7 +294,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook, if (!h->need_inode_update) { h->need_inode_update = true; - return BTREE_HOOK_RESTART_TRANS; + return BTREE_INSERT_NEED_TRAVERSE; } h->inode_u.bi_size = offset; @@ -247,7 +309,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook, if (sectors) { if (!h->need_inode_update) { h->need_inode_update = true; - return BTREE_HOOK_RESTART_TRANS; + return BTREE_INSERT_NEED_TRAVERSE; } h->inode_u.bi_sectors += sectors; @@ -267,7 +329,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook, if (do_pack) bch2_inode_pack(&h->inode_p, &h->inode_u); - return BTREE_HOOK_DO_INSERT; + return BTREE_INSERT_OK; } static int bchfs_write_index_update(struct bch_write_op *wop) @@ -352,12 +414,16 @@ static int bchfs_write_index_update(struct bch_write_op *wop) BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC, BTREE_INSERT_ENTRY(&extent_iter, k)); } + + BUG_ON(bkey_cmp(extent_iter.pos, bkey_start_pos(&k->k))); + BUG_ON(!ret != !k->k.size); err: if (ret == -EINTR) continue; if (ret) break; + BUG_ON(bkey_cmp(extent_iter.pos, k->k.p) < 0); bch2_keylist_pop_front(keys); } while (!bch2_keylist_empty(keys)); @@ -748,8 +814,7 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, if 
(bkey_extent_is_allocation(k.k)) bch2_add_page_sectors(bio, k); - if (!bkey_extent_is_allocation(k.k) || - bkey_extent_is_compressed(k)) + if (!bch2_extent_is_fully_allocated(k)) bch2_mark_pages_unalloc(bio); if (pick.ca) { @@ -759,7 +824,8 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, trace_read_split(&rbio->bio); } - bch2_read_extent(c, rbio, k, &pick, flags); + bch2_read_extent(c, rbio, bkey_s_c_to_extent(k), + &pick, flags); } else { zero_fill_bio(bio); @@ -963,22 +1029,20 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, alloc_io: w->io = container_of(bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, - bch2_writepage_bioset), + &c->writepage_bioset), struct bch_writepage_io, op.op.wbio.bio); closure_init(&w->io->cl, NULL); - w->io->op.inode = inode; - w->io->op.sectors_added = 0; - w->io->op.is_dio = false; + bch2_fswrite_op_init(&w->io->op, inode, false); bch2_write_op_init(&w->io->op.op, c, (struct disk_reservation) { .nr_replicas = c->opts.data_replicas, }, c->fastest_devs, - inode->ei_last_dirtied, + writepoint_hashed(inode->ei_last_dirtied), POS(inum, 0), &inode->ei_journal_seq, - BCH_WRITE_THROTTLE); + 0); w->io->op.op.index_update_fn = bchfs_write_index_update; } @@ -1409,7 +1473,7 @@ static int bch2_direct_IO_read(struct bch_fs *c, struct kiocb *req, bio = bio_alloc_bioset(GFP_KERNEL, iov_iter_npages(iter, BIO_MAX_PAGES), - bch2_dio_read_bioset); + &c->dio_read_bioset); bio->bi_end_io = bch2_direct_IO_read_endio; @@ -1541,20 +1605,19 @@ static void bch2_do_direct_IO_write(struct dio_write *dio) return; } - dio->iop.inode = inode; dio->iop.sectors_added = 0; - dio->iop.is_dio = true; - dio->iop.new_i_size = U64_MAX; bch2_write_op_init(&dio->iop.op, dio->c, dio->res, dio->c->fastest_devs, - (unsigned long) dio->task, + writepoint_hashed((unsigned long) dio->task), POS(inode->v.i_ino, (dio->offset + dio->written) >> 9), &inode->ei_journal_seq, - flags|BCH_WRITE_THROTTLE); + flags); dio->iop.op.index_update_fn = bchfs_write_index_update; - dio->res.sectors -= bio_sectors(bio); - dio->iop.op.res.sectors = bio_sectors(bio); + if (!dio->iop.unalloc) { + dio->res.sectors -= bio_sectors(bio); + dio->iop.op.res.sectors = bio_sectors(bio); + } task_io_account_write(bio->bi_iter.bi_size); @@ -1589,6 +1652,31 @@ static void bch2_dio_write_loop_async(struct closure *cl) } } +static int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, + u64 size) +{ + struct btree_iter iter; + struct bpos end = pos; + struct bkey_s_c k; + int ret = 0; + + end.offset += size; + + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos, + BTREE_ITER_WITH_HOLES, k) { + if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) + break; + + if (!bch2_extent_is_fully_allocated(k)) { + ret = -ENOSPC; + break; + } + } + bch2_btree_iter_unlock(&iter); + + return ret; +} + static int bch2_direct_IO_write(struct bch_fs *c, struct kiocb *req, struct file *file, struct bch_inode_info *inode, @@ -1610,17 +1698,18 @@ static int bch2_direct_IO_write(struct bch_fs *c, bio = bio_alloc_bioset(GFP_KERNEL, iov_iter_npages(iter, BIO_MAX_PAGES), - bch2_dio_write_bioset); + &c->dio_write_bioset); dio = container_of(bio, struct dio_write, iop.op.wbio.bio); - dio->req = req; - dio->c = c; - dio->written = 0; - dio->error = 0; - dio->offset = offset; - dio->iovec = NULL; - dio->iter = *iter; - dio->task = current; closure_init(&dio->cl, NULL); + dio->req = req; + dio->c = c; + dio->written = 0; + dio->error = 0; + dio->offset = offset; + dio->iovec = NULL; + dio->iter = *iter; + dio->task = current; + 
bch2_fswrite_op_init(&dio->iop, inode, true); if (offset + iter->count > inode->v.i_size) sync = true; @@ -1635,9 +1724,15 @@ static int bch2_direct_IO_write(struct bch_fs *c, */ ret = bch2_disk_reservation_get(c, &dio->res, iter->count >> 9, 0); if (unlikely(ret)) { - closure_debug_destroy(&dio->cl); - bio_put(bio); - return ret; + if (bch2_check_range_allocated(c, POS(inode->v.i_ino, + offset >> 9), + iter->count >> 9)) { + closure_debug_destroy(&dio->cl); + bio_put(bio); + return ret; + } + + dio->iop.unalloc = true; } inode_dio_begin(&inode->v); @@ -2318,7 +2413,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, reservation.v.nr_replicas = bch2_extent_nr_dirty_ptrs(k); if (reservation.v.nr_replicas < replicas || - bkey_extent_is_compressed(k)) { + bch2_extent_is_compressed(k)) { ret = bch2_disk_reservation_get(c, &disk_res, sectors, 0); if (ret) @@ -2564,4 +2659,24 @@ loff_t bch2_llseek(struct file *file, loff_t offset, int whence) return -EINVAL; } +void bch2_fs_fsio_exit(struct bch_fs *c) +{ + bioset_exit(&c->dio_write_bioset); + bioset_exit(&c->dio_read_bioset); + bioset_exit(&c->writepage_bioset); +} + +int bch2_fs_fsio_init(struct bch_fs *c) +{ + if (bioset_init(&c->writepage_bioset, + 4, offsetof(struct bch_writepage_io, op.op.wbio.bio)) || + bioset_init(&c->dio_read_bioset, + 4, offsetof(struct dio_read, rbio.bio)) || + bioset_init(&c->dio_write_bioset, + 4, offsetof(struct dio_write, iop.op.wbio.bio))) + return -ENOMEM; + + return 0; +} + #endif /* NO_BCACHEFS_FS */ diff --git a/libbcachefs/fs-io.h b/libbcachefs/fs-io.h index 505cea7..30d1ea9 100644 --- a/libbcachefs/fs-io.h +++ b/libbcachefs/fs-io.h @@ -1,7 +1,11 @@ #ifndef _BCACHEFS_FS_IO_H #define _BCACHEFS_FS_IO_H +#ifndef NO_BCACHEFS_FS + #include "buckets.h" +#include "io_types.h" + #include int bch2_set_page_dirty(struct page *); @@ -35,60 +39,11 @@ int bch2_releasepage(struct page *, gfp_t); int bch2_migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); -struct i_sectors_hook { - struct extent_insert_hook hook; - s64 sectors; - struct bch_inode_info *inode; -}; - -struct bchfs_write_op { - struct bch_inode_info *inode; - s64 sectors_added; - bool is_dio; - u64 new_i_size; - - /* must be last: */ - struct bch_write_op op; -}; - -struct bch_writepage_io { - struct closure cl; - - /* must be last: */ - struct bchfs_write_op op; -}; - -extern struct bio_set *bch2_writepage_bioset; - -struct dio_write { - struct closure cl; - struct kiocb *req; - struct bch_fs *c; - long written; - long error; - loff_t offset; - - struct disk_reservation res; - - struct iovec *iovec; - struct iovec inline_vecs[UIO_FASTIOV]; - struct iov_iter iter; - - struct task_struct *task; - - /* must be last: */ - struct bchfs_write_op iop; -}; - -extern struct bio_set *bch2_dio_write_bioset; - -struct dio_read { - struct closure cl; - struct kiocb *req; - long ret; - struct bch_read_bio rbio; -}; - -extern struct bio_set *bch2_dio_read_bioset; +void bch2_fs_fsio_exit(struct bch_fs *); +int bch2_fs_fsio_init(struct bch_fs *); +#else +static inline void bch2_fs_fsio_exit(struct bch_fs *c) {} +static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; } +#endif #endif /* _BCACHEFS_FS_IO_H */ diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 081ae14..43688cd 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -654,17 +654,17 @@ static int bch2_fill_extent(struct fiemap_extent_info *info, if (bkey_extent_is_data(&k->k)) { struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); const struct 
bch_extent_ptr *ptr; - const union bch_extent_crc *crc; + struct bch_extent_crc_unpacked crc; int ret; extent_for_each_ptr_crc(e, ptr, crc) { int flags2 = 0; u64 offset = ptr->offset; - if (crc_compression_type(crc)) + if (crc.compression_type) flags2 |= FIEMAP_EXTENT_ENCODED; else - offset += crc_offset(crc); + offset += crc.offset; if ((offset & (PAGE_SECTORS - 1)) || (e.k->size & (PAGE_SECTORS - 1))) @@ -1336,12 +1336,6 @@ MODULE_ALIAS_FS("bcachefs"); void bch2_vfs_exit(void) { unregister_filesystem(&bcache_fs_type); - if (bch2_dio_write_bioset) - bioset_free(bch2_dio_write_bioset); - if (bch2_dio_read_bioset) - bioset_free(bch2_dio_read_bioset); - if (bch2_writepage_bioset) - bioset_free(bch2_writepage_bioset); if (bch2_inode_cache) kmem_cache_destroy(bch2_inode_cache); } @@ -1354,20 +1348,6 @@ int __init bch2_vfs_init(void) if (!bch2_inode_cache) goto err; - bch2_writepage_bioset = - bioset_create(4, offsetof(struct bch_writepage_io, op.op.wbio.bio)); - if (!bch2_writepage_bioset) - goto err; - - bch2_dio_read_bioset = bioset_create(4, offsetof(struct dio_read, rbio.bio)); - if (!bch2_dio_read_bioset) - goto err; - - bch2_dio_write_bioset = - bioset_create(4, offsetof(struct dio_write, iop.op.wbio.bio)); - if (!bch2_dio_write_bioset) - goto err; - ret = register_filesystem(&bcache_fs_type); if (ret) goto err; diff --git a/libbcachefs/io.c b/libbcachefs/io.c index e5fc72d..0c41e41 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -29,6 +29,29 @@ /* Allocate, free from mempool: */ +void bch2_latency_acct(struct bch_dev *ca, unsigned submit_time_us, int rw) +{ + u64 now = local_clock(); + unsigned io_latency = (now >> 10) - submit_time_us; + atomic_t *latency = &ca->latency[rw]; + unsigned old, new, v = atomic_read(latency); + + do { + old = v; + + /* + * If the io latency was reasonably close to the current + * latency, skip doing the update and atomic operation - most of + * the time: + */ + if (abs((int) (old - io_latency)) < (old >> 1) && + now & ~(~0 << 5)) + break; + + new = ewma_add((u64) old, io_latency, 6); + } while ((v = atomic_cmpxchg(latency, old, new)) != old); +} + void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) { struct bio_vec *bv; @@ -63,10 +86,12 @@ pool_alloc: } void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, - size_t bytes) + size_t bytes) { bool using_mempool = false; + BUG_ON(DIV_ROUND_UP(bytes, PAGE_SIZE) > bio->bi_max_vecs); + bio->bi_iter.bi_size = bytes; while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) @@ -76,7 +101,35 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, mutex_unlock(&c->bio_bounce_pages_lock); } -/* Bios with headers */ +void bch2_bio_alloc_more_pages_pool(struct bch_fs *c, struct bio *bio, + size_t bytes) +{ + while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) { + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; + + BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs); + + bv->bv_page = alloc_page(GFP_NOIO); + if (!bv->bv_page) { + /* + * We already allocated from mempool, we can't allocate from it again + * without freeing the pages we already allocated or else we could + * deadlock: + */ + bch2_bio_free_pages_pool(c, bio); + bch2_bio_alloc_pages_pool(c, bio, bytes); + return; + } + + bv->bv_len = PAGE_SIZE; + bv->bv_offset = 0; + bio->bi_vcnt++; + } + + bio->bi_iter.bi_size = bytes; +} + +/* Writes */ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, enum bch_data_type type, @@ -137,17 +190,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, 
struct bch_fs *c, } } -/* IO errors */ - -/* Writes */ - -static struct workqueue_struct *index_update_wq(struct bch_write_op *op) -{ - return op->alloc_reserve == RESERVE_MOVINGGC - ? op->c->copygc_wq - : op->c->wq; -} - static void __bch2_write(struct closure *); static void bch2_write_done(struct closure *cl) @@ -176,7 +218,7 @@ static u64 keylist_sectors(struct keylist *keys) return ret; } -static int bch2_write_index_default(struct bch_write_op *op) +int bch2_write_index_default(struct bch_write_op *op) { struct keylist *keys = &op->insert_keys; struct btree_iter iter; @@ -202,7 +244,6 @@ static void bch2_write_index(struct closure *cl) struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; struct keylist *keys = &op->insert_keys; - unsigned i; op->flags |= BCH_WRITE_LOOPED; @@ -220,13 +261,7 @@ static void bch2_write_index(struct closure *cl) } } - for (i = 0; i < ARRAY_SIZE(op->open_buckets); i++) - if (op->open_buckets[i]) { - bch2_open_bucket_put(c, - c->open_buckets + - op->open_buckets[i]); - op->open_buckets[i] = 0; - } + bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets); if (!(op->flags & BCH_WRITE_DONE)) continue_at(cl, __bch2_write, op->io_wq); @@ -287,6 +322,8 @@ static void bch2_write_endio(struct bio *bio) struct bch_fs *c = wbio->c; struct bch_dev *ca = wbio->ca; + bch2_latency_acct(ca, wbio->submit_time_us, WRITE); + if (bch2_dev_io_err_on(bio->bi_error, ca, "data write")) { set_bit(ca->dev_idx, op->failed.d); set_closure_fn(cl, bch2_write_io_error, index_update_wq(op)); @@ -307,179 +344,364 @@ static void bch2_write_endio(struct bio *bio) closure_put(cl); } -static struct nonce extent_nonce(struct bversion version, - unsigned nonce, - unsigned uncompressed_size, - unsigned compression_type) -{ - return (struct nonce) {{ - [0] = cpu_to_le32((nonce << 12) | - (uncompressed_size << 22)), - [1] = cpu_to_le32(version.lo), - [2] = cpu_to_le32(version.lo >> 32), - [3] = cpu_to_le32(version.hi| - (compression_type << 24))^BCH_NONCE_EXTENT, - }}; -} - static void init_append_extent(struct bch_write_op *op, - unsigned compressed_size, - unsigned uncompressed_size, - unsigned compression_type, - unsigned nonce, - struct bch_csum csum, unsigned csum_type, - struct open_bucket *ob) + struct write_point *wp, + struct bversion version, + struct bch_extent_crc_unpacked crc) { struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top); - op->pos.offset += uncompressed_size; + op->pos.offset += crc.uncompressed_size; e->k.p = op->pos; - e->k.size = uncompressed_size; - e->k.version = op->version; + e->k.size = crc.uncompressed_size; + e->k.version = version; bkey_extent_set_cached(&e->k, op->flags & BCH_WRITE_CACHED); - bch2_extent_crc_append(e, compressed_size, - uncompressed_size, - compression_type, - nonce, csum, csum_type); - - bch2_alloc_sectors_append_ptrs(op->c, e, op->nr_replicas, - ob, compressed_size); + bch2_extent_crc_append(e, crc); + bch2_alloc_sectors_append_ptrs(op->c, wp, e, crc.compressed_size); bkey_extent_set_cached(&e->k, (op->flags & BCH_WRITE_CACHED)); bch2_keylist_push(&op->insert_keys); } -static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) +static struct bio *bch2_write_bio_alloc(struct bch_fs *c, + struct write_point *wp, + struct bio *src, + bool *page_alloc_failed) { - struct bch_fs *c = op->c; - struct bio *orig = &op->wbio.bio; - struct bio *bio; struct bch_write_bio *wbio; - unsigned key_to_write_offset = op->insert_keys.top_p - - op->insert_keys.keys_p; - struct 
bkey_i *key_to_write; - unsigned csum_type = op->csum_type; - unsigned compression_type = op->compression_type; - int ret, more; + struct bio *bio; + unsigned output_available = + min(wp->sectors_free << 9, src->bi_iter.bi_size); + unsigned pages = DIV_ROUND_UP(output_available, PAGE_SIZE); + + bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write); + wbio = wbio_init(bio); + wbio->bounce = true; + wbio->put_bio = true; + /* copy WRITE_SYNC flag */ + wbio->bio.bi_opf = src->bi_opf; + + /* + * We can't use mempool for more than c->sb.encoded_extent_max + * worth of pages, but we'd like to allocate more if we can: + */ + while (bio->bi_iter.bi_size < output_available) { + unsigned len = min_t(unsigned, PAGE_SIZE, + output_available - bio->bi_iter.bi_size); + struct page *p; + + p = alloc_page(GFP_NOIO); + if (!p) { + unsigned pool_max = + min_t(unsigned, output_available, + c->sb.encoded_extent_max << 9); + + if (bio_sectors(bio) < pool_max) + bch2_bio_alloc_pages_pool(c, bio, pool_max); + break; + } + + bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) { + .bv_page = p, + .bv_len = len, + .bv_offset = 0, + }; + bio->bi_iter.bi_size += len; + } - /* don't refetch csum type/compression type */ - barrier(); + *page_alloc_failed = bio->bi_vcnt < pages; + return bio; +} + +static int bch2_write_rechecksum(struct bch_fs *c, + struct bch_write_op *op, + unsigned new_csum_type) +{ + struct bio *bio = &op->wbio.bio; + struct bch_extent_crc_unpacked new_crc; + int ret; - BUG_ON(!bio_sectors(orig)); + /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ + + if (bch2_csum_type_is_encryption(op->crc.csum_type) != + bch2_csum_type_is_encryption(new_csum_type)) + new_csum_type = op->crc.csum_type; + + ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, + NULL, &new_crc, + op->crc.offset, op->crc.live_size, + new_csum_type); + if (ret) + return ret; + + bio_advance(bio, op->crc.offset << 9); + bio->bi_iter.bi_size = op->crc.live_size << 9; + op->crc = new_crc; + return 0; +} + +static int bch2_write_decrypt(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct nonce nonce = extent_nonce(op->version, op->crc); + struct bch_csum csum; + + if (!bch2_csum_type_is_encryption(op->crc.csum_type)) + return 0; + + /* + * If we need to decrypt data in the write path, we'll no longer be able + * to verify the existing checksum (poly1305 mac, in this case) after + * it's decrypted - this is the last point we'll be able to reverify the + * checksum: + */ + csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); + if (bch2_crc_cmp(op->crc.csum, csum)) + return -EIO; + + bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); + op->crc.csum_type = 0; + op->crc.csum = (struct bch_csum) { 0, 0 }; + return 0; +} + +static enum prep_encoded_ret { + PREP_ENCODED_OK, + PREP_ENCODED_ERR, + PREP_ENCODED_CHECKSUM_ERR, + PREP_ENCODED_DO_WRITE, +} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) +{ + struct bch_fs *c = op->c; + struct bio *bio = &op->wbio.bio; - /* Need to decompress data? 
*/ - if ((op->flags & BCH_WRITE_DATA_COMPRESSED) && - (crc_uncompressed_size(NULL, &op->crc) != op->size || - crc_compressed_size(NULL, &op->crc) > wp->sectors_free)) { - int ret; + if (!(op->flags & BCH_WRITE_DATA_ENCODED)) + return PREP_ENCODED_OK; - ret = bch2_bio_uncompress_inplace(c, orig, op->size, op->crc); - if (ret) - return ret; + BUG_ON(bio_sectors(bio) != op->crc.compressed_size); - op->flags &= ~BCH_WRITE_DATA_COMPRESSED; + /* Can we just write the entire extent as is? */ + if (op->crc.uncompressed_size == op->crc.live_size && + op->crc.compressed_size <= wp->sectors_free && + op->crc.compression_type == op->compression_type) { + if (!op->crc.compression_type && + op->csum_type != op->crc.csum_type && + bch2_write_rechecksum(c, op, op->csum_type)) + return PREP_ENCODED_CHECKSUM_ERR; + + return PREP_ENCODED_DO_WRITE; } - if (op->flags & BCH_WRITE_DATA_COMPRESSED) { - init_append_extent(op, - crc_compressed_size(NULL, &op->crc), - crc_uncompressed_size(NULL, &op->crc), - op->crc.compression_type, - op->crc.nonce, - op->crc.csum, - op->crc.csum_type, - wp->ob); - - bio = orig; - wbio = wbio_init(bio); - more = 0; - } else if (csum_type != BCH_CSUM_NONE || - compression_type != BCH_COMPRESSION_NONE) { - /* all units here in bytes */ - unsigned total_output = 0, output_available = - min(wp->sectors_free << 9, orig->bi_iter.bi_size); - unsigned crc_nonce = bch2_csum_type_is_encryption(csum_type) - ? op->nonce : 0; + /* + * If the data is compressed and we couldn't write the entire extent as + * is, we have to decompress it: + */ + if (op->crc.compression_type) { struct bch_csum csum; - struct nonce nonce; - bio = bio_alloc_bioset(GFP_NOIO, - DIV_ROUND_UP(output_available, PAGE_SIZE), - &c->bio_write); - wbio = wbio_init(bio); - wbio->bounce = true; - wbio->put_bio = true; - /* copy WRITE_SYNC flag */ - wbio->bio.bi_opf = orig->bi_opf; + if (bch2_write_decrypt(op)) + return PREP_ENCODED_CHECKSUM_ERR; - /* - * XXX: can't use mempool for more than - * BCH_COMPRESSED_EXTENT_MAX worth of pages - */ - bch2_bio_alloc_pages_pool(c, bio, output_available); + /* Last point we can still verify checksum: */ + csum = bch2_checksum_bio(c, op->crc.csum_type, + extent_nonce(op->version, op->crc), + bio); + if (bch2_crc_cmp(op->crc.csum, csum)) + return PREP_ENCODED_CHECKSUM_ERR; - do { - unsigned fragment_compression_type = compression_type; - size_t dst_len, src_len; + if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) + return PREP_ENCODED_ERR; + } - bch2_bio_compress(c, bio, &dst_len, - orig, &src_len, - &fragment_compression_type); + /* + * No longer have compressed data after this point - data might be + * encrypted: + */ - nonce = extent_nonce(op->version, - crc_nonce, - src_len >> 9, - fragment_compression_type); + /* + * If the data is checksummed and we're only writing a subset, + * rechecksum and adjust bio to point to currently live data: + */ + if ((op->crc.live_size != op->crc.uncompressed_size || + op->crc.csum_type != op->csum_type) && + bch2_write_rechecksum(c, op, op->csum_type)) + return PREP_ENCODED_CHECKSUM_ERR; - swap(bio->bi_iter.bi_size, dst_len); - bch2_encrypt_bio(c, csum_type, nonce, bio); + /* + * If we want to compress the data, it has to be decrypted: + */ + if ((op->compression_type || + bch2_csum_type_is_encryption(op->crc.csum_type) != + bch2_csum_type_is_encryption(op->csum_type)) && + bch2_write_decrypt(op)) + return PREP_ENCODED_CHECKSUM_ERR; - csum = bch2_checksum_bio(c, csum_type, nonce, bio); - swap(bio->bi_iter.bi_size, dst_len); + return PREP_ENCODED_OK; +} - 
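/*
 * Illustrative sketch, not part of the patch: bch2_write_prep_encoded_data()
 * above decides what to do with data that arrives already checksummed and/or
 * compressed (BCH_WRITE_DATA_ENCODED).  Roughly: write the encoded extent out
 * unchanged when it is fully live, still fits and matches the wanted
 * compression type; otherwise verify the existing checksum and decompress, or
 * re-checksum the live part.  The toy decision function below mirrors only
 * that shape; struct enc, its field names and the enum values are invented
 * for the example, not bcachefs types.
 */
#include <stdbool.h>
#include <stdio.h>

enum prep { PREP_NORMAL, PREP_AS_IS, PREP_DECOMPRESS, PREP_RECHECKSUM };

struct enc {
	bool		encoded;	/* data already carries crc/compression */
	unsigned	live_size, uncompressed_size, compressed_size;
	unsigned	compression_type, want_compression_type;
	unsigned	csum_type, want_csum_type;
};

static enum prep prep_encoded(const struct enc *e, unsigned sectors_free)
{
	if (!e->encoded)
		return PREP_NORMAL;

	/* whole encoded extent still live, still fits, same compression? */
	if (e->live_size == e->uncompressed_size &&
	    e->compressed_size <= sectors_free &&
	    e->compression_type == e->want_compression_type)
		return PREP_AS_IS;

	/* can't reuse compressed data as is: verify, then decompress */
	if (e->compression_type)
		return PREP_DECOMPRESS;

	/* uncompressed, but partially live or wrong checksum type */
	if (e->live_size != e->uncompressed_size ||
	    e->csum_type != e->want_csum_type)
		return PREP_RECHECKSUM;

	return PREP_NORMAL;
}

int main(void)
{
	static const char *names[] =
		{ "normal", "as-is", "decompress", "rechecksum" };
	struct enc e = {
		.encoded		= true,
		.live_size		= 8,
		.uncompressed_size	= 16,
		.compressed_size	= 8,
		.compression_type	= 1,
		.want_compression_type	= 1,
		.csum_type		= 1,
		.want_csum_type		= 1,
	};

	printf("decision: %s\n", names[prep_encoded(&e, 32)]); /* decompress */
	return 0;
}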
init_append_extent(op, - dst_len >> 9, src_len >> 9, - fragment_compression_type, - crc_nonce, csum, csum_type, wp->ob); +static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) +{ + struct bch_fs *c = op->c; + struct bio *src = &op->wbio.bio, *dst = src; + struct bvec_iter saved_iter; + struct bkey_i *key_to_write; + unsigned key_to_write_offset = op->insert_keys.top_p - + op->insert_keys.keys_p; + unsigned total_output = 0; + bool bounce = false, page_alloc_failed = false; + int ret, more = 0; - total_output += dst_len; - bio_advance(bio, dst_len); - bio_advance(orig, src_len); - } while (bio->bi_iter.bi_size && - orig->bi_iter.bi_size && - !bch2_keylist_realloc(&op->insert_keys, - op->inline_keys, - ARRAY_SIZE(op->inline_keys), - BKEY_EXTENT_U64s_MAX)); + BUG_ON(!bio_sectors(src)); - BUG_ON(total_output > output_available); + switch (bch2_write_prep_encoded_data(op, wp)) { + case PREP_ENCODED_OK: + break; + case PREP_ENCODED_ERR: + ret = -EIO; + goto err; + case PREP_ENCODED_CHECKSUM_ERR: + goto csum_err; + case PREP_ENCODED_DO_WRITE: + init_append_extent(op, wp, op->version, op->crc); + goto do_write; + } - memset(&bio->bi_iter, 0, sizeof(bio->bi_iter)); - bio->bi_iter.bi_size = total_output; + if (op->compression_type || + (op->csum_type && + !(op->flags & BCH_WRITE_PAGES_STABLE)) || + (bch2_csum_type_is_encryption(op->csum_type) && + !(op->flags & BCH_WRITE_PAGES_OWNED))) { + dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed); + bounce = true; + } - /* - * Free unneeded pages after compressing: - */ - while (bio->bi_vcnt * PAGE_SIZE > - round_up(bio->bi_iter.bi_size, PAGE_SIZE)) - mempool_free(bio->bi_io_vec[--bio->bi_vcnt].bv_page, - &c->bio_bounce_pages); + saved_iter = dst->bi_iter; - more = orig->bi_iter.bi_size != 0; - } else { - bio = bio_next_split(orig, wp->sectors_free, GFP_NOIO, - &c->bio_write); - wbio = wbio_init(bio); - wbio->put_bio = bio != orig; + do { + struct bch_extent_crc_unpacked crc = + (struct bch_extent_crc_unpacked) { 0 }; + struct bversion version = op->version; + size_t dst_len, src_len; + + if (page_alloc_failed && + bio_sectors(dst) < wp->sectors_free && + bio_sectors(dst) < c->sb.encoded_extent_max) + break; - init_append_extent(op, bio_sectors(bio), bio_sectors(bio), - compression_type, 0, - (struct bch_csum) { 0 }, csum_type, wp->ob); + BUG_ON(op->compression_type && + (op->flags & BCH_WRITE_DATA_ENCODED) && + bch2_csum_type_is_encryption(op->crc.csum_type)); + BUG_ON(op->compression_type && !bounce); + + crc.compression_type = op->compression_type + ? 
bch2_bio_compress(c, dst, &dst_len, src, &src_len, + op->compression_type) + : 0; + if (!crc.compression_type) { + dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); + dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); + + if (op->csum_type) + dst_len = min_t(unsigned, dst_len, + c->sb.encoded_extent_max << 9); + + if (bounce) { + swap(dst->bi_iter.bi_size, dst_len); + bio_copy_data(dst, src); + swap(dst->bi_iter.bi_size, dst_len); + } - more = bio != orig; + src_len = dst_len; + } + + BUG_ON(!src_len || !dst_len); + + if (bch2_csum_type_is_encryption(op->csum_type)) { + if (bversion_zero(version)) { + version.lo = atomic64_inc_return(&c->key_version) + 1; + } else { + crc.nonce = op->nonce; + op->nonce += src_len >> 9; + } + } + + if ((op->flags & BCH_WRITE_DATA_ENCODED) && + !crc.compression_type && + bch2_csum_type_is_encryption(op->crc.csum_type) == + bch2_csum_type_is_encryption(op->csum_type)) { + /* + * Note: when we're using rechecksum(), we need to be + * checksumming @src because it has all the data our + * existing checksum covers - if we bounced (because we + * were trying to compress), @dst will only have the + * part of the data the new checksum will cover. + * + * But normally we want to be checksumming post bounce, + * because part of the reason for bouncing is so the + * data can't be modified (by userspace) while it's in + * flight. + */ + if (bch2_rechecksum_bio(c, src, version, op->crc, + &crc, &op->crc, + src_len >> 9, + bio_sectors(src) - (src_len >> 9), + op->csum_type)) + goto csum_err; + } else { + if ((op->flags & BCH_WRITE_DATA_ENCODED) && + bch2_rechecksum_bio(c, src, version, op->crc, + NULL, &op->crc, + src_len >> 9, + bio_sectors(src) - (src_len >> 9), + op->crc.csum_type)) + goto csum_err; + + crc.compressed_size = dst_len >> 9; + crc.uncompressed_size = src_len >> 9; + crc.live_size = src_len >> 9; + + swap(dst->bi_iter.bi_size, dst_len); + bch2_encrypt_bio(c, op->csum_type, + extent_nonce(version, crc), dst); + crc.csum = bch2_checksum_bio(c, op->csum_type, + extent_nonce(version, crc), dst); + crc.csum_type = op->csum_type; + swap(dst->bi_iter.bi_size, dst_len); + } + + init_append_extent(op, wp, version, crc); + + if (dst != src) + bio_advance(dst, dst_len); + bio_advance(src, src_len); + total_output += dst_len; + } while (dst->bi_iter.bi_size && + src->bi_iter.bi_size && + wp->sectors_free && + !bch2_keylist_realloc(&op->insert_keys, + op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_EXTENT_U64s_MAX)); + + more = src->bi_iter.bi_size != 0; + + dst->bi_iter = saved_iter; + + if (!bounce && more) { + dst = bio_split(src, total_output >> 9, + GFP_NOIO, &c->bio_write); + wbio_init(dst)->put_bio = true; } + dst->bi_iter.bi_size = total_output; + + /* Free unneeded pages after compressing: */ + if (bounce) + while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE)) + mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page, + &c->bio_bounce_pages); +do_write: /* might have done a realloc... 
*/ key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); @@ -487,30 +709,40 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write), BCH_DATA_USER); if (ret) - return ret; + goto err; - bio->bi_end_io = bch2_write_endio; - bio->bi_private = &op->cl; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + dst->bi_end_io = bch2_write_endio; + dst->bi_private = &op->cl; + bio_set_op_attrs(dst, REQ_OP_WRITE, 0); - closure_get(bio->bi_private); + closure_get(dst->bi_private); - bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER, + bch2_submit_wbio_replicas(to_wbio(dst), c, BCH_DATA_USER, key_to_write); return more; +csum_err: + bch_err(c, "error verifying existing checksum while " + "rewriting existing data (memory corruption?)"); + ret = -EIO; +err: + if (bounce) { + bch2_bio_free_pages_pool(c, dst); + bio_put(dst); + } + + return ret; } static void __bch2_write(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; - unsigned open_bucket_nr = 0; struct write_point *wp; - struct open_bucket *ob; int ret; do { - if (open_bucket_nr == ARRAY_SIZE(op->open_buckets)) + if (op->open_buckets_nr + op->nr_replicas > + ARRAY_SIZE(op->open_buckets)) continue_at(cl, bch2_write_index, index_update_wq(op)); /* for the device pointers and 1 for the chksum */ @@ -520,11 +752,12 @@ static void __bch2_write(struct closure *cl) BKEY_EXTENT_U64s_MAX)) continue_at(cl, bch2_write_index, index_update_wq(op)); - wp = bch2_alloc_sectors_start(c, BCH_DATA_USER, + wp = bch2_alloc_sectors_start(c, op->devs, op->write_point, + &op->devs_have, op->nr_replicas, - c->opts.data_replicas_required, + op->nr_replicas_required, op->alloc_reserve, op->flags, (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? 
NULL : cl); @@ -565,14 +798,13 @@ static void __bch2_write(struct closure *cl) continue; } - ob = wp->ob; - - BUG_ON(ob - c->open_buckets == 0 || - ob - c->open_buckets > U8_MAX); - op->open_buckets[open_bucket_nr++] = ob - c->open_buckets; - ret = bch2_write_extent(op, wp); + BUG_ON(op->open_buckets_nr + wp->nr_ptrs_can_use > + ARRAY_SIZE(op->open_buckets)); + bch2_open_bucket_get(c, wp, + &op->open_buckets_nr, + op->open_buckets); bch2_alloc_sectors_done(c, wp); if (ret < 0) @@ -603,30 +835,6 @@ err: : bch2_write_done, index_update_wq(op)); } -void bch2_wake_delayed_writes(unsigned long data) -{ - struct bch_fs *c = (void *) data; - struct bch_write_op *op; - unsigned long flags; - - spin_lock_irqsave(&c->foreground_write_pd_lock, flags); - - while ((op = c->write_wait_head)) { - if (time_after(op->expires, jiffies)) { - mod_timer(&c->foreground_write_wakeup, op->expires); - break; - } - - c->write_wait_head = op->next; - if (!c->write_wait_head) - c->write_wait_tail = NULL; - - closure_put(&op->cl); - } - - spin_unlock_irqrestore(&c->foreground_write_pd_lock, flags); -} - /** * bch_write - handle a write to a cache device or flash only volume * @@ -646,9 +854,17 @@ void bch2_wake_delayed_writes(unsigned long data) void bch2_write(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bio *bio = &op->wbio.bio; struct bch_fs *c = op->c; - u64 inode = op->pos.inode; + + BUG_ON(!op->nr_replicas); + BUG_ON(!op->write_point.v); + BUG_ON(!bkey_cmp(op->pos, POS_MAX)); + BUG_ON(bio_sectors(&op->wbio.bio) > U16_MAX); + + memset(&op->failed, 0, sizeof(op->failed)); + + bch2_keylist_init(&op->insert_keys, op->inline_keys); + wbio_init(&op->wbio.bio)->put_bio = false; if (c->opts.nochanges || !percpu_ref_tryget(&c->writes)) { @@ -658,102 +874,11 @@ void bch2_write(struct closure *cl) closure_return(cl); } - if (bversion_zero(op->version) && - bch2_csum_type_is_encryption(op->csum_type)) - op->version.lo = - atomic64_inc_return(&c->key_version) + 1; - - bch2_increment_clock(c, bio_sectors(bio), WRITE); - - /* Don't call bch2_next_delay() if rate is >= 1 GB/sec */ - - if ((op->flags & BCH_WRITE_THROTTLE) && - c->foreground_write_ratelimit_enabled && - c->foreground_write_pd.rate.rate < (1 << 30)) { - unsigned long flags; - u64 delay; - - spin_lock_irqsave(&c->foreground_write_pd_lock, flags); - bch2_ratelimit_increment(&c->foreground_write_pd.rate, - bio->bi_iter.bi_size); - - delay = bch2_ratelimit_delay(&c->foreground_write_pd.rate); - - if (delay >= HZ / 100) { - trace_write_throttle(c, inode, bio, delay); - - closure_get(&op->cl); /* list takes a ref */ - - op->expires = jiffies + delay; - op->next = NULL; - - if (c->write_wait_tail) - c->write_wait_tail->next = op; - else - c->write_wait_head = op; - c->write_wait_tail = op; - - if (!timer_pending(&c->foreground_write_wakeup)) - mod_timer(&c->foreground_write_wakeup, - op->expires); - - spin_unlock_irqrestore(&c->foreground_write_pd_lock, - flags); - continue_at(cl, __bch2_write, index_update_wq(op)); - } - - spin_unlock_irqrestore(&c->foreground_write_pd_lock, flags); - } + bch2_increment_clock(c, bio_sectors(&op->wbio.bio), WRITE); continue_at_nobarrier(cl, __bch2_write, NULL); } -void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, - struct disk_reservation res, - struct bch_devs_mask *devs, - unsigned long write_point, - struct bpos pos, - u64 *journal_seq, unsigned flags) -{ - EBUG_ON(res.sectors && !res.nr_replicas); - - op->c = c; - op->io_wq = index_update_wq(op); - op->written = 
0; - op->error = 0; - op->flags = flags; - op->csum_type = bch2_data_checksum_type(c); - op->compression_type = - bch2_compression_opt_to_type(c->opts.compression); - op->nr_replicas = res.nr_replicas; - op->alloc_reserve = RESERVE_NONE; - op->nonce = 0; - op->pos = pos; - op->version = ZERO_VERSION; - op->res = res; - op->devs = devs; - op->write_point = write_point; - - if (journal_seq) { - op->journal_seq_p = journal_seq; - op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR; - } else { - op->journal_seq = 0; - } - - op->index_update_fn = bch2_write_index_default; - - memset(op->open_buckets, 0, sizeof(op->open_buckets)); - memset(&op->failed, 0, sizeof(op->failed)); - - bch2_keylist_init(&op->insert_keys, - op->inline_keys, - ARRAY_SIZE(op->inline_keys)); - - if (version_stress_test(c)) - get_random_bytes(&op->version, sizeof(op->version)); -} - /* Cache promotion on read */ struct promote_op { @@ -787,11 +912,20 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) trace_promote(&rbio->bio); /* we now own pages: */ + BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); swap(bio->bi_vcnt, rbio->bio.bi_vcnt); - memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, - sizeof(struct bio_vec) * bio->bi_vcnt); rbio->promote = NULL; + __bch2_write_op_init(&op->write.op, c); + + op->write.move_dev = -1; + op->write.op.devs = c->fastest_devs; + op->write.op.write_point = writepoint_hashed((unsigned long) current); + op->write.op.flags |= BCH_WRITE_ALLOC_NOWAIT; + op->write.op.flags |= BCH_WRITE_CACHED; + + bch2_migrate_write_init(&op->write, rbio); + closure_init(cl, NULL); closure_call(&op->write.op.cl, bch2_write, c->wq, cl); closure_return_with_destructor(cl, promote_done); @@ -801,57 +935,27 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) * XXX: multiple promotes can race with each other, wastefully. Keep a list of * outstanding promotes? 
*/ -static struct promote_op *promote_alloc(struct bch_fs *c, - struct bvec_iter iter, - struct bkey_s_c k, - struct extent_pick_ptr *pick, - bool read_full) +static struct promote_op *promote_alloc(struct bch_read_bio *rbio) { struct promote_op *op; struct bio *bio; - /* - * biovec needs to be big enough to hold decompressed data, if - * bch2_write_extent() has to decompress/recompress it: - */ - unsigned sectors = max_t(unsigned, k.k->size, - crc_uncompressed_size(NULL, &pick->crc)); - unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); + /* data might have to be decompressed in the write path: */ + unsigned pages = DIV_ROUND_UP(rbio->pick.crc.uncompressed_size, + PAGE_SECTORS); - op = kmalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO); + BUG_ON(!rbio->bounce); + BUG_ON(pages < rbio->bio.bi_vcnt); + + op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, + GFP_NOIO); if (!op) return NULL; bio = &op->write.op.wbio.bio; bio_init(bio, bio->bi_inline_vecs, pages); - bio->bi_iter = iter; - - if (pick->crc.compression_type) { - op->write.op.flags |= BCH_WRITE_DATA_COMPRESSED; - op->write.op.crc = pick->crc; - op->write.op.size = k.k->size; - } else if (read_full) { - /* - * Adjust bio to correspond to _live_ portion of @k - - * which might be less than what we're actually reading: - */ - bio->bi_iter.bi_size = sectors << 9; - bio_advance(bio, pick->crc.offset << 9); - BUG_ON(bio_sectors(bio) < k.k->size); - bio->bi_iter.bi_size = k.k->size << 9; - } else { - /* - * Set insert pos to correspond to what we're actually - * reading: - */ - op->write.op.pos.offset = iter.bi_sector; - } - bch2_migrate_write_init(c, &op->write, - c->fastest_devs, - k, NULL, - BCH_WRITE_ALLOC_NOWAIT| - BCH_WRITE_CACHED); - op->write.promote = true; + memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, + sizeof(struct bio_vec) * rbio->bio.bi_vcnt); return op; } @@ -863,9 +967,6 @@ static bool should_promote(struct bch_fs *c, if (!(flags & BCH_READ_MAY_PROMOTE)) return false; - if (flags & BCH_READ_IN_RETRY) - return false; - if (percpu_ref_is_dying(&c->writes)) return false; @@ -875,10 +976,20 @@ static bool should_promote(struct bch_fs *c, /* Read */ +static void bch2_read_nodecode_retry(struct bch_fs *, struct bch_read_bio *, + struct bvec_iter, u64, + struct bch_devs_mask *, unsigned); + #define READ_RETRY_AVOID 1 #define READ_RETRY 2 #define READ_ERR 3 +enum rbio_context { + RBIO_CONTEXT_NULL, + RBIO_CONTEXT_HIGHPRI, + RBIO_CONTEXT_UNBOUND, +}; + static inline struct bch_read_bio * bch2_rbio_parent(struct bch_read_bio *rbio) { @@ -887,14 +998,14 @@ bch2_rbio_parent(struct bch_read_bio *rbio) __always_inline static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, + enum rbio_context context, struct workqueue_struct *wq) { - - if (!wq || rbio->process_context) { + if (context <= rbio->context) { fn(&rbio->work); } else { rbio->work.func = fn; - rbio->process_context = true; + rbio->context = context; queue_work(wq, &rbio->work); } } @@ -932,7 +1043,7 @@ static void bch2_rbio_retry(struct work_struct *work) struct bch_fs *c = rbio->c; struct bvec_iter iter = rbio->bvec_iter; unsigned flags = rbio->flags; - u64 inode = rbio->inode; + u64 inode = rbio->pos.inode; struct bch_devs_mask avoid; trace_read_retry(&rbio->bio); @@ -942,15 +1053,24 @@ static void bch2_rbio_retry(struct work_struct *work) if (rbio->retry == READ_RETRY_AVOID) __set_bit(rbio->pick.ca->dev_idx, avoid.d); + if (rbio->promote) + kfree(rbio->promote); + rbio->promote = NULL; + if (rbio->split) rbio = bch2_rbio_free(rbio); else 
rbio->bio.bi_error = 0; - flags |= BCH_READ_MUST_CLONE; + if (!(flags & BCH_READ_NODECODE)) + flags |= BCH_READ_MUST_CLONE; flags |= BCH_READ_IN_RETRY; + flags &= ~BCH_READ_MAY_PROMOTE; - __bch2_read(c, rbio, iter, inode, &avoid, flags); + if (flags & BCH_READ_NODECODE) + bch2_read_nodecode_retry(c, rbio, iter, inode, &avoid, flags); + else + __bch2_read(c, rbio, iter, inode, &avoid, flags); } static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error) @@ -964,108 +1084,175 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error) bch2_rbio_parent(rbio)->bio.bi_error = error; bch2_rbio_done(rbio); } else { - bch2_rbio_punt(rbio, bch2_rbio_retry, rbio->c->wq); + bch2_rbio_punt(rbio, bch2_rbio_retry, + RBIO_CONTEXT_UNBOUND, system_unbound_wq); + } +} + +static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) +{ + struct bch_fs *c = rbio->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_extent *e; + BKEY_PADDED(k) new; + struct bch_extent_crc_unpacked new_crc; + unsigned offset; + int ret; + + if (rbio->pick.crc.compression_type) + return; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, rbio->pos, + BTREE_ITER_INTENT); +retry: + k = bch2_btree_iter_peek(&iter); + if (IS_ERR_OR_NULL(k.k)) + goto out; + + if (!bkey_extent_is_data(k.k)) + goto out; + + bkey_reassemble(&new.k, k); + e = bkey_i_to_extent(&new.k); + + if (!bch2_extent_matches_ptr(c, extent_i_to_s_c(e), + rbio->pick.ptr, + rbio->pos.offset - + rbio->pick.crc.offset) || + bversion_cmp(e->k.version, rbio->version)) + goto out; + + /* Extent was merged? */ + if (bkey_start_offset(&e->k) < rbio->pos.offset || + e->k.p.offset > rbio->pos.offset + rbio->pick.crc.uncompressed_size) + goto out; + + /* The extent might have been partially overwritten since we read it: */ + offset = rbio->pick.crc.offset + (bkey_start_offset(&e->k) - rbio->pos.offset); + + if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, + rbio->pick.crc, NULL, &new_crc, + offset, e->k.size, + rbio->pick.crc.csum_type)) { + bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); + goto out; } + + if (!bch2_extent_narrow_crcs(e, new_crc)) + goto out; + + ret = bch2_btree_insert_at(c, NULL, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOWAIT, + BTREE_INSERT_ENTRY(&iter, &e->k_i)); + if (ret == -EINTR) + goto retry; +out: + bch2_btree_iter_unlock(&iter); +} + +static bool should_narrow_crcs(struct bkey_s_c_extent e, + struct extent_pick_ptr *pick, + unsigned flags) +{ + return !(flags & BCH_READ_IN_RETRY) && + bch2_can_narrow_extent_crcs(e, pick->crc); } -static int bch2_rbio_checksum_uncompress(struct bio *dst, - struct bch_read_bio *rbio) +/* Inner part that may run in process context */ +static void __bch2_read_endio(struct work_struct *work) { + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); struct bch_fs *c = rbio->c; - struct bio *src = &rbio->bio; + struct bio *src = &rbio->bio, *dst = &bch2_rbio_parent(rbio)->bio; struct bvec_iter dst_iter = rbio->bvec_iter; - struct nonce nonce = extent_nonce(rbio->version, - rbio->pick.crc.nonce, - crc_uncompressed_size(NULL, &rbio->pick.crc), - rbio->pick.crc.compression_type); + struct bch_extent_crc_unpacked crc = rbio->pick.crc; + struct nonce nonce = extent_nonce(rbio->version, crc); struct bch_csum csum; - int ret = 0; - /* - * reset iterator for checksumming and copying bounced data: here we've - * set rbio->compressed_size to the amount of data we actually read, 
- * which was not necessarily the full extent if we were only bouncing - * in order to promote - */ + /* Reset iterator for checksumming and copying bounced data: */ if (rbio->bounce) { - src->bi_iter.bi_size = crc_compressed_size(NULL, &rbio->pick.crc) << 9; - src->bi_iter.bi_idx = 0; - src->bi_iter.bi_bvec_done = 0; + src->bi_iter.bi_size = crc.compressed_size << 9; + src->bi_iter.bi_idx = 0; + src->bi_iter.bi_bvec_done = 0; } else { - src->bi_iter = rbio->bvec_iter; + src->bi_iter = rbio->bvec_iter; } - csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type, nonce, src); - if (bch2_dev_io_err_on(bch2_crc_cmp(rbio->pick.crc.csum, csum), - rbio->pick.ca, - "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)", - rbio->inode, (u64) rbio->bvec_iter.bi_sector << 9, - rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, - csum.hi, csum.lo, - rbio->pick.crc.csum_type)) - ret = -EIO; + csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); + if (bch2_crc_cmp(csum, rbio->pick.crc.csum)) + goto csum_err; - /* - * If there was a checksum error, still copy the data back - unless it - * was compressed, we don't want to decompress bad data: - */ - if (rbio->pick.crc.compression_type != BCH_COMPRESSION_NONE) { - if (!ret) { - bch2_encrypt_bio(c, rbio->pick.crc.csum_type, nonce, src); - ret = bch2_bio_uncompress(c, src, dst, - dst_iter, rbio->pick.crc); - if (ret) - __bcache_io_error(c, "decompression error"); - } - } else if (rbio->bounce) { - bio_advance(src, rbio->pick.crc.offset << 9); - - /* don't need to decrypt the entire bio: */ - BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); - src->bi_iter.bi_size = dst_iter.bi_size; + if (unlikely(rbio->narrow_crcs)) + bch2_rbio_narrow_crcs(rbio); - nonce = nonce_add(nonce, rbio->pick.crc.offset << 9); + if (rbio->flags & BCH_READ_NODECODE) + goto nodecode; - bch2_encrypt_bio(c, rbio->pick.crc.csum_type, - nonce, src); + /* Adjust crc to point to subset of data we want: */ + crc.offset += rbio->bvec_iter.bi_sector - rbio->pos.offset; + crc.live_size = bvec_iter_sectors(rbio->bvec_iter); - bio_copy_data_iter(dst, &dst_iter, - src, &src->bi_iter); + if (crc.compression_type != BCH_COMPRESSION_NONE) { + bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) + goto decompression_err; } else { - bch2_encrypt_bio(c, rbio->pick.crc.csum_type, nonce, src); - } + /* don't need to decrypt the entire bio: */ + nonce = nonce_add(nonce, crc.offset << 9); + bio_advance(src, crc.offset << 9); - return ret; -} + BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); + src->bi_iter.bi_size = dst_iter.bi_size; -/* Inner part that may run in process context */ -static void __bch2_read_endio(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - int ret; + bch2_encrypt_bio(c, crc.csum_type, nonce, src); - ret = bch2_rbio_checksum_uncompress(&bch2_rbio_parent(rbio)->bio, rbio); - if (ret) { - /* - * Checksum error: if the bio wasn't bounced, we may have been - * reading into buffers owned by userspace (that userspace can - * scribble over) - retry the read, bouncing it this time: - */ - if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { - rbio->flags |= BCH_READ_MUST_BOUNCE; - bch2_rbio_error(rbio, READ_RETRY, ret); - } else { - bch2_rbio_error(rbio, READ_RETRY_AVOID, ret); + if (rbio->bounce) { + struct bvec_iter src_iter = src->bi_iter; + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); } - return; } - if (rbio->promote) + if 
(rbio->promote) { + /* + * Re encrypt data we decrypted, so it's consistent with + * rbio->crc: + */ + bch2_encrypt_bio(c, crc.csum_type, nonce, src); promote_start(rbio->promote, rbio); - + } +nodecode: if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) bch2_rbio_done(rbio); + return; +csum_err: + /* + * Checksum error: if the bio wasn't bounced, we may have been + * reading into buffers owned by userspace (that userspace can + * scribble over) - retry the read, bouncing it this time: + */ + if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { + rbio->flags |= BCH_READ_MUST_BOUNCE; + bch2_rbio_error(rbio, READ_RETRY, -EIO); + return; + } + + bch2_dev_io_error(rbio->pick.ca, + "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)", + rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, + rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, + csum.hi, csum.lo, crc.csum_type); + bch2_rbio_error(rbio, READ_RETRY_AVOID, -EIO); + return; +decompression_err: + __bcache_io_error(c, "decompression error, inode %llu offset %llu", + rbio->pos.inode, + (u64) rbio->bvec_iter.bi_sector); + bch2_rbio_error(rbio, READ_ERR, -EIO); + return; } static void bch2_read_endio(struct bio *bio) @@ -1074,6 +1261,9 @@ static void bch2_read_endio(struct bio *bio) container_of(bio, struct bch_read_bio, bio); struct bch_fs *c = rbio->c; struct workqueue_struct *wq = NULL; + enum rbio_context context = RBIO_CONTEXT_NULL; + + bch2_latency_acct(rbio->pick.ca, rbio->submit_time_us, READ); percpu_ref_put(&rbio->pick.ca->io_ref); @@ -1097,38 +1287,45 @@ static void bch2_read_endio(struct bio *bio) return; } - if (rbio->pick.crc.compression_type || + if (rbio->narrow_crcs || + rbio->pick.crc.compression_type || bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) - wq = system_unbound_wq; + context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; else if (rbio->pick.crc.csum_type) - wq = system_highpri_wq; + context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; - bch2_rbio_punt(rbio, __bch2_read_endio, wq); + bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); } int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, - struct bvec_iter iter, struct bkey_s_c k, + struct bvec_iter iter, struct bkey_s_c_extent e, struct extent_pick_ptr *pick, unsigned flags) { struct bch_read_bio *rbio; - struct promote_op *promote_op = NULL; - unsigned skip = iter.bi_sector - bkey_start_offset(k.k); - bool bounce = false, split, read_full = false; + bool split = false, bounce = false, read_full = false; + bool promote = false, narrow_crcs = false; + struct bpos pos = bkey_start_pos(e.k); int ret = 0; - bch2_increment_clock(c, bio_sectors(&orig->bio), READ); PTR_BUCKET(pick->ca, &pick->ptr)->prio[READ] = c->prio_clock[READ].hand; - EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector || - k.k->p.offset < bvec_iter_end_sector(iter)); + narrow_crcs = should_narrow_crcs(e, pick, flags); + + if (flags & BCH_READ_NODECODE) { + BUG_ON(iter.bi_size < pick->crc.compressed_size << 9); + iter.bi_size = pick->crc.compressed_size << 9; + goto noclone; + } + + if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) + flags |= BCH_READ_MUST_BOUNCE; + + EBUG_ON(bkey_start_offset(e.k) > iter.bi_sector || + e.k->p.offset < bvec_iter_end_sector(iter)); - /* - * note: if compression_type and crc_type both == none, then - * compressed/uncompressed size is zero - */ if (pick->crc.compression_type != BCH_COMPRESSION_NONE || (pick->crc.csum_type != BCH_CSUM_NONE && - (bvec_iter_sectors(iter) != crc_uncompressed_size(NULL, 
&pick->crc) || + (bvec_iter_sectors(iter) != pick->crc.uncompressed_size || (bch2_csum_type_is_encryption(pick->crc.csum_type) && (flags & BCH_READ_USER_MAPPED)) || (flags & BCH_READ_MUST_BOUNCE)))) { @@ -1136,17 +1333,30 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, bounce = true; } - if (should_promote(c, pick, flags)) - promote_op = promote_alloc(c, iter, k, pick, read_full); - + promote = should_promote(c, pick, flags); /* could also set read_full */ - if (promote_op) + if (promote) bounce = true; + if (!read_full) { + EBUG_ON(pick->crc.compression_type); + EBUG_ON(pick->crc.csum_type && + (bvec_iter_sectors(iter) != pick->crc.uncompressed_size || + bvec_iter_sectors(iter) != pick->crc.live_size || + pick->crc.offset || + iter.bi_sector != pos.offset)); + + pick->ptr.offset += pick->crc.offset + + (iter.bi_sector - pos.offset); + pick->crc.compressed_size = bvec_iter_sectors(iter); + pick->crc.uncompressed_size = bvec_iter_sectors(iter); + pick->crc.offset = 0; + pick->crc.live_size = bvec_iter_sectors(iter); + pos.offset = iter.bi_sector; + } + if (bounce) { - unsigned sectors = read_full - ? (crc_compressed_size(NULL, &pick->crc) ?: k.k->size) - : bvec_iter_sectors(iter); + unsigned sectors = pick->crc.compressed_size; rbio = rbio_init(bio_alloc_bioset(GFP_NOIO, DIV_ROUND_UP(sectors, PAGE_SECTORS), @@ -1163,41 +1373,38 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, * from the whole bio, in which case we don't want to retry and * lose the error) */ - rbio = rbio_init(bio_clone_fast(&orig->bio, - GFP_NOIO, &c->bio_read_split)); + rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO, + &c->bio_read_split)); rbio->bio.bi_iter = iter; split = true; } else { +noclone: rbio = orig; rbio->bio.bi_iter = iter; split = false; BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); } - rbio->c = c; + BUG_ON(bio_sectors(&rbio->bio) != pick->crc.compressed_size); + rbio->c = c; if (split) rbio->parent = orig; else rbio->end_io = orig->bio.bi_end_io; - rbio->bvec_iter = iter; + rbio->submit_time_us = local_clock_us(); rbio->flags = flags; rbio->bounce = bounce; rbio->split = split; - rbio->process_context = false; + rbio->narrow_crcs = narrow_crcs; rbio->retry = 0; + rbio->context = 0; + rbio->devs_have = bch2_extent_devs(e); rbio->pick = *pick; - /* - * crc.compressed_size will be 0 if there wasn't any checksum - * information, also we need to stash the original size of the bio if we - * bounced (which isn't necessarily the original key size, if we bounced - * only for promoting) - */ - rbio->pick.crc._compressed_size = bio_sectors(&rbio->bio) - 1; - rbio->version = k.k->version; - rbio->promote = promote_op; - rbio->inode = k.k->p.inode; + rbio->pos = pos; + rbio->version = e.k->version; + rbio->promote = promote ? 
promote_alloc(rbio) : NULL; INIT_WORK(&rbio->work, NULL); rbio->bio.bi_bdev = pick->ca->disk_sb.bdev; @@ -1205,16 +1412,10 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, rbio->bio.bi_iter.bi_sector = pick->ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; - if (read_full) - rbio->pick.crc.offset += skip; - else - rbio->bio.bi_iter.bi_sector += skip; - - rbio->submit_time_us = local_clock_us(); - if (bounce) trace_read_bounce(&rbio->bio); + bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); this_cpu_add(pick->ca->io_done->sectors[READ][BCH_DATA_USER], bio_sectors(&rbio->bio)); @@ -1223,7 +1424,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, } else { submit_bio_wait(&rbio->bio); - rbio->process_context = true; + rbio->context = RBIO_CONTEXT_UNBOUND; bch2_read_endio(&rbio->bio); ret = rbio->retry; @@ -1234,6 +1435,79 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, return ret; } +static void bch2_read_nodecode_retry(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, u64 inode, + struct bch_devs_mask *avoid, unsigned flags) +{ + struct extent_pick_ptr pick; + struct btree_iter iter; + BKEY_PADDED(k) tmp; + struct bkey_s_c k; + int ret; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, + POS(inode, bvec_iter.bi_sector), + BTREE_ITER_WITH_HOLES); +retry: + k = bch2_btree_iter_peek_with_holes(&iter); + if (btree_iter_err(k)) { + bch2_btree_iter_unlock(&iter); + goto err; + } + + bkey_reassemble(&tmp.k, k); + k = bkey_i_to_s_c(&tmp.k); + bch2_btree_iter_unlock(&iter); + + if (!bkey_extent_is_data(k.k) || + !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k), + rbio->pick.ptr, + rbio->pos.offset - + rbio->pick.crc.offset) || + bkey_start_offset(k.k) != bvec_iter.bi_sector) + goto err; + + bch2_extent_pick_ptr(c, k, avoid, &pick); + if (IS_ERR(pick.ca)) { + bcache_io_error(c, &rbio->bio, "no device to read from"); + bio_endio(&rbio->bio); + return; + } + + if (!pick.ca) + goto err; + + if (pick.crc.compressed_size > bvec_iter_sectors(bvec_iter)) { + percpu_ref_put(&pick.ca->io_ref); + goto err; + + } + + ret = __bch2_read_extent(c, rbio, bvec_iter, bkey_s_c_to_extent(k), + &pick, flags); + switch (ret) { + case READ_RETRY_AVOID: + __set_bit(pick.ca->dev_idx, avoid->d); + case READ_RETRY: + goto retry; + case READ_ERR: + bio_endio(&rbio->bio); + return; + }; + + return; +err: + /* + * extent we wanted to read no longer exists, or + * was merged or partially overwritten (and thus + * possibly bigger than the memory that was + * originally allocated) + */ + rbio->bio.bi_error = -EINTR; + bio_endio(&rbio->bio); + return; +} + void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, u64 inode, struct bch_devs_mask *avoid, unsigned flags) @@ -1241,6 +1515,8 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, struct btree_iter iter; struct bkey_s_c k; int ret; + + EBUG_ON(flags & BCH_READ_NODECODE); retry: for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(inode, bvec_iter.bi_sector), @@ -1277,7 +1553,8 @@ retry: } ret = __bch2_read_extent(c, rbio, fragment, - k, &pick, flags); + bkey_s_c_to_extent(k), + &pick, flags); switch (ret) { case READ_RETRY_AVOID: __set_bit(pick.ca->dev_idx, avoid->d); diff --git a/libbcachefs/io.h b/libbcachefs/io.h index 658c15a..bd0d7c4 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -2,6 +2,8 @@ #define _BCACHEFS_IO_H #include +#include "alloc.h" +#include "checksum.h" #include "io_types.h" #define to_wbio(_bio) \ @@ 
-12,6 +14,9 @@ void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); +void bch2_bio_alloc_more_pages_pool(struct bch_fs *, struct bio *, size_t); + +void bch2_latency_acct(struct bch_dev *, unsigned, int); void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, enum bch_data_type, const struct bkey_i *); @@ -20,14 +25,15 @@ enum bch_write_flags { BCH_WRITE_ALLOC_NOWAIT = (1 << 0), BCH_WRITE_CACHED = (1 << 1), BCH_WRITE_FLUSH = (1 << 2), - BCH_WRITE_DATA_COMPRESSED = (1 << 3), - BCH_WRITE_THROTTLE = (1 << 4), - BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 5), + BCH_WRITE_DATA_ENCODED = (1 << 3), + BCH_WRITE_PAGES_STABLE = (1 << 4), + BCH_WRITE_PAGES_OWNED = (1 << 5), + BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), /* Internal: */ - BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 6), - BCH_WRITE_DONE = (1 << 7), - BCH_WRITE_LOOPED = (1 << 8), + BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 7), + BCH_WRITE_DONE = (1 << 8), + BCH_WRITE_LOOPED = (1 << 9), }; static inline u64 *op_journal_seq(struct bch_write_op *op) @@ -36,11 +42,60 @@ static inline u64 *op_journal_seq(struct bch_write_op *op) ? op->journal_seq_p : &op->journal_seq; } -void bch2_write_op_init(struct bch_write_op *, struct bch_fs *, - struct disk_reservation, - struct bch_devs_mask *, - unsigned long, - struct bpos, u64 *, unsigned); +static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) +{ + return op->alloc_reserve == RESERVE_MOVINGGC + ? op->c->copygc_wq + : op->c->wq; +} + +int bch2_write_index_default(struct bch_write_op *); + +static inline void __bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c) +{ + op->c = c; + op->io_wq = index_update_wq(op); + op->flags = 0; + op->written = 0; + op->error = 0; + op->csum_type = bch2_data_checksum_type(c); + op->compression_type = + bch2_compression_opt_to_type(c->opts.compression); + op->nr_replicas = 0; + op->nr_replicas_required = c->opts.data_replicas_required; + op->alloc_reserve = RESERVE_NONE; + op->open_buckets_nr = 0; + op->devs_have.nr = 0; + op->pos = POS_MAX; + op->version = ZERO_VERSION; + op->devs = NULL; + op->write_point = (struct write_point_specifier) { 0 }; + op->res = (struct disk_reservation) { 0 }; + op->journal_seq = 0; + op->index_update_fn = bch2_write_index_default; +} + +static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, + struct disk_reservation res, + struct bch_devs_mask *devs, + struct write_point_specifier write_point, + struct bpos pos, + u64 *journal_seq, unsigned flags) +{ + __bch2_write_op_init(op, c); + op->flags = flags; + op->nr_replicas = res.nr_replicas; + op->pos = pos; + op->res = res; + op->devs = devs; + op->write_point = write_point; + + if (journal_seq) { + op->journal_seq_p = journal_seq; + op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR; + } +} + void bch2_write(struct closure *); static inline struct bch_write_bio *wbio_init(struct bio *bio) @@ -51,14 +106,13 @@ static inline struct bch_write_bio *wbio_init(struct bio *bio) return wbio; } -void bch2_wake_delayed_writes(unsigned long data); - struct bch_devs_mask; struct cache_promote_op; struct extent_pick_ptr; int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - struct bkey_s_c k, struct extent_pick_ptr *, unsigned); + struct bkey_s_c_extent e, struct extent_pick_ptr *, + unsigned); void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, u64, struct bch_devs_mask *, unsigned); @@ -66,21 +120,22 @@ enum bch_read_flags { 
BCH_READ_RETRY_IF_STALE = 1 << 0, BCH_READ_MAY_PROMOTE = 1 << 1, BCH_READ_USER_MAPPED = 1 << 2, + BCH_READ_NODECODE = 1 << 3, /* internal: */ - BCH_READ_MUST_BOUNCE = 1 << 3, - BCH_READ_MUST_CLONE = 1 << 4, - BCH_READ_IN_RETRY = 1 << 5, + BCH_READ_MUST_BOUNCE = 1 << 4, + BCH_READ_MUST_CLONE = 1 << 5, + BCH_READ_IN_RETRY = 1 << 6, }; static inline void bch2_read_extent(struct bch_fs *c, struct bch_read_bio *rbio, - struct bkey_s_c k, + struct bkey_s_c_extent e, struct extent_pick_ptr *pick, unsigned flags) { rbio->_state = 0; - __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, pick, flags); + __bch2_read_extent(c, rbio, rbio->bio.bi_iter, e, pick, flags); } static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h index f77106b..ed9a4bb 100644 --- a/libbcachefs/io_types.h +++ b/libbcachefs/io_types.h @@ -1,20 +1,16 @@ #ifndef _BCACHEFS_IO_TYPES_H #define _BCACHEFS_IO_TYPES_H +#include "alloc_types.h" #include "btree_types.h" #include "buckets_types.h" +#include "extents_types.h" #include "keylist_types.h" #include "super_types.h" #include #include -struct extent_pick_ptr { - struct bch_extent_crc128 crc; - struct bch_extent_ptr ptr; - struct bch_dev *ca; -}; - struct bch_read_bio { struct bch_fs *c; @@ -44,26 +40,22 @@ struct bch_read_bio { struct { u8 bounce:1, split:1, - process_context:1, - retry:2; + narrow_crcs:1, + retry:2, + context:2; }; u8 _state; }; + struct bch_devs_list devs_have; + struct extent_pick_ptr pick; + /* start pos of data we read (may not be pos of data we want) */ + struct bpos pos; struct bversion version; struct promote_op *promote; - /* - * If we have to retry the read (IO error, checksum failure, read stale - * data (raced with allocator), we retry the portion of the parent bio - * that failed (i.e. this bio's portion, bvec_iter). 
- * - * But we need to stash the inode somewhere: - */ - u64 inode; - struct work_struct work; struct bio bio; @@ -98,36 +90,33 @@ struct bch_write_op { struct bch_fs *c; struct workqueue_struct *io_wq; - unsigned written; /* sectors */ - - short error; - u16 flags; + u16 written; /* sectors */ + s8 error; + unsigned csum_type:4; unsigned compression_type:4; unsigned nr_replicas:4; + unsigned nr_replicas_required:4; unsigned alloc_reserve:4; - unsigned nonce:14; + + u8 open_buckets_nr; + struct bch_devs_list devs_have; + u16 target; + u16 nonce; struct bpos pos; struct bversion version; - /* For BCH_WRITE_DATA_COMPRESSED: */ - struct bch_extent_crc128 crc; - unsigned size; + /* For BCH_WRITE_DATA_ENCODED: */ + struct bch_extent_crc_unpacked crc; struct bch_devs_mask *devs; - unsigned long write_point; + struct write_point_specifier write_point; struct disk_reservation res; - union { u8 open_buckets[16]; - struct { - struct bch_write_op *next; - unsigned long expires; - }; - }; /* * If caller wants to flush but hasn't passed us a journal_seq ptr, we diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 37b342b..5d9a298 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -464,7 +464,8 @@ static int journal_validate_key(struct bch_fs *c, struct jset *j, if (invalid) { bch2_bkey_val_to_text(c, key_type, buf, sizeof(buf), bkey_i_to_s_c(k)); - mustfix_fsck_err(c, "invalid %s in journal: %s", type, buf); + mustfix_fsck_err(c, "invalid %s in journal: %s\n%s", + type, invalid, buf); le16_add_cpu(&entry->u64s, -k->k.u64s); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); @@ -1568,35 +1569,31 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); swap(new_buckets, ja->buckets); swap(new_bucket_seq, ja->bucket_seq); + spin_unlock(&j->lock); while (ja->nr < nr) { - /* must happen under journal lock, to avoid racing with gc: */ - long b = bch2_bucket_alloc(c, ca, RESERVE_ALLOC); - if (b < 0) { - if (!closure_wait(&c->freelist_wait, &cl)) { - spin_unlock(&j->lock); + struct open_bucket *ob; + size_t bucket; + int ob_idx; + + ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, &cl); + if (ob_idx < 0) { + if (!closure_wait(&c->freelist_wait, &cl)) closure_sync(&cl); - spin_lock(&j->lock); - } continue; } - bch2_mark_metadata_bucket(ca, &ca->buckets[b], - BUCKET_JOURNAL, false); - bch2_mark_alloc_bucket(ca, &ca->buckets[b], false); + ob = c->open_buckets + ob_idx; + bucket = sector_to_bucket(ca, ob->ptr.offset); - memmove(ja->buckets + ja->last_idx + 1, - ja->buckets + ja->last_idx, - (ja->nr - ja->last_idx) * sizeof(u64)); - memmove(ja->bucket_seq + ja->last_idx + 1, - ja->bucket_seq + ja->last_idx, - (ja->nr - ja->last_idx) * sizeof(u64)); - memmove(journal_buckets->buckets + ja->last_idx + 1, - journal_buckets->buckets + ja->last_idx, - (ja->nr - ja->last_idx) * sizeof(u64)); + spin_lock(&j->lock); + __array_insert_item(ja->buckets, ja->nr, ja->last_idx); + __array_insert_item(ja->bucket_seq, ja->nr, ja->last_idx); + __array_insert_item(journal_buckets->buckets, ja->nr, ja->last_idx); - ja->buckets[ja->last_idx] = b; - journal_buckets->buckets[ja->last_idx] = cpu_to_le64(b); + ja->buckets[ja->last_idx] = bucket; + ja->bucket_seq[ja->last_idx] = 0; + journal_buckets->buckets[ja->last_idx] = cpu_to_le64(bucket); if (ja->last_idx < ja->nr) { if (ja->cur_idx >= ja->last_idx) @@ -1604,9 +1601,14 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, 
ja->last_idx++; } ja->nr++; + spin_unlock(&j->lock); + + bch2_mark_metadata_bucket(c, ca, &ca->buckets[bucket], + BUCKET_JOURNAL, + gc_phase(GC_PHASE_SB), 0); + bch2_open_bucket_put(c, ob); } - spin_unlock(&j->lock); BUG_ON(bch2_sb_validate_journal(ca->disk_sb.sb, ca->mi)); @@ -1623,6 +1625,8 @@ err: if (!ret) bch2_dev_allocator_add(c, ca); + closure_sync(&cl); + return ret; } diff --git a/libbcachefs/keylist.h b/libbcachefs/keylist.h index ea65f8e..b7c8a86 100644 --- a/libbcachefs/keylist.h +++ b/libbcachefs/keylist.h @@ -7,8 +7,7 @@ int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t); void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *); void bch2_keylist_pop_front(struct keylist *); -static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys, - size_t nr_inline_u64s) +static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys) { l->top_p = l->keys_p = inline_keys; } @@ -17,7 +16,7 @@ static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys) { if (l->keys_p != inline_keys) kfree(l->keys_p); - memset(l, 0, sizeof(*l)); + bch2_keylist_init(l, inline_keys); } static inline void bch2_keylist_push(struct keylist *l) diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index d7f27a3..8d1c0ee 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -13,31 +13,16 @@ #include "move.h" #include "super-io.h" -static int issue_migration_move(struct bch_dev *ca, - struct moving_context *ctxt, - struct bch_devs_mask *devs, - struct bkey_s_c k) +static bool migrate_pred(void *arg, struct bkey_s_c_extent e) { - struct bch_fs *c = ca->fs; - struct disk_reservation res; + struct bch_dev *ca = arg; const struct bch_extent_ptr *ptr; - int ret; - - if (bch2_disk_reservation_get(c, &res, k.k->size, 0)) - return -ENOSPC; - extent_for_each_ptr(bkey_s_c_to_extent(k), ptr) + extent_for_each_ptr(e, ptr) if (ptr->dev == ca->dev_idx) - goto found; + return true; - BUG(); -found: - /* XXX: we need to be doing something with the disk reservation */ - - ret = bch2_data_move(c, ctxt, devs, k, ptr); - if (ret) - bch2_disk_reservation_put(c, &res); - return ret; + return false; } #define MAX_DATA_OFF_ITER 10 @@ -58,10 +43,11 @@ found: int bch2_move_data_off_device(struct bch_dev *ca) { - struct moving_context ctxt; struct bch_fs *c = ca->fs; + struct btree_iter iter; + struct bkey_s_c k; + u64 keys_moved, sectors_moved; unsigned pass = 0; - u64 seen_key_count; int ret = 0; BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW); @@ -69,12 +55,6 @@ int bch2_move_data_off_device(struct bch_dev *ca) if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_USER))) return 0; - mutex_lock(&c->replicas_gc_lock); - bch2_replicas_gc_start(c, 1 << BCH_DATA_USER); - - bch2_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE); - __set_bit(ca->dev_idx, ctxt.avoid.d); - /* * In theory, only one pass should be necessary as we've * quiesced all writes before calling this. @@ -91,69 +71,43 @@ int bch2_move_data_off_device(struct bch_dev *ca) * Thus this scans the tree one more time than strictly necessary, * but that can be viewed as a verification pass. 
*/ - do { - struct btree_iter iter; - struct bkey_s_c k; - - seen_key_count = 0; - atomic_set(&ctxt.error_count, 0); - atomic_set(&ctxt.error_flags, 0); - - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, - BTREE_ITER_PREFETCH); - - while (!bch2_move_ctxt_wait(&ctxt) && - (k = bch2_btree_iter_peek(&iter)).k && - !(ret = btree_iter_err(k))) { - if (!bkey_extent_is_data(k.k) || - !bch2_extent_has_device(bkey_s_c_to_extent(k), - ca->dev_idx)) - goto next; - - ret = issue_migration_move(ca, &ctxt, NULL, k); - if (ret == -ENOMEM) { - bch2_btree_iter_unlock(&iter); - - /* - * memory allocation failure, wait for some IO - * to finish - */ - bch2_move_ctxt_wait_for_io(&ctxt); - continue; - } - if (ret == -ENOSPC) - break; - BUG_ON(ret); + ret = bch2_move_data(c, NULL, + SECTORS_IN_FLIGHT_PER_DEVICE, + NULL, + writepoint_hashed((unsigned long) current), + 0, + ca->dev_idx, + migrate_pred, ca, + &keys_moved, + §ors_moved); + if (ret) { + bch_err(c, "error migrating data: %i", ret); + return ret; + } + } while (keys_moved && pass++ < MAX_DATA_OFF_ITER); - seen_key_count++; - continue; -next: - if (bkey_extent_is_data(k.k)) { - ret = bch2_check_mark_super(c, bkey_s_c_to_extent(k), - BCH_DATA_USER); - if (ret) - break; - } - bch2_btree_iter_advance_pos(&iter); - bch2_btree_iter_cond_resched(&iter); + if (keys_moved) { + bch_err(c, "unable to migrate all data in %d iterations", + MAX_DATA_OFF_ITER); + return -1; + } - } - bch2_btree_iter_unlock(&iter); - bch2_move_ctxt_exit(&ctxt); + mutex_lock(&c->replicas_gc_lock); + bch2_replicas_gc_start(c, 1 << BCH_DATA_USER); - if (ret) - goto err; - } while (seen_key_count && pass++ < MAX_DATA_OFF_ITER); + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH, k) { + if (!bkey_extent_is_data(k.k)) + continue; - if (seen_key_count) { - pr_err("Unable to migrate all data in %d iterations.", - MAX_DATA_OFF_ITER); - ret = -1; - goto err; + ret = bch2_check_mark_super(c, bkey_s_c_to_extent(k), + BCH_DATA_USER); + if (ret) { + bch_err(c, "error migrating data %i from check_mark_super()", ret); + break; + } } -err: bch2_replicas_gc_end(c, ret); mutex_unlock(&c->replicas_gc_lock); return ret; @@ -167,14 +121,11 @@ static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca, enum btree_id id) { struct btree_iter iter; - struct closure cl; struct btree *b; int ret; BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW); - closure_init_stack(&cl); - for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 0c5b924..5eaf0cf 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -9,41 +9,38 @@ #include "keylist.h" #include +#include #include -static struct bch_extent_ptr *bkey_find_ptr(struct bch_fs *c, - struct bkey_s_extent e, - struct bch_extent_ptr ptr) -{ - struct bch_extent_ptr *ptr2; - struct bch_dev *ca = c->devs[ptr.dev]; +struct moving_io { + struct list_head list; + struct closure cl; + bool read_completed; + unsigned sectors; - extent_for_each_ptr(e, ptr2) - if (ptr2->dev == ptr.dev && - ptr2->gen == ptr.gen && - PTR_BUCKET_NR(ca, ptr2) == - PTR_BUCKET_NR(ca, &ptr)) - return ptr2; + struct bch_read_bio rbio; - return NULL; -} + struct migrate_write write; + /* Must be last since it is variable size */ + struct bio_vec bi_inline_vecs[0]; +}; -static struct bch_extent_ptr *bch2_migrate_matching_ptr(struct migrate_write *m, - struct bkey_s_extent e) -{ - const struct bch_extent_ptr *ptr; - struct 
bch_extent_ptr *ret; +struct moving_context { + /* Closure for waiting on all reads and writes to complete */ + struct closure cl; - if (m->move) - ret = bkey_find_ptr(m->op.c, e, m->move_ptr); - else - extent_for_each_ptr(bkey_i_to_s_c_extent(&m->key), ptr) - if ((ret = bkey_find_ptr(m->op.c, e, *ptr))) - break; + /* Key and sector moves issued, updated from submission context */ + u64 keys_moved; + u64 sectors_moved; + atomic64_t sectors_raced; - return ret; -} + struct list_head reads; + + atomic_t sectors_in_flight; + + wait_queue_head_t wait; +}; static int bch2_migrate_index_update(struct bch_write_op *op) { @@ -59,71 +56,78 @@ static int bch2_migrate_index_update(struct bch_write_op *op) BTREE_ITER_INTENT); while (1) { - struct bkey_s_extent insert = - bkey_i_to_s_extent(bch2_keylist_front(keys)); struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter); + struct bkey_i_extent *insert, *new = + bkey_i_to_extent(bch2_keylist_front(keys)); + BKEY_PADDED(k) _new, _insert; struct bch_extent_ptr *ptr; - struct bkey_s_extent e; - BKEY_PADDED(k) new; + struct bch_extent_crc_unpacked crc; + bool did_work = false; - if (!k.k) { + if (btree_iter_err(k)) { ret = bch2_btree_iter_unlock(&iter); break; } - if (!bkey_extent_is_data(k.k)) + if (bversion_cmp(k.k->version, new->k.version) || + !bkey_extent_is_data(k.k) || + !bch2_extent_matches_ptr(c, bkey_s_c_to_extent(k), + m->ptr, m->offset)) goto nomatch; - bkey_reassemble(&new.k, k); - bch2_cut_front(iter.pos, &new.k); - bch2_cut_back(insert.k->p, &new.k.k); - e = bkey_i_to_s_extent(&new.k); - - /* hack - promotes can race: */ - if (m->promote) - extent_for_each_ptr(insert, ptr) - if (bch2_extent_has_device(e.c, ptr->dev)) - goto nomatch; - - ptr = bch2_migrate_matching_ptr(m, e); - if (ptr) { - int nr_new_dirty = bch2_extent_nr_dirty_ptrs(insert.s_c); - unsigned insert_flags = - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL; + bkey_reassemble(&_insert.k, k); + insert = bkey_i_to_extent(&_insert.k); + + bkey_copy(&_new.k, bch2_keylist_front(keys)); + new = bkey_i_to_extent(&_new.k); + + bch2_cut_front(iter.pos, &insert->k_i); + bch2_cut_back(new->k.p, &insert->k); + bch2_cut_back(insert->k.p, &new->k); + + if (m->move_dev >= 0 && + (ptr = (struct bch_extent_ptr *) + bch2_extent_has_device(extent_i_to_s_c(insert), + m->move_dev))) + bch2_extent_drop_ptr(extent_i_to_s(insert), ptr); - /* copygc uses btree node reserve: */ - if (m->move) - insert_flags |= BTREE_INSERT_USE_RESERVE; - if (m->move) { - nr_new_dirty -= !ptr->cached; - __bch2_extent_drop_ptr(e, ptr); + extent_for_each_ptr_crc(extent_i_to_s(new), ptr, crc) { + if (bch2_extent_has_device(extent_i_to_s_c(insert), ptr->dev)) { + /* + * raced with another move op? 
extent already + * has a pointer to the device we just wrote + * data to + */ + continue; } - BUG_ON(nr_new_dirty < 0); - - memcpy_u64s(extent_entry_last(e), - insert.v, - bkey_val_u64s(insert.k)); - e.k->u64s += bkey_val_u64s(insert.k); - - bch2_extent_narrow_crcs(e); - bch2_extent_drop_redundant_crcs(e); - bch2_extent_normalize(c, e.s); - bch2_extent_mark_replicas_cached(c, e, nr_new_dirty); - - ret = bch2_btree_insert_at(c, &op->res, - NULL, op_journal_seq(op), - insert_flags, - BTREE_INSERT_ENTRY(&iter, &new.k)); - if (ret && ret != -EINTR) - break; - } else { -nomatch: - bch2_btree_iter_advance_pos(&iter); + bch2_extent_crc_append(insert, crc); + extent_ptr_append(insert, *ptr); + did_work = true; } + if (!did_work) + goto nomatch; + + bch2_extent_narrow_crcs(insert, + (struct bch_extent_crc_unpacked) { 0 }); + bch2_extent_normalize(c, extent_i_to_s(insert).s); + bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert)); + + ret = bch2_btree_insert_at(c, &op->res, + NULL, op_journal_seq(op), + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + m->btree_insert_flags, + BTREE_INSERT_ENTRY(&iter, &insert->k_i)); + if (!ret) + atomic_long_inc(&c->extent_migrate_done); + if (ret == -EINTR) + ret = 0; + if (ret) + break; +next: while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) { bch2_keylist_pop_front(keys); if (bch2_keylist_empty(keys)) @@ -131,96 +135,83 @@ nomatch: } bch2_cut_front(iter.pos, bch2_keylist_front(keys)); + continue; +nomatch: + if (m->ctxt) + atomic64_add(k.k->p.offset - iter.pos.offset, + &m->ctxt->sectors_raced); + atomic_long_inc(&c->extent_migrate_raced); + trace_move_race(&new->k); + bch2_btree_iter_advance_pos(&iter); + goto next; } out: bch2_btree_iter_unlock(&iter); return ret; } -void bch2_migrate_write_init(struct bch_fs *c, - struct migrate_write *m, - struct bch_devs_mask *devs, - struct bkey_s_c k, - const struct bch_extent_ptr *move_ptr, - unsigned flags) +void bch2_migrate_write_init(struct migrate_write *m, + struct bch_read_bio *rbio) { - bkey_reassemble(&m->key, k); - - m->promote = false; - m->move = move_ptr != NULL; - if (move_ptr) - m->move_ptr = *move_ptr; - - if (bkey_extent_is_cached(k.k) || - (move_ptr && move_ptr->cached)) - flags |= BCH_WRITE_CACHED; + /* write bio must own pages: */ + BUG_ON(!m->op.wbio.bio.bi_vcnt); + + m->ptr = rbio->pick.ptr; + m->offset = rbio->pos.offset - rbio->pick.crc.offset; + m->op.devs_have = rbio->devs_have; + m->op.pos = rbio->pos; + m->op.version = rbio->version; + m->op.crc = rbio->pick.crc; + + if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) { + m->op.nonce = m->op.crc.nonce + m->op.crc.offset; + m->op.csum_type = m->op.crc.csum_type; + } - bch2_write_op_init(&m->op, c, (struct disk_reservation) { 0 }, - devs, (unsigned long) current, - bkey_start_pos(k.k), NULL, - flags|BCH_WRITE_ONLY_SPECIFIED_DEVS); + if (m->move_dev >= 0) + bch2_dev_list_drop_dev(&m->op.devs_have, m->move_dev); - if (m->move) + if (m->btree_insert_flags & BTREE_INSERT_USE_RESERVE) m->op.alloc_reserve = RESERVE_MOVINGGC; - m->op.nonce = extent_current_nonce(bkey_s_c_to_extent(k)); + m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS| + BCH_WRITE_PAGES_STABLE| + BCH_WRITE_PAGES_OWNED| + BCH_WRITE_DATA_ENCODED; + + m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; m->op.nr_replicas = 1; + m->op.nr_replicas_required = 1; m->op.index_update_fn = bch2_migrate_index_update; } -static void migrate_bio_init(struct moving_io *io, struct bio *bio, - unsigned sectors) +static void move_free(struct closure *cl) { - bio_init(bio, 
io->bi_inline_vecs, - DIV_ROUND_UP(sectors, PAGE_SECTORS)); - bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); - - bio->bi_iter.bi_size = sectors << 9; - bio->bi_private = &io->cl; - bch2_bio_map(bio, NULL); -} - -static void moving_io_free(struct moving_io *io) -{ - struct moving_context *ctxt = io->ctxt; + struct moving_io *io = container_of(cl, struct moving_io, cl); + struct moving_context *ctxt = io->write.ctxt; struct bio_vec *bv; int i; - atomic_sub(io->write.key.k.size, &ctxt->sectors_in_flight); - wake_up(&ctxt->wait); - bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i) if (bv->bv_page) __free_page(bv->bv_page); - kfree(io); -} - -static void moving_error(struct moving_context *ctxt, unsigned flag) -{ - atomic_inc(&ctxt->error_count); - //atomic_or(flag, &ctxt->error_flags); -} -static void moving_write_done(struct closure *cl) -{ - struct moving_io *io = container_of(cl, struct moving_io, cl); - - if (io->write.op.error) - moving_error(io->ctxt, MOVING_FLAG_WRITE); - - //if (io->replace.failures) - // trace_copy_collision(q, &io->key.k); + atomic_sub(io->sectors, &ctxt->sectors_in_flight); + wake_up(&ctxt->wait); - moving_io_free(io); + kfree(io); } -static void write_moving(struct closure *cl) +static void move_write(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); - struct bch_write_op *op = &io->write.op; - closure_call(&op->cl, bch2_write, NULL, &io->cl); - closure_return_with_destructor(&io->cl, moving_write_done); + if (likely(!io->rbio.bio.bi_error)) { + bch2_migrate_write_init(&io->write, &io->rbio); + closure_call(&io->write.op.cl, bch2_write, NULL, cl); + } + + closure_return_with_destructor(cl, move_free); } static inline struct moving_io *next_pending_write(struct moving_context *ctxt) @@ -231,16 +222,10 @@ static inline struct moving_io *next_pending_write(struct moving_context *ctxt) return io && io->read_completed ? io : NULL; } -static void read_moving_endio(struct bio *bio) +static void move_read_endio(struct bio *bio) { - struct closure *cl = bio->bi_private; - struct moving_io *io = container_of(cl, struct moving_io, cl); - struct moving_context *ctxt = io->ctxt; - - trace_move_read_done(&io->write.key.k); - - if (bio->bi_error) - moving_error(io->ctxt, MOVING_FLAG_READ); + struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); + struct moving_context *ctxt = io->write.ctxt; io->read_completed = true; if (next_pending_write(ctxt)) @@ -249,58 +234,81 @@ static void read_moving_endio(struct bio *bio) closure_put(&ctxt->cl); } -int bch2_data_move(struct bch_fs *c, - struct moving_context *ctxt, - struct bch_devs_mask *devs, - struct bkey_s_c k, - const struct bch_extent_ptr *move_ptr) +static int bch2_move_extent(struct bch_fs *c, + struct moving_context *ctxt, + struct bch_devs_mask *devs, + struct write_point_specifier wp, + int btree_insert_flags, + int move_device, + struct bkey_s_c k) { struct extent_pick_ptr pick; struct moving_io *io; + const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + unsigned sectors = k.k->size, pages; - bch2_extent_pick_ptr(c, k, &ctxt->avoid, &pick); + bch2_extent_pick_ptr(c, k, NULL, &pick); if (IS_ERR_OR_NULL(pick.ca)) return pick.ca ? 
PTR_ERR(pick.ca) : 0; - io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) * - DIV_ROUND_UP(k.k->size, PAGE_SECTORS), GFP_KERNEL); - if (!io) - return -ENOMEM; + /* write path might have to decompress data: */ + extent_for_each_ptr_crc(bkey_s_c_to_extent(k), ptr, crc) + sectors = max_t(unsigned, sectors, crc.uncompressed_size); - io->ctxt = ctxt; + pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); + io = kzalloc(sizeof(struct moving_io) + + sizeof(struct bio_vec) * pages, GFP_KERNEL); + if (!io) + goto err; - migrate_bio_init(io, &io->rbio.bio, k.k->size); + io->write.ctxt = ctxt; + io->sectors = k.k->size; - bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0); - io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); - io->rbio.bio.bi_end_io = read_moving_endio; + bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages); + bio_set_prio(&io->write.op.wbio.bio, + IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + io->write.op.wbio.bio.bi_iter.bi_size = sectors << 9; - if (bio_alloc_pages(&io->rbio.bio, GFP_KERNEL)) { + bch2_bio_map(&io->write.op.wbio.bio, NULL); + if (bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL)) { kfree(io); - return -ENOMEM; + goto err; } - migrate_bio_init(io, &io->write.op.wbio.bio, k.k->size); + bio_init(&io->rbio.bio, io->bi_inline_vecs, pages); + bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + io->rbio.bio.bi_iter.bi_size = sectors << 9; - bch2_migrate_write_init(c, &io->write, devs, k, move_ptr, 0); + bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0); + io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); + io->rbio.bio.bi_end_io = move_read_endio; - trace_move_read(&io->write.key.k); + __bch2_write_op_init(&io->write.op, c); + io->write.btree_insert_flags = btree_insert_flags; + io->write.move_dev = move_device; + io->write.op.devs = devs; + io->write.op.write_point = wp; ctxt->keys_moved++; ctxt->sectors_moved += k.k->size; - if (ctxt->rate) - bch2_ratelimit_increment(ctxt->rate, k.k->size); - atomic_add(k.k->size, &ctxt->sectors_in_flight); + trace_move_extent(k.k); + + atomic_add(io->sectors, &ctxt->sectors_in_flight); list_add_tail(&io->list, &ctxt->reads); /* - * dropped by read_moving_endio() - guards against use after free of + * dropped by move_read_endio() - guards against use after free of * ctxt when doing wakeup */ - closure_get(&io->ctxt->cl); - bch2_read_extent(c, &io->rbio, k, &pick, 0); + closure_get(&ctxt->cl); + bch2_read_extent(c, &io->rbio, bkey_s_c_to_extent(k), + &pick, BCH_READ_NODECODE); return 0; +err: + trace_move_alloc_fail(k.k); + return -ENOMEM; } static void do_pending_writes(struct moving_context *ctxt) @@ -309,14 +317,7 @@ static void do_pending_writes(struct moving_context *ctxt) while ((io = next_pending_write(ctxt))) { list_del(&io->list); - - if (io->rbio.bio.bi_error) { - moving_io_free(io); - continue; - } - - trace_move_write(&io->write.key.k); - closure_call(&io->cl, write_moving, NULL, &ctxt->cl); + closure_call(&io->cl, move_write, NULL, &ctxt->cl); } } @@ -330,18 +331,7 @@ do { \ next_pending_write(_ctxt) || (_cond)); \ } while (1) -int bch2_move_ctxt_wait(struct moving_context *ctxt) -{ - move_ctxt_wait_event(ctxt, - atomic_read(&ctxt->sectors_in_flight) < - ctxt->max_sectors_in_flight); - - return ctxt->rate - ? 
bch2_ratelimit_wait_freezable_stoppable(ctxt->rate) - : 0; -} - -void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) +static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) { unsigned sectors_pending = atomic_read(&ctxt->sectors_in_flight); @@ -350,7 +340,7 @@ void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) atomic_read(&ctxt->sectors_in_flight) != sectors_pending); } -void bch2_move_ctxt_exit(struct moving_context *ctxt) +static void bch2_move_ctxt_exit(struct moving_context *ctxt) { move_ctxt_wait_event(ctxt, !atomic_read(&ctxt->sectors_in_flight)); closure_sync(&ctxt->cl); @@ -359,16 +349,92 @@ void bch2_move_ctxt_exit(struct moving_context *ctxt) EBUG_ON(atomic_read(&ctxt->sectors_in_flight)); } -void bch2_move_ctxt_init(struct moving_context *ctxt, - struct bch_ratelimit *rate, - unsigned max_sectors_in_flight) +static void bch2_move_ctxt_init(struct moving_context *ctxt) { memset(ctxt, 0, sizeof(*ctxt)); closure_init_stack(&ctxt->cl); - ctxt->rate = rate; - ctxt->max_sectors_in_flight = max_sectors_in_flight; - INIT_LIST_HEAD(&ctxt->reads); init_waitqueue_head(&ctxt->wait); } + +int bch2_move_data(struct bch_fs *c, + struct bch_ratelimit *rate, + unsigned sectors_in_flight, + struct bch_devs_mask *devs, + struct write_point_specifier wp, + int btree_insert_flags, + int move_device, + move_pred_fn pred, void *arg, + u64 *keys_moved, + u64 *sectors_moved) +{ + bool kthread = (current->flags & PF_KTHREAD) != 0; + struct moving_context ctxt; + struct btree_iter iter; + BKEY_PADDED(k) tmp; + struct bkey_s_c k; + int ret = 0; + + bch2_move_ctxt_init(&ctxt); + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, + BTREE_ITER_PREFETCH); + + if (rate) + bch2_ratelimit_reset(rate); + + while (!kthread || !(ret = kthread_should_stop())) { + if (atomic_read(&ctxt.sectors_in_flight) >= sectors_in_flight) { + bch2_btree_iter_unlock(&iter); + move_ctxt_wait_event(&ctxt, + atomic_read(&ctxt.sectors_in_flight) < + sectors_in_flight); + } + + if (rate && + bch2_ratelimit_delay(rate) && + (bch2_btree_iter_unlock(&iter), + (ret = bch2_ratelimit_wait_freezable_stoppable(rate)))) + break; + + k = bch2_btree_iter_peek(&iter); + if (!k.k) + break; + ret = btree_iter_err(k); + if (ret) + break; + + if (!bkey_extent_is_data(k.k) || + !pred(arg, bkey_s_c_to_extent(k))) + goto next; + + /* unlock before doing IO: */ + bkey_reassemble(&tmp.k, k); + k = bkey_i_to_s_c(&tmp.k); + bch2_btree_iter_unlock(&iter); + + if (bch2_move_extent(c, &ctxt, devs, wp, + btree_insert_flags, + move_device, k)) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(&ctxt); + continue; + } + + if (rate) + bch2_ratelimit_increment(rate, k.k->size); +next: + bch2_btree_iter_advance_pos(&iter); + bch2_btree_iter_cond_resched(&iter); + } + + bch2_btree_iter_unlock(&iter); + bch2_move_ctxt_exit(&ctxt); + + trace_move_data(c, ctxt.sectors_moved, ctxt.keys_moved); + + *keys_moved = ctxt.keys_moved; + *sectors_moved = ctxt.sectors_moved; + + return ret; +} diff --git a/libbcachefs/move.h b/libbcachefs/move.h index a756a46..2e884ce 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -4,77 +4,31 @@ #include "buckets.h" #include "io_types.h" -enum moving_flag_bitnos { - MOVING_FLAG_BITNO_READ = 0, - MOVING_FLAG_BITNO_WRITE, -}; - -#define MOVING_FLAG_READ (1U << MOVING_FLAG_BITNO_READ) -#define MOVING_FLAG_WRITE (1U << MOVING_FLAG_BITNO_WRITE) +struct bch_read_bio; +struct moving_context; struct migrate_write { - BKEY_PADDED(key); - bool promote; - bool move; 
- struct bch_extent_ptr move_ptr; - struct bch_write_op op; -}; - -void bch2_migrate_write_init(struct bch_fs *, struct migrate_write *, - struct bch_devs_mask *, struct bkey_s_c, - const struct bch_extent_ptr *, unsigned); - -#define SECTORS_IN_FLIGHT_PER_DEVICE 2048 - -struct moving_context { - /* Closure for waiting on all reads and writes to complete */ - struct closure cl; - - /* Number and types of errors reported */ - atomic_t error_count; - atomic_t error_flags; - - /* Key and sector moves issued, updated from submission context */ - u64 keys_moved; - u64 sectors_moved; - - /* Rate-limiter counting submitted reads */ - struct bch_ratelimit *rate; - - /* Try to avoid reading the following device */ - struct bch_devs_mask avoid; - - struct list_head reads; + struct moving_context *ctxt; - /* Configuration */ - unsigned max_sectors_in_flight; - atomic_t sectors_in_flight; + /* what we read: */ + struct bch_extent_ptr ptr; + u64 offset; - wait_queue_head_t wait; + int move_dev; + int btree_insert_flags; + struct bch_write_op op; }; -struct moving_io { - struct list_head list; - struct rb_node node; - struct closure cl; - struct moving_context *ctxt; - struct migrate_write write; - bool read_completed; - - struct bch_read_bio rbio; - /* Must be last since it is variable size */ - struct bio_vec bi_inline_vecs[0]; -}; +void bch2_migrate_write_init(struct migrate_write *, struct bch_read_bio *); -int bch2_data_move(struct bch_fs *, struct moving_context *, - struct bch_devs_mask *, struct bkey_s_c, - const struct bch_extent_ptr *); +#define SECTORS_IN_FLIGHT_PER_DEVICE 2048 -int bch2_move_ctxt_wait(struct moving_context *); -void bch2_move_ctxt_wait_for_io(struct moving_context *); +typedef bool (*move_pred_fn)(void *, struct bkey_s_c_extent); -void bch2_move_ctxt_exit(struct moving_context *); -void bch2_move_ctxt_init(struct moving_context *, struct bch_ratelimit *, - unsigned); +int bch2_move_data(struct bch_fs *, struct bch_ratelimit *, + unsigned, struct bch_devs_mask *, + struct write_point_specifier, + int, int, move_pred_fn, void *, + u64 *, u64 *); #endif /* _BCACHEFS_MOVE_H */ diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 125159e..728be2b 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -6,6 +6,7 @@ #include "bcachefs.h" #include "btree_iter.h" +#include "btree_update.h" #include "buckets.h" #include "clock.h" #include "extents.h" @@ -23,137 +24,63 @@ #include #include -/* Moving GC - IO loop */ - -static int bucket_idx_cmp(const void *_l, const void *_r, size_t size) -{ - const struct bucket_heap_entry *l = _l; - const struct bucket_heap_entry *r = _r; +/* + * We can't use the entire copygc reserve in one iteration of copygc: we may + * need the buckets we're freeing up to go back into the copygc reserve to make + * forward progress, but if the copygc reserve is full they'll be available for + * any allocation - and it's possible that in a given iteration, we free up most + * of the buckets we're going to free before we allocate most of the buckets + * we're going to allocate. 
+ * + * If we only use half of the reserve per iteration, then in steady state we'll + * always have room in the reserve for the buckets we're going to need in the + * next iteration: + */ +#define COPYGC_BUCKETS_PER_ITER(ca) \ + ((ca)->free[RESERVE_MOVINGGC].size / 2) - if (l->bucket < r->bucket) - return -1; - if (l->bucket > r->bucket) - return 1; - return 0; -} +/* + * Max sectors to move per iteration: Have to take into account internal + * fragmentation from the multiple write points for each generation: + */ +#define COPYGC_SECTORS_PER_ITER(ca) \ + ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca)) -static const struct bch_extent_ptr *moving_pred(struct bch_dev *ca, - struct bkey_s_c k) +static inline int sectors_used_cmp(copygc_heap *heap, + struct copygc_heap_entry l, + struct copygc_heap_entry r) { - bucket_heap *h = &ca->copygc_heap; - const struct bch_extent_ptr *ptr; - - if (bkey_extent_is_data(k.k) && - (ptr = bch2_extent_has_device(bkey_s_c_to_extent(k), - ca->dev_idx))) { - struct bucket_heap_entry search = { - .bucket = PTR_BUCKET_NR(ca, ptr) - }; - - size_t i = eytzinger0_find(h->data, h->used, - sizeof(h->data[0]), - bucket_idx_cmp, &search); - - if (i < h->used) - return ptr; - } - - return NULL; + return bucket_sectors_used(l.mark) - bucket_sectors_used(r.mark); } -static int issue_moving_gc_move(struct bch_dev *ca, - struct moving_context *ctxt, - struct bkey_s_c k) +static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) { - struct bch_fs *c = ca->fs; - const struct bch_extent_ptr *ptr; - int ret; + const struct copygc_heap_entry *l = _l; + const struct copygc_heap_entry *r = _r; - ptr = moving_pred(ca, k); - if (!ptr) /* We raced - bucket's been reused */ - return 0; - - ret = bch2_data_move(c, ctxt, &ca->self, k, ptr); - if (!ret) - trace_gc_copy(k.k); - else - trace_moving_gc_alloc_fail(c, k.k->size); - return ret; + return (l->offset > r->offset) - (l->offset < r->offset); } -static void read_moving(struct bch_dev *ca, size_t buckets_to_move, - u64 sectors_to_move) +static bool copygc_pred(void *arg, struct bkey_s_c_extent e) { - struct bch_fs *c = ca->fs; - bucket_heap *h = &ca->copygc_heap; - struct moving_context ctxt; - struct btree_iter iter; - struct bkey_s_c k; - u64 sectors_not_moved = 0; - size_t buckets_not_moved = 0; - struct bucket_heap_entry *i; - - bch2_ratelimit_reset(&ca->moving_gc_pd.rate); - bch2_move_ctxt_init(&ctxt, &ca->moving_gc_pd.rate, - SECTORS_IN_FLIGHT_PER_DEVICE); - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, - BTREE_ITER_PREFETCH); - - while (1) { - if (kthread_should_stop()) - goto out; - if (bch2_move_ctxt_wait(&ctxt)) - goto out; - k = bch2_btree_iter_peek(&iter); - if (!k.k) - break; - if (btree_iter_err(k)) - goto out; - - if (!moving_pred(ca, k)) - goto next; + struct bch_dev *ca = arg; + copygc_heap *h = &ca->copygc_heap; + const struct bch_extent_ptr *ptr = + bch2_extent_has_device(e, ca->dev_idx); - if (issue_moving_gc_move(ca, &ctxt, k)) { - bch2_btree_iter_unlock(&iter); + if (ptr) { + struct copygc_heap_entry search = { .offset = ptr->offset }; - /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(&ctxt); - continue; - } -next: - bch2_btree_iter_advance_pos(&iter); - //bch2_btree_iter_cond_resched(&iter); + size_t i = eytzinger0_find_le(h->data, h->used, + sizeof(h->data[0]), + bucket_offset_cmp, &search); - /* unlock before calling moving_context_wait() */ - bch2_btree_iter_unlock(&iter); - cond_resched(); + return (i >= 0 && + ptr->offset < 
h->data[i].offset + ca->mi.bucket_size && + ptr->gen == h->data[i].mark.gen); } - bch2_btree_iter_unlock(&iter); - bch2_move_ctxt_exit(&ctxt); - trace_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved, - buckets_to_move); - - /* don't check this if we bailed out early: */ - for (i = h->data; i < h->data + h->used; i++) { - struct bucket_mark m = READ_ONCE(ca->buckets[i->bucket].mark); - - if (i->mark.gen == m.gen && bucket_sectors_used(m)) { - sectors_not_moved += bucket_sectors_used(m); - buckets_not_moved++; - } - } - - if (sectors_not_moved) - bch_warn(c, "copygc finished but %llu/%llu sectors, %zu/%zu buckets not moved", - sectors_not_moved, sectors_to_move, - buckets_not_moved, buckets_to_move); - return; -out: - bch2_btree_iter_unlock(&iter); - bch2_move_ctxt_exit(&ctxt); - trace_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved, - buckets_to_move); + return false; } static bool have_copygc_reserve(struct bch_dev *ca) @@ -168,38 +95,17 @@ static bool have_copygc_reserve(struct bch_dev *ca) return ret; } -static inline int sectors_used_cmp(bucket_heap *heap, - struct bucket_heap_entry l, - struct bucket_heap_entry r) -{ - return bucket_sectors_used(l.mark) - bucket_sectors_used(r.mark); -} - -static void bch2_moving_gc(struct bch_dev *ca) +static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) { - struct bch_fs *c = ca->fs; + copygc_heap *h = &ca->copygc_heap; + struct copygc_heap_entry e, *i; struct bucket *g; - u64 sectors_to_move = 0; - size_t buckets_to_move, buckets_unused = 0; - struct bucket_heap_entry e, *i; - int reserve_sectors; - - if (!have_copygc_reserve(ca)) { - struct closure cl; - - closure_init_stack(&cl); - while (1) { - closure_wait(&c->freelist_wait, &cl); - if (have_copygc_reserve(ca)) - break; - closure_sync(&cl); - } - closure_wake_up(&c->freelist_wait); - } - - reserve_sectors = COPYGC_SECTORS_PER_ITER(ca); + u64 keys_moved, sectors_moved; + u64 sectors_to_move = 0, sectors_not_moved = 0; + u64 buckets_to_move, buckets_not_moved = 0; + int ret; - trace_moving_gc_start(ca); + closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); /* * Find buckets with lowest sector counts, skipping completely @@ -213,48 +119,73 @@ static void bch2_moving_gc(struct bch_dev *ca) * them: */ down_read(&c->gc_lock); - ca->copygc_heap.used = 0; + h->used = 0; for_each_bucket(g, ca) { struct bucket_mark m = READ_ONCE(g->mark); - struct bucket_heap_entry e = { g - ca->buckets, m }; - - if (bucket_unused(m)) { - buckets_unused++; - continue; - } + struct copygc_heap_entry e; if (m.owned_by_allocator || - m.data_type != BUCKET_DATA) + m.data_type != BUCKET_DATA || + !bucket_sectors_used(m) || + bucket_sectors_used(m) >= ca->mi.bucket_size) continue; - if (bucket_sectors_used(m) >= ca->mi.bucket_size) - continue; - - heap_add_or_replace(&ca->copygc_heap, e, -sectors_used_cmp); + e = (struct copygc_heap_entry) { + .offset = bucket_to_sector(ca, g - ca->buckets), + .mark = m + }; + heap_add_or_replace(h, e, -sectors_used_cmp); } up_read(&c->gc_lock); - for (i = ca->copygc_heap.data; - i < ca->copygc_heap.data + ca->copygc_heap.used; - i++) + for (i = h->data; i < h->data + h->used; i++) sectors_to_move += bucket_sectors_used(i->mark); while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) { - BUG_ON(!heap_pop(&ca->copygc_heap, e, -sectors_used_cmp)); + BUG_ON(!heap_pop(h, e, -sectors_used_cmp)); sectors_to_move -= bucket_sectors_used(e.mark); } - buckets_to_move = ca->copygc_heap.used; + buckets_to_move = h->used; + + if (!buckets_to_move) + return; + + 
eytzinger0_sort(h->data, h->used, + sizeof(h->data[0]), + bucket_offset_cmp, NULL); + + ret = bch2_move_data(c, &ca->copygc_pd.rate, + SECTORS_IN_FLIGHT_PER_DEVICE, + &ca->self, + writepoint_ptr(&ca->copygc_write_point), + BTREE_INSERT_USE_RESERVE, + ca->dev_idx, + copygc_pred, ca, + &keys_moved, + §ors_moved); + + for (i = h->data; i < h->data + h->used; i++) { + size_t bucket = sector_to_bucket(ca, i->offset); + struct bucket_mark m = READ_ONCE(ca->buckets[bucket].mark); + + if (i->mark.gen == m.gen && bucket_sectors_used(m)) { + sectors_not_moved += bucket_sectors_used(m); + buckets_not_moved++; + } + } - eytzinger0_sort(ca->copygc_heap.data, - ca->copygc_heap.used, - sizeof(ca->copygc_heap.data[0]), - bucket_idx_cmp, NULL); + if (sectors_not_moved && !ret) + bch_warn(c, "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved", + sectors_not_moved, sectors_to_move, + buckets_not_moved, buckets_to_move); - read_moving(ca, buckets_to_move, sectors_to_move); + trace_copygc(ca, + sectors_moved, sectors_not_moved, + buckets_to_move, buckets_not_moved); } -static int bch2_moving_gc_thread(void *arg) +static int bch2_copygc_thread(void *arg) { struct bch_dev *ca = arg; struct bch_fs *c = ca->fs; @@ -273,7 +204,7 @@ static int bch2_moving_gc_thread(void *arg) * don't start copygc until less than half the gc reserve is * available: */ - available = dev_buckets_available(ca); + available = dev_buckets_available(c, ca); want = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) * c->opts.gc_reserve_percent, 200); if (available > want) { @@ -283,46 +214,46 @@ static int bch2_moving_gc_thread(void *arg) continue; } - bch2_moving_gc(ca); + bch2_copygc(c, ca); } return 0; } -void bch2_moving_gc_stop(struct bch_dev *ca) +void bch2_copygc_stop(struct bch_dev *ca) { - ca->moving_gc_pd.rate.rate = UINT_MAX; - bch2_ratelimit_reset(&ca->moving_gc_pd.rate); + ca->copygc_pd.rate.rate = UINT_MAX; + bch2_ratelimit_reset(&ca->copygc_pd.rate); - if (ca->moving_gc_read) - kthread_stop(ca->moving_gc_read); - ca->moving_gc_read = NULL; + if (ca->copygc_thread) + kthread_stop(ca->copygc_thread); + ca->copygc_thread = NULL; } -int bch2_moving_gc_start(struct bch_dev *ca) +int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca) { struct task_struct *t; - BUG_ON(ca->moving_gc_read); + BUG_ON(ca->copygc_thread); - if (ca->fs->opts.nochanges) + if (c->opts.nochanges) return 0; - if (bch2_fs_init_fault("moving_gc_start")) + if (bch2_fs_init_fault("copygc_start")) return -ENOMEM; - t = kthread_create(bch2_moving_gc_thread, ca, "bch_copygc_read"); + t = kthread_create(bch2_copygc_thread, ca, "bch_copygc"); if (IS_ERR(t)) return PTR_ERR(t); - ca->moving_gc_read = t; - wake_up_process(ca->moving_gc_read); + ca->copygc_thread = t; + wake_up_process(ca->copygc_thread); return 0; } -void bch2_dev_moving_gc_init(struct bch_dev *ca) +void bch2_dev_copygc_init(struct bch_dev *ca) { - bch2_pd_controller_init(&ca->moving_gc_pd); - ca->moving_gc_pd.d_term = 0; + bch2_pd_controller_init(&ca->copygc_pd); + ca->copygc_pd.d_term = 0; } diff --git a/libbcachefs/movinggc.h b/libbcachefs/movinggc.h index d835d13..c46fa1f 100644 --- a/libbcachefs/movinggc.h +++ b/libbcachefs/movinggc.h @@ -1,30 +1,8 @@ #ifndef _BCACHEFS_MOVINGGC_H #define _BCACHEFS_MOVINGGC_H -/* - * We can't use the entire copygc reserve in one iteration of copygc: we may - * need the buckets we're freeing up to go back into the copygc reserve to make - * forward progress, but if the copygc reserve is full they'll be available for - * any allocation - and it's 
possible that in a given iteration, we free up most - * of the buckets we're going to free before we allocate most of the buckets - * we're going to allocate. - * - * If we only use half of the reserve per iteration, then in steady state we'll - * always have room in the reserve for the buckets we're going to need in the - * next iteration: - */ -#define COPYGC_BUCKETS_PER_ITER(ca) \ - ((ca)->free[RESERVE_MOVINGGC].size / 2) - -/* - * Max sectors to move per iteration: Have to take into account internal - * fragmentation from the multiple write points for each generation: - */ -#define COPYGC_SECTORS_PER_ITER(ca) \ - ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca)) - -void bch2_moving_gc_stop(struct bch_dev *); -int bch2_moving_gc_start(struct bch_dev *); -void bch2_dev_moving_gc_init(struct bch_dev *); +void bch2_copygc_stop(struct bch_dev *); +int bch2_copygc_start(struct bch_fs *, struct bch_dev *); +void bch2_dev_copygc_init(struct bch_dev *); #endif /* _BCACHEFS_MOVINGGC_H */ diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 1e4eafb..a3ecfb9 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -425,6 +425,11 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) if (err) return err; + if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_NONCE_V1 && + bch2_sb_get_crypt(sb) && + BCH_SB_INITIALIZED(sb)) + return "Incompatible extent nonces"; + sb->version = cpu_to_le64(BCH_SB_VERSION_MAX); return NULL; diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 0342778..4e8b0a5 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -20,6 +20,7 @@ #include "debug.h" #include "error.h" #include "fs.h" +#include "fs-io.h" #include "fsck.h" #include "inode.h" #include "io.h" @@ -209,7 +210,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch2_tiering_stop(c); for_each_member_device(ca, c, i) - bch2_moving_gc_stop(ca); + bch2_copygc_stop(ca); bch2_gc_thread_stop(c); @@ -258,12 +259,8 @@ void bch2_fs_read_only(struct bch_fs *c) */ percpu_ref_kill(&c->writes); - del_timer(&c->foreground_write_wakeup); cancel_delayed_work(&c->pd_controllers_update); - c->foreground_write_pd.rate.rate = UINT_MAX; - bch2_wake_delayed_writes((unsigned long) c); - /* * If we're not doing an emergency shutdown, we want to wait on * outstanding writes to complete so they don't see spurious errors due @@ -348,9 +345,9 @@ const char *bch2_fs_read_write(struct bch_fs *c) if (bch2_gc_thread_start(c)) goto err; - err = "error starting moving GC thread"; + err = "error starting copygc thread"; for_each_rw_member(ca, c, i) - if (bch2_moving_gc_start(ca)) { + if (bch2_copygc_start(c, ca)) { percpu_ref_put(&ca->io_ref); goto err; } @@ -375,6 +372,7 @@ err: static void bch2_fs_free(struct bch_fs *c) { + bch2_fs_fsio_exit(c); bch2_fs_encryption_exit(c); bch2_fs_btree_cache_exit(c); bch2_fs_journal_exit(&c->journal); @@ -411,7 +409,6 @@ static void bch2_fs_exit(struct bch_fs *c) { unsigned i; - del_timer_sync(&c->foreground_write_wakeup); cancel_delayed_work_sync(&c->pd_controllers_update); cancel_work_sync(&c->read_only_work); @@ -535,8 +532,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->tiering_enabled = 1; c->tiering_percent = 10; - c->foreground_target_percent = 20; - c->journal.write_time = &c->journal_write_time; c->journal.delay_time = &c->journal_delay_time; c->journal.blocked_time = &c->journal_blocked_time; @@ -600,7 +595,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) 
bch2_fs_btree_cache_init(c) || bch2_fs_encryption_init(c) || bch2_fs_compress_init(c) || - bch2_check_set_has_compressed_data(c, c->opts.compression)) + bch2_check_set_has_compressed_data(c, c->opts.compression) || + bch2_fs_fsio_init(c)) goto err; c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; @@ -1105,8 +1101,10 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ca->dev_idx = dev_idx; __set_bit(ca->dev_idx, ca->self.d); + writepoint_init(&ca->copygc_write_point, BCH_DATA_USER); + spin_lock_init(&ca->freelist_lock); - bch2_dev_moving_gc_init(ca); + bch2_dev_copygc_init(ca); INIT_WORK(&ca->io_error_work, bch2_io_error_work); @@ -1224,10 +1222,7 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb) if (bch2_dev_sysfs_online(ca)) pr_warn("error creating sysfs objects"); - lg_local_lock(&c->usage_lock); - if (!gc_will_visit(c, gc_phase(GC_PHASE_SB_METADATA))) - bch2_mark_dev_metadata(c, ca); - lg_local_unlock(&c->usage_lock); + bch2_mark_dev_superblock(c, ca, 0); if (ca->mi.state == BCH_MEMBER_STATE_RW) bch2_dev_allocator_add(c, ca); @@ -1324,7 +1319,7 @@ static bool bch2_fs_may_start(struct bch_fs *c) static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) { - bch2_moving_gc_stop(ca); + bch2_copygc_stop(ca); /* * This stops new data writes (e.g. to existing open data @@ -1347,8 +1342,8 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) if (bch2_dev_allocator_start(ca)) return "error starting allocator thread"; - if (bch2_moving_gc_start(ca)) - return "error starting moving GC thread"; + if (bch2_copygc_start(c, ca)) + return "error starting copygc thread"; if (bch2_tiering_start(c)) return "error starting tiering thread"; diff --git a/libbcachefs/super.h b/libbcachefs/super.h index 18e36c0..eb1d2f3 100644 --- a/libbcachefs/super.h +++ b/libbcachefs/super.h @@ -35,6 +35,30 @@ static inline unsigned dev_mask_nr(struct bch_devs_mask *devs) return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); } +static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, + unsigned dev) +{ + unsigned i; + + for (i = 0; i < devs.nr; i++) + if (devs.devs[i] == dev) + return true; + + return false; +} + +static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, + unsigned dev) +{ + unsigned i; + + for (i = 0; i < devs->nr; i++) + if (devs->devs[i] == dev) { + array_remove_item(devs->devs, devs->nr, i); + return; + } +} + static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, struct bch_devs_mask *mask) { diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h index 756dfeb..35c8beb 100644 --- a/libbcachefs/super_types.h +++ b/libbcachefs/super_types.h @@ -13,4 +13,33 @@ struct bch_devs_mask { unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)]; }; +struct bch_devs_list { + u8 nr; + u8 devs[BCH_REPLICAS_MAX]; +}; + +struct bch_member_cpu { + u64 nbuckets; /* device size */ + u16 first_bucket; /* index of first bucket used */ + u16 bucket_size; /* sectors */ + u8 state; + u8 tier; + u8 replacement; + u8 discard; + u8 data_allowed; + u8 valid; +}; + +struct bch_replicas_cpu_entry { + u8 data_type; + u8 devs[BCH_SB_MEMBERS_MAX / 8]; +}; + +struct bch_replicas_cpu { + struct rcu_head rcu; + unsigned nr; + unsigned entry_size; + struct bch_replicas_cpu_entry entries[]; +}; + #endif /* _BCACHEFS_SUPER_TYPES_H */ diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index c20769b..35f1e56 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -161,8 +161,11 @@ 
read_attribute(meta_buckets); read_attribute(alloc_buckets); read_attribute(has_data); read_attribute(alloc_debug); +write_attribute(wake_allocator); read_attribute(read_realloc_races); +read_attribute(extent_migrate_done); +read_attribute(extent_migrate_raced); rw_attribute(journal_write_delay_ms); rw_attribute(journal_reclaim_delay_ms); @@ -170,7 +173,6 @@ rw_attribute(journal_reclaim_delay_ms); rw_attribute(discard); rw_attribute(cache_replacement_policy); -rw_attribute(foreground_write_ratelimit_enabled); rw_attribute(copy_gc_enabled); sysfs_pd_controller_attribute(copy_gc); @@ -179,12 +181,9 @@ rw_attribute(tiering_enabled); rw_attribute(tiering_percent); sysfs_pd_controller_attribute(tiering); -sysfs_pd_controller_attribute(foreground_write); rw_attribute(pd_controllers_update_seconds); -rw_attribute(foreground_target_percent); - read_attribute(meta_replicas_have); read_attribute(data_replicas_have); @@ -272,18 +271,18 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) if (k.k->type == BCH_EXTENT) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const struct bch_extent_ptr *ptr; - const union bch_extent_crc *crc; + struct bch_extent_crc_unpacked crc; extent_for_each_ptr_crc(e, ptr, crc) { - if (crc_compression_type(crc) == BCH_COMPRESSION_NONE) { + if (crc.compression_type == BCH_COMPRESSION_NONE) { nr_uncompressed_extents++; uncompressed_sectors += e.k->size; } else { nr_compressed_extents++; compressed_sectors_compressed += - crc_compressed_size(e.k, crc); + crc.compressed_size; compressed_sectors_uncompressed += - crc_uncompressed_size(e.k, crc); + crc.uncompressed_size; } /* only looking at the first ptr */ @@ -323,17 +322,17 @@ SHOW(bch2_fs) sysfs_print(read_realloc_races, atomic_long_read(&c->read_realloc_races)); + sysfs_print(extent_migrate_done, + atomic_long_read(&c->extent_migrate_done)); + sysfs_print(extent_migrate_raced, + atomic_long_read(&c->extent_migrate_raced)); sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); - sysfs_printf(foreground_write_ratelimit_enabled, "%i", - c->foreground_write_ratelimit_enabled); sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); - sysfs_pd_controller_show(foreground_write, &c->foreground_write_pd); sysfs_print(pd_controllers_update_seconds, c->pd_controllers_update_seconds); - sysfs_print(foreground_target_percent, c->foreground_target_percent); sysfs_printf(tiering_enabled, "%i", c->tiering_enabled); sysfs_print(tiering_percent, c->tiering_percent); @@ -371,9 +370,6 @@ STORE(__bch2_fs) sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms); sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); - sysfs_strtoul(foreground_write_ratelimit_enabled, - c->foreground_write_ratelimit_enabled); - if (attr == &sysfs_btree_gc_periodic) { ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) ?: (ssize_t) size; @@ -389,8 +385,8 @@ STORE(__bch2_fs) ?: (ssize_t) size; for_each_member_device(ca, c, i) - if (ca->moving_gc_read) - wake_up_process(ca->moving_gc_read); + if (ca->copygc_thread) + wake_up_process(ca->copygc_thread); return ret; } @@ -402,11 +398,8 @@ STORE(__bch2_fs) return ret; } - sysfs_pd_controller_store(foreground_write, &c->foreground_write_pd); - sysfs_strtoul(pd_controllers_update_seconds, c->pd_controllers_update_seconds); - sysfs_strtoul(foreground_target_percent, c->foreground_target_percent); sysfs_strtoul(tiering_percent, c->tiering_percent); sysfs_pd_controller_store(tiering, &c->tiers[1].pd); /* XXX */ @@ -466,7 +459,6 @@ struct attribute *bch2_fs_files[] 
= { &sysfs_journal_write_delay_ms, &sysfs_journal_reclaim_delay_ms, - &sysfs_foreground_target_percent, &sysfs_tiering_percent, &sysfs_compression_stats, @@ -494,17 +486,17 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_journal_pins, &sysfs_read_realloc_races, + &sysfs_extent_migrate_done, + &sysfs_extent_migrate_raced, &sysfs_trigger_journal_flush, &sysfs_trigger_btree_coalesce, &sysfs_trigger_gc, &sysfs_prune_cache, - &sysfs_foreground_write_ratelimit_enabled, &sysfs_copy_gc_enabled, &sysfs_tiering_enabled, sysfs_pd_controller_files(tiering), - sysfs_pd_controller_files(foreground_write), &sysfs_internal_uuid, #define BCH_DEBUG_PARAM(name, description) &sysfs_##name, @@ -710,17 +702,23 @@ static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf) static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) { struct bch_fs *c = ca->fs; - struct bch_dev_usage stats = bch2_dev_usage_read(ca); + struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); return scnprintf(buf, PAGE_SIZE, "free_inc: %zu/%zu\n" "free[RESERVE_BTREE]: %zu/%zu\n" "free[RESERVE_MOVINGGC]: %zu/%zu\n" "free[RESERVE_NONE]: %zu/%zu\n" - "alloc: %llu/%llu\n" - "meta: %llu/%llu\n" - "dirty: %llu/%llu\n" - "available: %llu/%llu\n" + "buckets:\n" + " capacity: %llu\n" + " alloc: %llu\n" + " meta: %llu\n" + " dirty: %llu\n" + " available: %llu\n" + "sectors:\n" + " meta: %llu\n" + " dirty: %llu\n" + " cached: %llu\n" "freelist_wait: %s\n" "open buckets: %u/%u (reserved %u)\n" "open_buckets_wait: %s\n", @@ -728,10 +726,14 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size, fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, - stats.buckets_alloc, ca->mi.nbuckets - ca->mi.first_bucket, - stats.buckets[S_META], ca->mi.nbuckets - ca->mi.first_bucket, - stats.buckets[S_DIRTY], ca->mi.nbuckets - ca->mi.first_bucket, - __dev_buckets_available(ca, stats), ca->mi.nbuckets - ca->mi.first_bucket, + ca->mi.nbuckets - ca->mi.first_bucket, + stats.buckets_alloc, + stats.buckets[S_META], + stats.buckets[S_DIRTY], + __dev_buckets_available(ca, stats), + stats.sectors[S_META], + stats.sectors[S_DIRTY], + stats.sectors_cached, c->freelist_wait.list.first ? "waiting" : "empty", c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE, c->open_buckets_wait.list.first ? 
"waiting" : "empty"); @@ -769,7 +771,7 @@ SHOW(bch2_dev) { struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); struct bch_fs *c = ca->fs; - struct bch_dev_usage stats = bch2_dev_usage_read(ca); + struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); char *out = buf, *end = buf + PAGE_SIZE; sysfs_printf(uuid, "%pU\n", ca->uuid.b); @@ -788,8 +790,8 @@ SHOW(bch2_dev) sysfs_print(cached_buckets, stats.buckets_cached); sysfs_print(meta_buckets, stats.buckets[S_META]); sysfs_print(alloc_buckets, stats.buckets_alloc); - sysfs_print(available_buckets, dev_buckets_available(ca)); - sysfs_print(free_buckets, dev_buckets_free(ca)); + sysfs_print(available_buckets, __dev_buckets_available(ca, stats)); + sysfs_print(free_buckets, __dev_buckets_free(ca, stats)); if (attr == &sysfs_has_data) { out += bch2_scnprint_flag_list(out, end - out, @@ -799,7 +801,7 @@ SHOW(bch2_dev) return out - buf; } - sysfs_pd_controller_show(copy_gc, &ca->moving_gc_pd); + sysfs_pd_controller_show(copy_gc, &ca->copygc_pd); if (attr == &sysfs_cache_replacement_policy) { out += bch2_scnprint_string_list(out, end - out, @@ -843,7 +845,7 @@ STORE(bch2_dev) struct bch_fs *c = ca->fs; struct bch_member *mi; - sysfs_pd_controller_store(copy_gc, &ca->moving_gc_pd); + sysfs_pd_controller_store(copy_gc, &ca->copygc_pd); if (attr == &sysfs_discard) { bool v = strtoul_or_return(buf); @@ -899,6 +901,9 @@ STORE(bch2_dev) bch2_tiering_start(c); } + if (attr == &sysfs_wake_allocator) + bch2_wake_allocator(ca); + return size; } SYSFS_OPS(bch2_dev); @@ -942,6 +947,7 @@ struct attribute *bch2_dev_files[] = { /* debug: */ &sysfs_alloc_debug, + &sysfs_wake_allocator, sysfs_pd_controller_files(copy_gc), NULL diff --git a/libbcachefs/tier.c b/libbcachefs/tier.c index cbfcfcc..2e29f74 100644 --- a/libbcachefs/tier.c +++ b/libbcachefs/tier.c @@ -15,105 +15,23 @@ #include #include -struct tiering_state { - struct bch_tier *tier; - unsigned sectors; - unsigned stripe_size; - unsigned dev_idx; - struct bch_dev *ca; -}; - -static bool tiering_pred(struct bch_fs *c, - struct bch_tier *tier, - struct bkey_s_c k) +static bool tiering_pred(void *arg, struct bkey_s_c_extent e) { - if (bkey_extent_is_data(k.k)) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr; - unsigned replicas = 0; - - /* Make sure we have room to add a new pointer: */ - if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX > - BKEY_EXTENT_VAL_U64s_MAX) - return false; - - extent_for_each_ptr(e, ptr) - if (c->devs[ptr->dev]->mi.tier >= tier->idx) - replicas++; - - return replicas < c->opts.data_replicas; - } - - return false; -} - -static int issue_tiering_move(struct bch_fs *c, - struct bch_tier *tier, - struct moving_context *ctxt, - struct bkey_s_c k) -{ - int ret; - - ret = bch2_data_move(c, ctxt, &tier->devs, k, NULL); - if (!ret) - trace_tiering_copy(k.k); - else - trace_tiering_alloc_fail(c, k.k->size); - - return ret; -} - -/** - * tiering_next_cache - issue a move to write an extent to the next cache - * device in round robin order - */ -static s64 read_tiering(struct bch_fs *c, struct bch_tier *tier) -{ - struct moving_context ctxt; - struct btree_iter iter; - struct bkey_s_c k; - unsigned nr_devices = dev_mask_nr(&tier->devs); - int ret; - - if (!nr_devices) - return 0; - - trace_tiering_start(c); - - bch2_move_ctxt_init(&ctxt, &tier->pd.rate, - nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE); - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, - BTREE_ITER_PREFETCH); - - while (!kthread_should_stop() && - 
!bch2_move_ctxt_wait(&ctxt) && - (k = bch2_btree_iter_peek(&iter)).k && - !btree_iter_err(k)) { - if (!tiering_pred(c, tier, k)) - goto next; - - ret = issue_tiering_move(c, tier, &ctxt, k); - if (ret) { - bch2_btree_iter_unlock(&iter); - - /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(&ctxt); - continue; - } -next: - bch2_btree_iter_advance_pos(&iter); - //bch2_btree_iter_cond_resched(&iter); + struct bch_tier *tier = arg; + struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]); + const struct bch_extent_ptr *ptr; + unsigned replicas = 0; - /* unlock before calling moving_context_wait() */ - bch2_btree_iter_unlock(&iter); - cond_resched(); - } + /* Make sure we have room to add a new pointer: */ + if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX > + BKEY_EXTENT_VAL_U64s_MAX) + return false; - bch2_btree_iter_unlock(&iter); - bch2_move_ctxt_exit(&ctxt); - trace_tiering_end(c, ctxt.sectors_moved, ctxt.keys_moved); + extent_for_each_ptr(e, ptr) + if (c->devs[ptr->dev]->mi.tier >= tier->idx) + replicas++; - return ctxt.sectors_moved; + return replicas < c->opts.data_replicas; } static int bch2_tiering_thread(void *arg) @@ -122,15 +40,15 @@ static int bch2_tiering_thread(void *arg) struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]); struct io_clock *clock = &c->io_clock[WRITE]; struct bch_dev *ca; - u64 tier_capacity, available_sectors; + u64 tier_capacity, available_sectors, keys_moved, sectors_moved; unsigned long last; - unsigned i; + unsigned i, nr_devices; set_freezable(); while (!kthread_should_stop()) { if (kthread_wait_freezable(c->tiering_enabled && - dev_mask_nr(&tier->devs))) + (nr_devices = dev_mask_nr(&tier->devs)))) break; while (1) { @@ -151,7 +69,7 @@ static int bch2_tiering_thread(void *arg) ca->mi.first_bucket); available_sectors += bucket_to_sector(ca, - dev_buckets_available(ca)); + dev_buckets_available(c, ca)); } rcu_read_unlock(); } @@ -167,7 +85,15 @@ static int bch2_tiering_thread(void *arg) return 0; } - read_tiering(c, tier); + bch2_move_data(c, &tier->pd.rate, + SECTORS_IN_FLIGHT_PER_DEVICE * nr_devices, + &tier->devs, + writepoint_ptr(&tier->wp), + 0, + -1, + tiering_pred, tier, + &keys_moved, + §ors_moved); } return 0; diff --git a/libbcachefs/util.c b/libbcachefs/util.c index 2eb8ca7..fa85375 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -291,13 +291,15 @@ void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done) int bch2_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *d) { + bool kthread = (current->flags & PF_KTHREAD) != 0; + while (1) { u64 delay = bch2_ratelimit_delay(d); if (delay) set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) + if (kthread && kthread_should_stop()) return 1; if (!delay) @@ -434,8 +436,11 @@ size_t bch2_rand_range(size_t max) { size_t rand; + if (!max) + return 0; + do { - get_random_bytes(&rand, sizeof(rand)); + rand = get_random_long(); rand &= roundup_pow_of_two(max) - 1; } while (rand >= max); @@ -642,3 +647,129 @@ void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) return vpmalloc(size, gfp_mask); } + +#if 0 +void eytzinger1_test(void) +{ + unsigned inorder, eytz, size; + + pr_info("1 based eytzinger test:"); + + for (size = 2; + size < 65536; + size++) { + unsigned extra = eytzinger1_extra(size); + + if (!(size % 4096)) + pr_info("tree size %u", size); + + BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size)); + BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size)); + + 
BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0); + BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0); + + inorder = 1; + eytzinger1_for_each(eytz, size) { + BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz); + BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder); + BUG_ON(eytz != eytzinger1_last(size) && + eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz); + + inorder++; + } + } +} + +void eytzinger0_test(void) +{ + + unsigned inorder, eytz, size; + + pr_info("0 based eytzinger test:"); + + for (size = 1; + size < 65536; + size++) { + unsigned extra = eytzinger0_extra(size); + + if (!(size % 4096)) + pr_info("tree size %u", size); + + BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size)); + BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size)); + + BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1); + BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1); + + inorder = 0; + eytzinger0_for_each(eytz, size) { + BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz); + BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder); + BUG_ON(eytz != eytzinger0_last(size) && + eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz); + + inorder++; + } + } +} + +static inline int cmp_u16(const void *_l, const void *_r, size_t size) +{ + const u16 *l = _l, *r = _r; + + return (*l > *r) - (*r - *l); +} + +static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) +{ + int i, c1 = -1, c2 = -1; + ssize_t r; + + r = eytzinger0_find_le(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); + if (r >= 0) + c1 = test_array[r]; + + for (i = 0; i < nr; i++) + if (test_array[i] <= search && test_array[i] > c2) + c2 = test_array[i]; + + if (c1 != c2) { + eytzinger0_for_each(i, nr) + pr_info("[%3u] = %12u", i, test_array[i]); + pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i", + i, r, c1, c2); + } +} + +void eytzinger0_find_test(void) +{ + unsigned i, nr, allocated = 1 << 12; + u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL); + + for (nr = 1; nr < allocated; nr++) { + pr_info("testing %u elems", nr); + + get_random_bytes(test_array, nr * sizeof(test_array[0])); + eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); + + /* verify array is sorted correctly: */ + eytzinger0_for_each(i, nr) + BUG_ON(i != eytzinger0_last(nr) && + test_array[i] > test_array[eytzinger0_next(i, nr)]); + + for (i = 0; i < U16_MAX; i += 1 << 12) + eytzinger0_find_test_val(test_array, nr, i); + + for (i = 0; i < nr; i++) { + eytzinger0_find_test_val(test_array, nr, test_array[i] - 1); + eytzinger0_find_test_val(test_array, nr, test_array[i]); + eytzinger0_find_test_val(test_array, nr, test_array[i] + 1); + } + } + + kfree(test_array); +} +#endif diff --git a/libbcachefs/util.h b/libbcachefs/util.h index b91b2dc..a251bf9 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -789,4 +789,28 @@ void sort_cmp_size(void *base, size_t num, size_t size, int (*cmp_func)(const void *, const void *, size_t), void (*swap_func)(void *, void *, size_t)); +/* just the memmove, doesn't update @_nr */ +#define __array_insert_item(_array, _nr, _pos) \ + memmove(&(_array)[(_pos) + 1], \ + &(_array)[(_pos)], \ + sizeof((_array)[0]) * ((_nr) - (_pos))) + +#define array_insert_item(_array, _nr, _pos, _new_item) \ +do { \ + __array_insert_item(_array, _nr, _pos); \ + (_nr)++; \ + (_array)[(_pos)] = (_new_item); \ +} while (0) + +#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ 
+do { \ + (_nr) -= (_nr_to_remove); \ + memmove(&(_array)[(_pos)], \ + &(_array)[(_pos) + (_nr_to_remove)], \ + sizeof((_array)[0]) * ((_nr) - (_pos))); \ +} while (0) + +#define array_remove_item(_array, _nr, _pos) \ + array_remove_items(_array, _nr, _pos, 1) + #endif /* _BCACHEFS_UTIL_H */ -- 2.39.2
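The common thread in the migrate.c, movinggc.c and tier.c hunks is that three hand-rolled move loops are replaced by one walker, bch2_move_data(), driven by a move_pred_fn callback (migrate_pred, copygc_pred, tiering_pred). The user-space sketch below only illustrates that callback split; walk_extents(), struct demo_extent and evacuate_pred() are invented stand-ins for the real iterator and predicates, not bcachefs code.

#include <stdbool.h>
#include <stdio.h>

/* invented stand-in for a bkey/extent; not a bcachefs type */
struct demo_extent {
	unsigned dev;		/* device the (single) pointer lives on */
	unsigned sectors;
};

/* same shape as move_pred_fn in move.h: bool (*)(void *, extent) */
typedef bool (*demo_pred_fn)(void *arg, const struct demo_extent *e);

/*
 * invented walker playing the role of bch2_move_data(): it owns the loop
 * and only asks the caller's predicate whether a given extent should move
 */
static unsigned walk_extents(const struct demo_extent *extents, unsigned nr,
			     demo_pred_fn pred, void *arg)
{
	unsigned sectors_moved = 0;
	unsigned i;

	for (i = 0; i < nr; i++)
		if (pred(arg, &extents[i]))
			sectors_moved += extents[i].sectors; /* would be queued for rewrite */

	return sectors_moved;
}

/* analogous to migrate_pred(): move everything on the device being evacuated */
static bool evacuate_pred(void *arg, const struct demo_extent *e)
{
	unsigned *dev_idx = arg;

	return e->dev == *dev_idx;
}

int main(void)
{
	struct demo_extent extents[] = {
		{ .dev = 0, .sectors = 8  },
		{ .dev = 1, .sectors = 16 },
		{ .dev = 0, .sectors = 32 },
	};
	unsigned evacuate_dev = 0;

	printf("would move %u sectors off dev %u\n",
	       walk_extents(extents, 3, evacuate_pred, &evacuate_dev),
	       evacuate_dev);
	return 0;
}

The real bch2_move_data() additionally takes a rate limiter, a device mask, a write point and btree insert flags, but each caller's policy now lives entirely in its predicate.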
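The array_insert_item()/array_remove_item() helpers added to util.h (used via __array_insert_item() in the journal bucket resize path) are plain memmove wrappers. A stand-alone user-space check, with the macro bodies taken from the hunk and an invented main() plus test array as the driver:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* from the util.h hunk: just the memmove, doesn't update @_nr */
#define __array_insert_item(_array, _nr, _pos)				\
	memmove(&(_array)[(_pos) + 1],					\
		&(_array)[(_pos)],					\
		sizeof((_array)[0]) * ((_nr) - (_pos)))

#define array_insert_item(_array, _nr, _pos, _new_item)			\
do {									\
	__array_insert_item(_array, _nr, _pos);				\
	(_nr)++;							\
	(_array)[(_pos)] = (_new_item);					\
} while (0)

#define array_remove_items(_array, _nr, _pos, _nr_to_remove)		\
do {									\
	(_nr) -= (_nr_to_remove);					\
	memmove(&(_array)[(_pos)],					\
		&(_array)[(_pos) + (_nr_to_remove)],			\
		sizeof((_array)[0]) * ((_nr) - (_pos)));		\
} while (0)

#define array_remove_item(_array, _nr, _pos)				\
	array_remove_items(_array, _nr, _pos, 1)

int main(void)
{
	uint64_t buckets[8] = { 10, 20, 30 };	/* invented test data */
	unsigned nr = 3, i;

	array_insert_item(buckets, nr, 1, 15);	/* 10 15 20 30 */
	array_remove_item(buckets, nr, 2);	/* 10 15 30 */

	for (i = 0; i < nr; i++)
		printf("%llu ", (unsigned long long) buckets[i]);
	printf("\n");
	return 0;
}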
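The reworked bch2_rand_range() in util.c draws an unbiased value in [0, max) by masking a random word down to the next power of two and retrying out-of-range draws, and now tolerates max == 0. A rough user-space equivalent under assumed names (rand_range(), next_pow2(), and libc random() standing in for get_random_long() and roundup_pow_of_two()):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* smallest power of two >= n, for n >= 1 (stand-in for roundup_pow_of_two) */
static unsigned long next_pow2(unsigned long n)
{
	unsigned long p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

/*
 * same mask-and-reject scheme as bch2_rand_range(): the mask restricts the
 * draw to [0, next_pow2(max)), and out-of-range values are retried rather
 * than folded with %, which would bias the result when max isn't a power
 * of two
 */
static unsigned long rand_range(unsigned long max)
{
	unsigned long r;

	if (!max)
		return 0;

	do {
		r = (unsigned long) random();
		r &= next_pow2(max) - 1;
	} while (r >= max);

	return r;
}

int main(void)
{
	unsigned i;

	srandom((unsigned) time(NULL));
	for (i = 0; i < 5; i++)
		printf("%lu\n", rand_range(1000));
	return 0;
}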