Update bcachefs sources to f7670cba39 bcachefs: Fix for building in userspace

diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index ed33f9bf528a9628c9f1d3c5f1b0e2ed47f79a1d..2980416871d8791360cc3c02455d27435b00116c 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
 #include "checksum.h"
 #include "debug.h"
 #include "dirent.h"
+#include "disk_groups.h"
 #include "error.h"
 #include "extents.h"
 #include "inode.h"
 #include "journal.h"
+#include "replicas.h"
 #include "super.h"
 #include "super-io.h"
 #include "util.h"
 
 #include <trace/events/bcachefs.h>
 
-static enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *,
-                                          struct bkey_i *, struct bkey_i *);
-
-static void sort_key_next(struct btree_node_iter_large *iter,
-                         struct btree *b,
-                         struct btree_node_iter_set *i)
+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
 {
-       i->k += __btree_node_offset_to_key(b, i->k)->u64s;
+       struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
+       const struct bch_extent_ptr *ptr;
+       unsigned nr_ptrs = 0;
+
+       bkey_for_each_ptr(p, ptr)
+               nr_ptrs++;
 
-       if (i->k == i->end)
-               *i = iter->data[--iter->used];
+       return nr_ptrs;
 }
 
-/*
- * Returns true if l > r - unless l == r, in which case returns true if l is
- * older than r.
- *
- * Necessary for btree_sort_fixup() - if there are multiple keys that compare
- * equal in different sets, we have to process them newest to oldest.
- */
-#define key_sort_cmp(h, l, r)                                          \
-({                                                                     \
-       bkey_cmp_packed(b,                                              \
-                       __btree_node_offset_to_key(b, (l).k),           \
-                       __btree_node_offset_to_key(b, (r).k))           \
-                                                                       \
-       ?: (l).k - (r).k;                                               \
-})
-
-static inline bool should_drop_next_key(struct btree_node_iter_large *iter,
-                                       struct btree *b)
+unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c k)
 {
-       struct btree_node_iter_set *l = iter->data, *r = iter->data + 1;
-       struct bkey_packed *k = __btree_node_offset_to_key(b, l->k);
-
-       if (bkey_whiteout(k))
-               return true;
+       unsigned nr_ptrs = 0;
 
-       if (iter->used < 2)
-               return false;
+       switch (k.k->type) {
+       case KEY_TYPE_btree_ptr:
+       case KEY_TYPE_extent: {
+               struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
+               const struct bch_extent_ptr *ptr;
 
-       if (iter->used > 2 &&
-           key_sort_cmp(iter, r[0], r[1]) >= 0)
-               r++;
+               bkey_for_each_ptr(p, ptr)
+                       nr_ptrs += !ptr->cached;
+               BUG_ON(!nr_ptrs);
+               break;
+       }
+       case KEY_TYPE_reservation:
+               nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas;
+               break;
+       }
 
-       /*
-        * key_sort_cmp() ensures that when keys compare equal the older key
-        * comes first; so if l->k compares equal to r->k then l->k is older and
-        * should be dropped.
-        */
-       return !bkey_cmp_packed(b,
-                               __btree_node_offset_to_key(b, l->k),
-                               __btree_node_offset_to_key(b, r->k));
+       return nr_ptrs;
 }
 
-struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst,
-                                       struct btree *b,
-                                       struct btree_node_iter_large *iter)
+static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
+                                          struct extent_ptr_decoded p)
 {
-       struct bkey_packed *out = dst->start;
-       struct btree_nr_keys nr;
+       unsigned i, durability = 0;
+       struct bch_dev *ca;
 
-       memset(&nr, 0, sizeof(nr));
+       if (p.ptr.cached)
+               return 0;
 
-       heap_resort(iter, key_sort_cmp);
+       ca = bch_dev_bkey_exists(c, p.ptr.dev);
 
-       while (!bch2_btree_node_iter_large_end(iter)) {
-               if (!should_drop_next_key(iter, b)) {
-                       struct bkey_packed *k =
-                               __btree_node_offset_to_key(b, iter->data->k);
+       if (ca->mi.state != BCH_MEMBER_STATE_FAILED)
+               durability = max_t(unsigned, durability, ca->mi.durability);
 
-                       bkey_copy(out, k);
-                       btree_keys_account_key_add(&nr, 0, out);
-                       out = bkey_next(out);
-               }
+       for (i = 0; i < p.ec_nr; i++) {
+               struct stripe *s =
+                       genradix_ptr(&c->stripes[0], p.idx);
+
+               if (WARN_ON(!s))
+                       continue;
 
-               sort_key_next(iter, b, iter->data);
-               heap_sift_down(iter, 0, key_sort_cmp);
+               durability = max_t(unsigned, durability, s->nr_redundant);
        }
 
-       dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
-       return nr;
+       return durability;
 }
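
To make the rule above concrete: a cached pointer contributes no durability,
a pointer on a live device contributes that device's durability, and an
erasure coded pointer is protected by its stripe's redundancy if that is
higher. A minimal standalone sketch, with hypothetical model_* structs
standing in for bch_dev and stripe (which the real code fetches via
bch_dev_bkey_exists() and genradix_ptr()):

#include <stdio.h>

struct model_dev    { int failed; unsigned durability; };
struct model_stripe { unsigned nr_redundant; };

static unsigned model_ptr_durability(int cached, const struct model_dev *ca,
				     const struct model_stripe *ec,
				     unsigned ec_nr)
{
	unsigned i, durability = 0;

	if (cached)			/* cached copies add no durability */
		return 0;

	if (!ca->failed)
		durability = ca->durability;

	/* an erasure coded pointer survives as long as its stripe does: */
	for (i = 0; i < ec_nr; i++)
		if (ec[i].nr_redundant > durability)
			durability = ec[i].nr_redundant;

	return durability;
}

int main(void)
{
	struct model_dev    hdd = { .failed = 0, .durability = 1 };
	struct model_stripe s   = { .nr_redundant = 2 };

	/* plain replica: 1; same replica inside a 2-redundancy stripe: 2 */
	printf("%u %u\n",
	       model_ptr_durability(0, &hdd, NULL, 0),
	       model_ptr_durability(0, &hdd, &s, 1));
	return 0;
}
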
 
-/* Common among btree and extent ptrs */
+unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const union bch_extent_entry *entry;
+       struct extent_ptr_decoded p;
+       unsigned durability = 0;
+
+       bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+               durability += bch2_extent_ptr_durability(c, p);
 
-const struct bch_extent_ptr *
-bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev)
+       return durability;
+}
+
+static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f,
+                                                  unsigned dev)
 {
-       const struct bch_extent_ptr *ptr;
+       struct bch_dev_io_failures *i;
 
-       extent_for_each_ptr(e, ptr)
-               if (ptr->dev == dev)
-                       return ptr;
+       for (i = f->devs; i < f->devs + f->nr; i++)
+               if (i->dev == dev)
+                       return i;
 
        return NULL;
 }
 
-bool bch2_extent_drop_device(struct bkey_s_extent e, unsigned dev)
+void bch2_mark_io_failure(struct bch_io_failures *failed,
+                         struct extent_ptr_decoded *p)
 {
-       struct bch_extent_ptr *ptr;
-       bool dropped = false;
+       struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev);
 
-       extent_for_each_ptr_backwards(e, ptr)
-               if (ptr->dev == dev) {
-                       __bch2_extent_drop_ptr(e, ptr);
-                       dropped = true;
-               }
+       if (!f) {
+               BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs));
 
-       if (dropped)
-               bch2_extent_drop_redundant_crcs(e);
-       return dropped;
+               f = &failed->devs[failed->nr++];
+               f->dev          = p->ptr.dev;
+               f->idx          = p->idx;
+               f->nr_failed    = 1;
+               f->nr_retries   = 0;
+       } else if (p->idx != f->idx) {
+               f->idx          = p->idx;
+               f->nr_failed    = 1;
+               f->nr_retries   = 0;
+       } else {
+               f->nr_failed++;
+       }
 }
 
-const struct bch_extent_ptr *
-bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group)
+/*
+ * Returns true if p1 is better than p2:
+ */
+static inline bool ptr_better(struct bch_fs *c,
+                             const struct extent_ptr_decoded p1,
+                             const struct extent_ptr_decoded p2)
 {
-       const struct bch_extent_ptr *ptr;
+       if (likely(!p1.idx && !p2.idx)) {
+               struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev);
+               struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev);
 
-       extent_for_each_ptr(e, ptr) {
-               struct bch_dev *ca = c->devs[ptr->dev];
+               u64 l1 = atomic64_read(&dev1->cur_latency[READ]);
+               u64 l2 = atomic64_read(&dev2->cur_latency[READ]);
 
-               if (ca->mi.group &&
-                   ca->mi.group - 1 == group)
-                       return ptr;
+               /* Pick at random, biased in favor of the faster device: */
+
+               return bch2_rand_range(l1 + l2) > l1;
        }
 
-       return NULL;
+       if (force_reconstruct_read(c))
+               return p1.idx > p2.idx;
+
+       return p1.idx < p2.idx;
 }
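
The coin flip above picks p1 with probability l2 / (l1 + l2): the device
with the lower current read latency is chosen proportionally more often,
without ever starving the slower one. A standalone sketch (rand() stands in
for bch2_rand_range(); model_pick_p1() is a hypothetical helper):

#include <stdio.h>
#include <stdlib.h>

static int model_pick_p1(unsigned l1, unsigned l2)
{
	if (!(l1 + l2))		/* no latency data yet: just take p1 */
		return 1;
	return (unsigned) rand() % (l1 + l2) > l1;
}

int main(void)
{
	unsigned wins = 0, i;

	srand(42);
	for (i = 0; i < 1000000; i++)
		wins += model_pick_p1(100, 300); /* p1's device is 3x faster */

	printf("p1 picked %u of 1000000 (~75%% expected)\n", wins);
	return 0;
}
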
 
-const struct bch_extent_ptr *
-bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target)
+/*
+ * Picks a non-stale pointer to read from: the IO failures recorded in
+ * @failed (which may be NULL) determine, per pointer, whether to retry the
+ * direct read or escalate to reconstructing from erasure coding. Returns 1
+ * if a pointer was picked, -EIO if there were dirty pointers but none were
+ * readable, and 0 otherwise.
+ */
+int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
+                              struct bch_io_failures *failed,
+                              struct extent_ptr_decoded *pick)
 {
-       const struct bch_extent_ptr *ptr;
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const union bch_extent_entry *entry;
+       struct extent_ptr_decoded p;
+       struct bch_dev_io_failures *f;
+       struct bch_dev *ca;
+       int ret = 0;
 
-       extent_for_each_ptr(e, ptr)
-               if (dev_in_target(c->devs[ptr->dev], target))
-                       return ptr;
+       if (k.k->type == KEY_TYPE_error)
+               return -EIO;
 
-       return NULL;
-}
+       bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+               ca = bch_dev_bkey_exists(c, p.ptr.dev);
 
-unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent e)
-{
-       const struct bch_extent_ptr *ptr;
-       unsigned nr_ptrs = 0;
+               /*
+                * If there are any dirty pointers, then failing to read
+                * is an error:
+                */
+               if (!ret && !p.ptr.cached)
+                       ret = -EIO;
 
-       extent_for_each_ptr(e, ptr)
-               nr_ptrs++;
+               if (p.ptr.cached && ptr_stale(ca, &p.ptr))
+                       continue;
 
-       return nr_ptrs;
+               f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL;
+               if (f)
+                       p.idx = f->nr_failed < f->nr_retries
+                               ? f->idx
+                               : f->idx + 1;
+
+               if (!p.idx &&
+                   !bch2_dev_is_readable(ca))
+                       p.idx++;
+
+               if (force_reconstruct_read(c) &&
+                   !p.idx && p.ec_nr)
+                       p.idx++;
+
+               if (p.idx >= p.ec_nr + 1)
+                       continue;
+
+               if (ret > 0 && !ptr_better(c, p, *pick))
+                       continue;
+
+               *pick = p;
+               ret = 1;
+       }
+
+       return ret;
 }
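
The idx field encodes the read strategy for a pointer: 0 means read the
replica directly, 1..ec_nr mean reconstruct via one of the pointer's
erasure coding stripes, and once idx walks past ec_nr the pointer is
skipped entirely. A simplified standalone model of that ladder (it drops
the nr_failed/nr_retries budget and the latency coin flip, and all model_*
names are hypothetical):

#include <stdio.h>

struct model_ptr {
	unsigned ec_nr;		/* stripes that can reconstruct this ptr */
	unsigned idx;		/* 0 = direct read, 1..ec_nr = reconstruct */
	int	 stale;
	int	 failed;	/* a failure was recorded for this device */
};

static int model_pick(struct model_ptr *ptrs, int nr)
{
	int i, best = -1;

	for (i = 0; i < nr; i++) {
		struct model_ptr *p = &ptrs[i];

		if (p->stale)
			continue;

		if (p->failed)		/* escalate to the next strategy */
			p->idx++;

		if (p->idx >= p->ec_nr + 1)
			continue;	/* nothing left to try */

		/* lower idx (closer to a direct read) wins: */
		if (best < 0 || p->idx < ptrs[best].idx)
			best = i;
	}

	return best;
}

int main(void)
{
	struct model_ptr ptrs[] = {
		{ .ec_nr = 1, .failed = 1 },	/* direct read failed */
		{ .ec_nr = 0 },
	};

	/* ptr 0 would need reconstruction, so the clean ptr 1 is picked: */
	printf("picked ptr %d\n", model_pick(ptrs, 2));
	return 0;
}
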
 
-unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
+void bch2_bkey_append_ptr(struct bkey_i *k,
+                         struct bch_extent_ptr ptr)
 {
-       struct bkey_s_c_extent e;
-       const struct bch_extent_ptr *ptr;
-       unsigned nr_ptrs = 0;
+       EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev));
 
-       switch (k.k->type) {
-       case BCH_EXTENT:
-       case BCH_EXTENT_CACHED:
-               e = bkey_s_c_to_extent(k);
+       switch (k->k.type) {
+       case KEY_TYPE_btree_ptr:
+       case KEY_TYPE_extent:
+               EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
 
-               extent_for_each_ptr(e, ptr)
-                       nr_ptrs += !ptr->cached;
-               break;
+               ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
 
-       case BCH_RESERVATION:
-               nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas;
+               memcpy((void *) &k->v + bkey_val_bytes(&k->k),
+                      &ptr,
+                      sizeof(ptr));
+               k->u64s++;
                break;
+       default:
+               BUG();
        }
+}
 
-       return nr_ptrs;
+void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
+{
+       struct bch_extent_ptr *ptr;
+
+       bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
 }
 
-unsigned bch2_extent_ptr_durability(struct bch_fs *c,
-                                   const struct bch_extent_ptr *ptr)
+/* extent specific utility code */
+
+const struct bch_extent_ptr *
+bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev)
 {
-       struct bch_dev *ca;
+       const struct bch_extent_ptr *ptr;
 
-       if (ptr->cached)
-               return 0;
+       extent_for_each_ptr(e, ptr)
+               if (ptr->dev == dev)
+                       return ptr;
 
-       ca = bch_dev_bkey_exists(c, ptr->dev);
+       return NULL;
+}
 
-       if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
-               return 0;
+const struct bch_extent_ptr *
+bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group)
+{
+       const struct bch_extent_ptr *ptr;
+
+       extent_for_each_ptr(e, ptr) {
+               struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+               if (ca->mi.group &&
+                   ca->mi.group - 1 == group)
+                       return ptr;
+       }
 
-       return ca->mi.durability;
+       return NULL;
 }
 
-unsigned bch2_extent_durability(struct bch_fs *c, struct bkey_s_c_extent e)
+const struct bch_extent_ptr *
+bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target)
 {
        const struct bch_extent_ptr *ptr;
-       unsigned durability = 0;
 
        extent_for_each_ptr(e, ptr)
-               durability += bch2_extent_ptr_durability(c, ptr);
+               if (bch2_dev_in_target(c, ptr->dev, target) &&
+                   (!ptr->cached ||
+                    !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
+                       return ptr;
 
-       return durability;
+       return NULL;
 }
 
 unsigned bch2_extent_is_compressed(struct bkey_s_c k)
 {
-       struct bkey_s_c_extent e;
-       const struct bch_extent_ptr *ptr;
-       struct bch_extent_crc_unpacked crc;
        unsigned ret = 0;
 
        switch (k.k->type) {
-       case BCH_EXTENT:
-       case BCH_EXTENT_CACHED:
-               e = bkey_s_c_to_extent(k);
-
-               extent_for_each_ptr_crc(e, ptr, crc)
-                       if (!ptr->cached &&
-                           crc.compression_type != BCH_COMPRESSION_NONE &&
-                           crc.compressed_size < crc.live_size)
-                               ret = max_t(unsigned, ret, crc.compressed_size);
+       case KEY_TYPE_extent: {
+               struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+               const union bch_extent_entry *entry;
+               struct extent_ptr_decoded p;
+
+               extent_for_each_ptr_decode(e, p, entry)
+                       if (!p.ptr.cached &&
+                           p.crc.compression_type != BCH_COMPRESSION_NONE)
+                               ret += p.crc.compressed_size;
+       }
        }
 
        return ret;
@@ -253,34 +316,67 @@ unsigned bch2_extent_is_compressed(struct bkey_s_c k)
 bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e,
                             struct bch_extent_ptr m, u64 offset)
 {
-       const struct bch_extent_ptr *ptr;
-       struct bch_extent_crc_unpacked crc;
+       const union bch_extent_entry *entry;
+       struct extent_ptr_decoded p;
 
-       extent_for_each_ptr_crc(e, ptr, crc)
-               if (ptr->dev    == m.dev &&
-                   ptr->gen    == m.gen &&
-                   (s64) ptr->offset + crc.offset - bkey_start_offset(e.k) ==
+       extent_for_each_ptr_decode(e, p, entry)
+               if (p.ptr.dev   == m.dev &&
+                   p.ptr.gen   == m.gen &&
+                   (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(e.k) ==
                    (s64) m.offset  - offset)
-                       return ptr;
+                       return true;
 
-       return NULL;
+       return false;
 }
 
-/* Doesn't cleanup redundant crcs */
-void __bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
+static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
+                                         union bch_extent_entry *entry)
 {
-       EBUG_ON(ptr < &e.v->start->ptr ||
-               ptr >= &extent_entry_last(e)->ptr);
-       EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
-       memmove_u64s_down(ptr, ptr + 1,
-                         (u64 *) extent_entry_last(e) - (u64 *) (ptr + 1));
-       e.k->u64s -= sizeof(*ptr) / sizeof(u64);
+       union bch_extent_entry *i = ptrs.start;
+
+       if (i == entry)
+               return NULL;
+
+       while (extent_entry_next(i) != entry)
+               i = extent_entry_next(i);
+       return i;
 }
 
-void bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
+                                          struct bch_extent_ptr *ptr)
 {
-       __bch2_extent_drop_ptr(e, ptr);
-       bch2_extent_drop_redundant_crcs(e);
+       struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+       union bch_extent_entry *dst, *src, *prev;
+       bool drop_crc = true;
+
+       EBUG_ON(ptr < &ptrs.start->ptr ||
+               ptr >= &ptrs.end->ptr);
+       EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
+
+       src = extent_entry_next(to_entry(ptr));
+       if (src != ptrs.end &&
+           !extent_entry_is_crc(src))
+               drop_crc = false;
+
+       dst = to_entry(ptr);
+       while ((prev = extent_entry_prev(ptrs, dst))) {
+               if (extent_entry_is_ptr(prev))
+                       break;
+
+               if (extent_entry_is_crc(prev)) {
+                       if (drop_crc)
+                               dst = prev;
+                       break;
+               }
+
+               dst = prev;
+       }
+
+       memmove_u64s_down(dst, src,
+                         (u64 *) ptrs.end - (u64 *) src);
+       k.k->u64s -= (u64 *) src - (u64 *) dst;
+
+       return dst;
 }
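
The deletion above is an overlapping copy toward lower addresses over the
key's value, widened to also swallow a crc entry left without users. A
standalone sketch of dropping the span [dst, src) out of a u64 array the
way memmove_u64s_down() does (drop_span() is a hypothetical helper):

#include <stdio.h>
#include <string.h>

static unsigned drop_span(unsigned long long *v, unsigned u64s,
			  unsigned dst, unsigned src)
{
	memmove(&v[dst], &v[src], (u64s - src) * sizeof(v[0]));
	return u64s - (src - dst);	/* like k.k->u64s -= src - dst */
}

int main(void)
{
	unsigned long long val[] = { 1, 2, 3, 4, 5 };
	unsigned i, n = drop_span(val, 5, 1, 3); /* drop entries 1 and 2 */

	for (i = 0; i < n; i++)
		printf("%llu ", val[i]);	/* prints: 1 4 5 */
	printf("\n");
	return 0;
}
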
 
 static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
@@ -322,173 +418,95 @@ bool bch2_extent_narrow_crcs(struct bkey_i_extent *e,
                             struct bch_extent_crc_unpacked n)
 {
        struct bch_extent_crc_unpacked u;
-       struct bch_extent_ptr *ptr;
+       struct extent_ptr_decoded p;
        union bch_extent_entry *i;
+       bool ret = false;
 
        /* Find a checksum entry that covers only live data: */
-       if (!n.csum_type)
+       if (!n.csum_type) {
                extent_for_each_crc(extent_i_to_s(e), u, i)
                        if (!u.compression_type &&
                            u.csum_type &&
                            u.live_size == u.uncompressed_size) {
                                n = u;
-                               break;
+                               goto found;
                        }
-
-       if (!bch2_can_narrow_extent_crcs(extent_i_to_s_c(e), n))
                return false;
-
+       }
+found:
        BUG_ON(n.compression_type);
        BUG_ON(n.offset);
        BUG_ON(n.live_size != e->k.size);
 
-       bch2_extent_crc_append(e, n);
 restart_narrow_pointers:
-       extent_for_each_ptr_crc(extent_i_to_s(e), ptr, u)
-               if (can_narrow_crc(u, n)) {
-                       ptr->offset += u.offset;
-                       extent_ptr_append(e, *ptr);
-                       __bch2_extent_drop_ptr(extent_i_to_s(e), ptr);
+       extent_for_each_ptr_decode(extent_i_to_s(e), p, i)
+               if (can_narrow_crc(p.crc, n)) {
+                       bch2_bkey_drop_ptr(extent_i_to_s(e).s, &i->ptr);
+                       p.ptr.offset += p.crc.offset;
+                       p.crc = n;
+                       bch2_extent_ptr_decoded_append(e, &p);
+                       ret = true;
                        goto restart_narrow_pointers;
                }
 
-       bch2_extent_drop_redundant_crcs(extent_i_to_s(e));
-       return true;
-}
-
-void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
-{
-       union bch_extent_entry *entry = e.v->start;
-       union bch_extent_crc *crc, *prev = NULL;
-       struct bch_extent_crc_unpacked u, prev_u;
-
-       while (entry != extent_entry_last(e)) {
-               union bch_extent_entry *next = extent_entry_next(entry);
-               size_t crc_u64s = extent_entry_u64s(entry);
-
-               if (!extent_entry_is_crc(entry))
-                       goto next;
-
-               crc = entry_to_crc(entry);
-               u = bch2_extent_crc_unpack(e.k, crc);
-
-               if (next == extent_entry_last(e)) {
-                       /* crc entry with no pointers after it: */
-                       goto drop;
-               }
-
-               if (extent_entry_is_crc(next)) {
-                       /* no pointers before next crc entry: */
-                       goto drop;
-               }
-
-               if (prev && !memcmp(&u, &prev_u, sizeof(u))) {
-                       /* identical to previous crc entry: */
-                       goto drop;
-               }
-
-               if (!prev &&
-                   !u.csum_type &&
-                   !u.compression_type) {
-                       /* null crc entry: */
-                       union bch_extent_entry *e2;
-
-                       extent_for_each_entry_from(e, e2, extent_entry_next(entry)) {
-                               if (!extent_entry_is_ptr(e2))
-                                       break;
-
-                               e2->ptr.offset += u.offset;
-                       }
-                       goto drop;
-               }
-
-               prev = crc;
-               prev_u = u;
-next:
-               entry = next;
-               continue;
-drop:
-               memmove_u64s_down(crc, next,
-                                 (u64 *) extent_entry_last(e) - (u64 *) next);
-               e.k->u64s -= crc_u64s;
-       }
-
-       EBUG_ON(bkey_val_u64s(e.k) && !bch2_extent_nr_ptrs(e.c));
-}
-
-static bool should_drop_ptr(const struct bch_fs *c,
-                           struct bkey_s_c_extent e,
-                           const struct bch_extent_ptr *ptr)
-{
-       return ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr);
-}
-
-static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
-{
-       struct bch_extent_ptr *ptr = &e.v->start->ptr;
-       bool dropped = false;
-
-       while ((ptr = extent_ptr_next(e, ptr)))
-               if (should_drop_ptr(c, e.c, ptr)) {
-                       __bch2_extent_drop_ptr(e, ptr);
-                       dropped = true;
-               } else
-                       ptr++;
-
-       if (dropped)
-               bch2_extent_drop_redundant_crcs(e);
+       return ret;
 }
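
Narrowing, in one picture: a wide crc entry checksums padding around the
live data, with crc.offset recording where the live data starts inside the
checksummed region. Once a checksum of exactly the live data exists, that
offset can be folded into the device pointer and the wide entry dropped. A
toy sketch with hypothetical flat model_* structs (the real entries are
packed and live inline in the key):

struct model_crc { unsigned offset, live_size, uncompressed_size; };
struct model_ptr { unsigned long long offset; };

static void model_narrow(struct model_ptr *ptr, struct model_crc *crc,
			 const struct model_crc *narrow)
{
	ptr->offset += crc->offset;	/* point straight at the live data */
	*crc = *narrow;			/* the narrow crc has offset == 0 */
}
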
 
-static bool bch2_ptr_normalize(struct bch_fs *c, struct btree *bk,
-                             struct bkey_s k)
+/* returns true if not equal */
+static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
+                                        struct bch_extent_crc_unpacked r)
 {
-       return bch2_extent_normalize(c, k);
+       return (l.csum_type             != r.csum_type ||
+               l.compression_type      != r.compression_type ||
+               l.compressed_size       != r.compressed_size ||
+               l.uncompressed_size     != r.uncompressed_size ||
+               l.offset                != r.offset ||
+               l.live_size             != r.live_size ||
+               l.nonce                 != r.nonce ||
+               bch2_crc_cmp(l.csum, r.csum));
 }
 
-static void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
+void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
 {
-       switch (k->type) {
-       case BCH_EXTENT:
-       case BCH_EXTENT_CACHED: {
-               union bch_extent_entry *entry;
-               u64 *d = (u64 *) bkeyp_val(f, k);
-               unsigned i;
+       union bch_extent_entry *entry;
+       u64 *d = (u64 *) bkeyp_val(f, k);
+       unsigned i;
 
-               for (i = 0; i < bkeyp_val_u64s(f, k); i++)
-                       d[i] = swab64(d[i]);
+       for (i = 0; i < bkeyp_val_u64s(f, k); i++)
+               d[i] = swab64(d[i]);
 
-               for (entry = (union bch_extent_entry *) d;
-                    entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k));
-                    entry = extent_entry_next(entry)) {
-                       switch (extent_entry_type(entry)) {
-                       case BCH_EXTENT_ENTRY_crc32:
-                               entry->crc32.csum = swab32(entry->crc32.csum);
-                               break;
-                       case BCH_EXTENT_ENTRY_crc64:
-                               entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
-                               entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
-                               break;
-                       case BCH_EXTENT_ENTRY_crc128:
-                               entry->crc128.csum.hi = (__force __le64)
-                                       swab64((__force u64) entry->crc128.csum.hi);
-                               entry->crc128.csum.lo = (__force __le64)
-                                       swab64((__force u64) entry->crc128.csum.lo);
-                               break;
-                       case BCH_EXTENT_ENTRY_ptr:
-                               break;
-                       }
+       for (entry = (union bch_extent_entry *) d;
+            entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k));
+            entry = extent_entry_next(entry)) {
+               switch (extent_entry_type(entry)) {
+               case BCH_EXTENT_ENTRY_ptr:
+                       break;
+               case BCH_EXTENT_ENTRY_crc32:
+                       entry->crc32.csum = swab32(entry->crc32.csum);
+                       break;
+               case BCH_EXTENT_ENTRY_crc64:
+                       entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
+                       entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
+                       break;
+               case BCH_EXTENT_ENTRY_crc128:
+                       entry->crc128.csum.hi = (__force __le64)
+                               swab64((__force u64) entry->crc128.csum.hi);
+                       entry->crc128.csum.lo = (__force __le64)
+                               swab64((__force u64) entry->crc128.csum.lo);
+                       break;
+               case BCH_EXTENT_ENTRY_stripe_ptr:
+                       break;
                }
-               break;
-       }
        }
 }
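
The two-step swab above matters because the entries are packed into u64s:
swab64() fixes each word wholesale, after which any field narrower than 64
bits must have its bytes re-swapped at its own width. A toy demonstration
on a little-endian host (output differs on big-endian; the union layout is
illustrative only):

#include <stdint.h>
#include <stdio.h>

union word {
	uint64_t u64;
	uint32_t u32[2];
};

int main(void)
{
	union word w = { .u32 = { 0x11223344, 0x55667788 } };

	w.u64 = __builtin_bswap64(w.u64);	/* step 1: whole word */

	w.u32[0] = __builtin_bswap32(w.u32[0]);	/* step 2: re-swap the */
	w.u32[1] = __builtin_bswap32(w.u32[1]);	/* 32-bit fields       */

	/* the fields traded lanes but kept their values: */
	printf("%08x %08x\n", (unsigned) w.u32[0], (unsigned) w.u32[1]);
	return 0;
}
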
 
 static const char *extent_ptr_invalid(const struct bch_fs *c,
-                                     struct bkey_s_c_extent e,
+                                     struct bkey_s_c k,
                                      const struct bch_extent_ptr *ptr,
                                      unsigned size_ondisk,
                                      bool metadata)
 {
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const struct bch_extent_ptr *ptr2;
        struct bch_dev *ca;
 
@@ -500,7 +518,7 @@ static const char *extent_ptr_invalid(const struct bch_fs *c,
        if (!ca)
                return "pointer to invalid device";
 
-       extent_for_each_ptr(e, ptr2)
+       bkey_for_each_ptr(ptrs, ptr2)
                if (ptr != ptr2 && ptr->dev == ptr2->dev)
                        return "multiple pointers to same device";
 
@@ -517,254 +535,148 @@ static const char *extent_ptr_invalid(const struct bch_fs *c,
        return NULL;
 }
 
-static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
-                               size_t size, struct bkey_s_c_extent e)
+static void bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+                             struct bkey_s_c k)
 {
-       char *out = buf, *end = buf + size;
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
        struct bch_extent_crc_unpacked crc;
        const struct bch_extent_ptr *ptr;
+       const struct bch_extent_stripe_ptr *ec;
        struct bch_dev *ca;
        bool first = true;
 
-#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
-
-       extent_for_each_entry(e, entry) {
+       bkey_extent_entry_for_each(ptrs, entry) {
                if (!first)
-                       p(" ");
+                       pr_buf(out, " ");
 
                switch (__extent_entry_type(entry)) {
-               case BCH_EXTENT_ENTRY_crc32:
-               case BCH_EXTENT_ENTRY_crc64:
-               case BCH_EXTENT_ENTRY_crc128:
-                       crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
-
-                       p("crc: c_size %u size %u offset %u nonce %u csum %u compress %u",
-                         crc.compressed_size,
-                         crc.uncompressed_size,
-                         crc.offset, crc.nonce,
-                         crc.csum_type,
-                         crc.compression_type);
-                       break;
                case BCH_EXTENT_ENTRY_ptr:
                        ptr = entry_to_ptr(entry);
                        ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
                                ? bch_dev_bkey_exists(c, ptr->dev)
                                : NULL;
 
-                       p("ptr: %u:%llu gen %u%s", ptr->dev,
-                         (u64) ptr->offset, ptr->gen,
-                         ca && ptr_stale(ca, ptr)
-                         ? " stale" : "");
+                       pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev,
+                              (u64) ptr->offset, ptr->gen,
+                              ptr->cached ? " cached" : "",
+                              ca && ptr_stale(ca, ptr)
+                              ? " stale" : "");
+                       break;
+               case BCH_EXTENT_ENTRY_crc32:
+               case BCH_EXTENT_ENTRY_crc64:
+               case BCH_EXTENT_ENTRY_crc128:
+                       crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
+
+                       pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u",
+                              crc.compressed_size,
+                              crc.uncompressed_size,
+                              crc.offset, crc.nonce,
+                              crc.csum_type,
+                              crc.compression_type);
+                       break;
+               case BCH_EXTENT_ENTRY_stripe_ptr:
+                       ec = &entry->stripe_ptr;
+
+                       pr_buf(out, "ec: idx %llu block %u",
+                              (u64) ec->idx, ec->block);
                        break;
                default:
-                       p("(invalid extent entry %.16llx)", *((u64 *) entry));
-                       goto out;
+                       pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
+                       return;
                }
 
                first = false;
        }
-out:
-       if (bkey_extent_is_cached(e.k))
-               p(" cached");
-#undef p
-       return out - buf;
 }
 
-static inline bool dev_latency_better(struct bch_dev *dev1,
-                                     struct bch_dev *dev2)
-{
-       unsigned l1 = atomic_read(&dev1->latency[READ]);
-       unsigned l2 = atomic_read(&dev2->latency[READ]);
-
-       /* Pick at random, biased in favor of the faster device: */
-
-       return bch2_rand_range(l1 + l2) > l1;
-}
+/* Btree ptrs */
 
-static void extent_pick_read_device(struct bch_fs *c,
-                                   struct bkey_s_c_extent e,
-                                   struct bch_devs_mask *avoid,
-                                   struct extent_pick_ptr *pick)
+const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const union bch_extent_entry *entry;
        const struct bch_extent_ptr *ptr;
-       struct bch_extent_crc_unpacked crc;
+       const char *reason;
 
-       extent_for_each_ptr_crc(e, ptr, crc) {
-               struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+       if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
+               return "value too big";
 
-               if (ptr->cached && ptr_stale(ca, ptr))
-                       continue;
+       bkey_extent_entry_for_each(ptrs, entry) {
+               if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
+                       return "invalid extent entry type";
 
-               if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
-                       continue;
+               if (!extent_entry_is_ptr(entry))
+                       return "has non ptr field";
+       }
 
-               if (avoid) {
-                       if (test_bit(ca->dev_idx, avoid->d))
-                               continue;
+       bkey_for_each_ptr(ptrs, ptr) {
+               reason = extent_ptr_invalid(c, k, ptr,
+                                           c->opts.btree_node_size,
+                                           true);
+               if (reason)
+                       return reason;
+       }
 
-                       if (pick->ca &&
-                           test_bit(pick->ca->dev_idx, avoid->d))
-                               goto use;
-               }
+       return NULL;
+}
 
-               if (pick->ca && !dev_latency_better(ca, pick->ca))
-                       continue;
-use:
-               if (!percpu_ref_tryget(&ca->io_ref))
-                       continue;
-
-               if (pick->ca)
-                       percpu_ref_put(&pick->ca->io_ref);
-
-               *pick = (struct extent_pick_ptr) {
-                       .ptr    = *ptr,
-                       .crc    = crc,
-                       .ca     = ca,
-               };
-       }
-}
-
-/* Btree ptrs */
-
-static const char *bch2_btree_ptr_invalid(const struct bch_fs *c,
-                                        struct bkey_s_c k)
-{
-       if (bkey_extent_is_cached(k.k))
-               return "cached";
-
-       if (k.k->size)
-               return "nonzero key size";
-
-       if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
-               return "value too big";
-
-       switch (k.k->type) {
-       case BCH_EXTENT: {
-               struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-               const union bch_extent_entry *entry;
-               const struct bch_extent_ptr *ptr;
-               const char *reason;
-
-               extent_for_each_entry(e, entry) {
-                       if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
-                               return "invalid extent entry type";
-
-                       if (extent_entry_is_crc(entry))
-                               return "has crc field";
-               }
-
-               extent_for_each_ptr(e, ptr) {
-                       reason = extent_ptr_invalid(c, e, ptr,
-                                                   c->opts.btree_node_size,
-                                                   true);
-                       if (reason)
-                               return reason;
-               }
-
-               return NULL;
-       }
-
-       default:
-               return "invalid value type";
-       }
-}
-
-static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
-                                struct bkey_s_c k)
+void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
+                              struct bkey_s_c k)
 {
-       struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const struct bch_extent_ptr *ptr;
-       unsigned seq;
        const char *err;
        char buf[160];
        struct bucket_mark mark;
        struct bch_dev *ca;
-       unsigned replicas = 0;
-       bool bad;
 
-       extent_for_each_ptr(e, ptr) {
+       bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
+                      !bch2_bkey_replicas_marked(c, k, false), c,
+                      "btree key bad (replicas not marked in superblock):\n%s",
+                      (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
+
+       if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
+               return;
+
+       bkey_for_each_ptr(ptrs, ptr) {
                ca = bch_dev_bkey_exists(c, ptr->dev);
-               replicas++;
 
-               if (!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags))
-                       continue;
+               mark = ptr_bucket_mark(ca, ptr);
 
                err = "stale";
-               if (ptr_stale(ca, ptr))
+               if (gen_after(mark.gen, ptr->gen))
                        goto err;
 
-               do {
-                       seq = read_seqcount_begin(&c->gc_pos_lock);
-                       mark = ptr_bucket_mark(ca, ptr);
-
-                       bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
-                               (mark.data_type != BCH_DATA_BTREE ||
-                                mark.dirty_sectors < c->opts.btree_node_size);
-               } while (read_seqcount_retry(&c->gc_pos_lock, seq));
-
                err = "inconsistent";
-               if (bad)
+               if (mark.data_type != BCH_DATA_BTREE ||
+                   mark.dirty_sectors < c->opts.btree_node_size)
                        goto err;
        }
 
-       if (!bch2_bkey_replicas_marked(c, BCH_DATA_BTREE, e.s_c)) {
-               bch2_bkey_val_to_text(c, btree_node_type(b),
-                                    buf, sizeof(buf), k);
-               bch2_fs_bug(c,
-                       "btree key bad (replicas not marked in superblock):\n%s",
-                       buf);
-               return;
-       }
-
        return;
 err:
-       bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k);
-       bch2_fs_bug(c, "%s btree pointer %s: bucket %zi "
-                     "gen %i mark %08x",
-                     err, buf, PTR_BUCKET_NR(ca, ptr),
-                     mark.gen, (unsigned) mark.counter);
+       bch2_bkey_val_to_text(&PBUF(buf), c, k);
+       bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x",
+                   err, buf, PTR_BUCKET_NR(ca, ptr),
+                   mark.gen, (unsigned) mark.v.counter);
 }
 
-static void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
-                                 size_t size, struct bkey_s_c k)
+void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
+                           struct bkey_s_c k)
 {
-       char *out = buf, *end = buf + size;
        const char *invalid;
 
-#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
-
-       if (bkey_extent_is_data(k.k))
-               out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k));
+       bkey_ptrs_to_text(out, c, k);
 
        invalid = bch2_btree_ptr_invalid(c, k);
        if (invalid)
-               p(" invalid: %s", invalid);
-#undef p
-}
-
-struct extent_pick_ptr
-bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b,
-                   struct bch_devs_mask *avoid)
-{
-       struct extent_pick_ptr pick = { .ca = NULL };
-
-       extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key),
-                               avoid, &pick);
-
-       return pick;
+               pr_buf(out, " invalid: %s", invalid);
 }
 
-const struct bkey_ops bch2_bkey_btree_ops = {
-       .key_invalid    = bch2_btree_ptr_invalid,
-       .key_debugcheck = btree_ptr_debugcheck,
-       .val_to_text    = bch2_btree_ptr_to_text,
-       .swab           = bch2_ptr_swab,
-};
-
 /* Extents */
 
-static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
+bool __bch2_cut_front(struct bpos where, struct bkey_s k)
 {
        u64 len = 0;
 
@@ -782,7 +694,7 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
         * cause offset to point to the next bucket:
         */
        if (!len)
-               __set_bkey_deleted(k.k);
+               k.k->type = KEY_TYPE_deleted;
        else if (bkey_extent_is_data(k.k)) {
                struct bkey_s_extent e = bkey_s_to_extent(k);
                union bch_extent_entry *entry;
@@ -803,6 +715,8 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
                        case BCH_EXTENT_ENTRY_crc128:
                                entry->crc128.offset += e.k->size - len;
                                break;
+                       case BCH_EXTENT_ENTRY_stripe_ptr:
+                               break;
                        }
 
                        if (extent_entry_is_crc(entry))
@@ -815,11 +729,6 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
        return true;
 }
 
-bool bch2_cut_front(struct bpos where, struct bkey_i *k)
-{
-       return __bch2_cut_front(where, bkey_i_to_s(k));
-}
-
 bool bch2_cut_back(struct bpos where, struct bkey *k)
 {
        u64 len = 0;
@@ -837,7 +746,7 @@ bool bch2_cut_back(struct bpos where, struct bkey *k)
        k->size = len;
 
        if (!len)
-               __set_bkey_deleted(k);
+               k->type = KEY_TYPE_deleted;
 
        return true;
 }
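
A bkey addresses an extent by its *end* position, so k->p.offset - k->size
is the start. Cutting the back moves the end and only touches the bkey;
cutting the front (above) keeps the end, shrinks the size, and additionally
advances the pointer/crc offsets so they still point at the surviving data.
A standalone sketch on plain offsets (model_* names are hypothetical):

#include <stdio.h>

struct model_key { unsigned long long offset; unsigned size; };

static void model_cut_front(struct model_key *k, unsigned long long where)
{
	unsigned long long start = k->offset - k->size;

	if (where > start)
		k->size -= where - start < k->size ? where - start : k->size;
}

static void model_cut_back(struct model_key *k, unsigned long long where)
{
	if (where < k->offset) {
		unsigned long long start = k->offset - k->size;

		k->size   = where > start ? where - start : 0;
		k->offset = where;
	}
}

int main(void)
{
	struct model_key k = { .offset = 100, .size = 20 };	/* [80, 100) */

	model_cut_front(&k, 85);	/* now [85, 100) */
	model_cut_back(&k, 95);		/* now [85, 95)  */
	printf("[%llu, %llu)\n", k.offset - k.size, k.offset);
	return 0;
}
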
@@ -855,458 +764,229 @@ void bch2_key_resize(struct bkey *k,
        k->size = new_size;
 }
 
-/*
- * In extent_sort_fix_overlapping(), insert_fixup_extent(),
- * extent_merge_inline() - we're modifying keys in place that are packed. To do
- * that we have to unpack the key, modify the unpacked key - then this
- * copies/repacks the unpacked to the original as necessary.
- */
-static bool __extent_save(struct btree *b, struct btree_node_iter *iter,
-                         struct bkey_packed *dst, struct bkey *src)
+static bool extent_i_save(struct btree *b, struct bkey_packed *dst,
+                         struct bkey_i *src)
 {
        struct bkey_format *f = &b->format;
        struct bkey_i *dst_unpacked;
-       bool ret;
-
-       if ((dst_unpacked = packed_to_bkey(dst))) {
-               dst_unpacked->k = *src;
-               ret = true;
-       } else {
-               ret = bch2_bkey_pack_key(dst, src, f);
-       }
-
-       if (ret && iter)
-               bch2_verify_key_order(b, iter, dst);
-
-       return ret;
-}
-
-static void extent_save(struct btree *b, struct btree_node_iter *iter,
-                       struct bkey_packed *dst, struct bkey *src)
-{
-       BUG_ON(!__extent_save(b, iter, dst, src));
-}
-
-/*
- * If keys compare equal, compare by pointer order:
- *
- * Necessary for sort_fix_overlapping() - if there are multiple keys that
- * compare equal in different sets, we have to process them newest to oldest.
- */
-#define extent_sort_cmp(h, l, r)                                       \
-({                                                                     \
-       struct bkey _ul = bkey_unpack_key(b,                            \
-                               __btree_node_offset_to_key(b, (l).k));  \
-       struct bkey _ur = bkey_unpack_key(b,                            \
-                               __btree_node_offset_to_key(b, (r).k));  \
-                                                                       \
-       bkey_cmp(bkey_start_pos(&_ul),                                  \
-                bkey_start_pos(&_ur)) ?: (r).k - (l).k;                \
-})
-
-static inline void extent_sort_sift(struct btree_node_iter_large *iter,
-                                   struct btree *b, size_t i)
-{
-       heap_sift_down(iter, i, extent_sort_cmp);
-}
-
-static inline void extent_sort_next(struct btree_node_iter_large *iter,
-                                   struct btree *b,
-                                   struct btree_node_iter_set *i)
-{
-       sort_key_next(iter, b, i);
-       heap_sift_down(iter, i - iter->data, extent_sort_cmp);
-}
-
-static void extent_sort_append(struct bch_fs *c,
-                              struct btree *b,
-                              struct btree_nr_keys *nr,
-                              struct bkey_packed *start,
-                              struct bkey_packed **prev,
-                              struct bkey_packed *k)
-{
-       struct bkey_format *f = &b->format;
-       BKEY_PADDED(k) tmp;
-
-       if (bkey_whiteout(k))
-               return;
-
-       bch2_bkey_unpack(b, &tmp.k, k);
-
-       if (*prev &&
-           bch2_extent_merge(c, b, (void *) *prev, &tmp.k))
-               return;
-
-       if (*prev) {
-               bch2_bkey_pack(*prev, (void *) *prev, f);
-
-               btree_keys_account_key_add(nr, 0, *prev);
-               *prev = bkey_next(*prev);
-       } else {
-               *prev = start;
-       }
-
-       bkey_copy(*prev, &tmp.k);
-}
+       struct bkey_packed tmp;
 
-struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
-                                       struct bset *dst,
-                                       struct btree *b,
-                                       struct btree_node_iter_large *iter)
-{
-       struct bkey_format *f = &b->format;
-       struct btree_node_iter_set *_l = iter->data, *_r;
-       struct bkey_packed *prev = NULL, *out, *lk, *rk;
-       struct bkey l_unpacked, r_unpacked;
-       struct bkey_s l, r;
-       struct btree_nr_keys nr;
-
-       memset(&nr, 0, sizeof(nr));
-
-       heap_resort(iter, extent_sort_cmp);
-
-       while (!bch2_btree_node_iter_large_end(iter)) {
-               lk = __btree_node_offset_to_key(b, _l->k);
-
-               if (iter->used == 1) {
-                       extent_sort_append(c, b, &nr, dst->start, &prev, lk);
-                       extent_sort_next(iter, b, _l);
-                       continue;
-               }
-
-               _r = iter->data + 1;
-               if (iter->used > 2 &&
-                   extent_sort_cmp(iter, _r[0], _r[1]) >= 0)
-                       _r++;
-
-               rk = __btree_node_offset_to_key(b, _r->k);
-
-               l = __bkey_disassemble(b, lk, &l_unpacked);
-               r = __bkey_disassemble(b, rk, &r_unpacked);
-
-               /* If current key and next key don't overlap, just append */
-               if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) {
-                       extent_sort_append(c, b, &nr, dst->start, &prev, lk);
-                       extent_sort_next(iter, b, _l);
-                       continue;
-               }
-
-               /* Skip 0 size keys */
-               if (!r.k->size) {
-                       extent_sort_next(iter, b, _r);
-                       continue;
-               }
-
-               /*
-                * overlap: keep the newer key and trim the older key so they
-                * don't overlap. comparing pointers tells us which one is
-                * newer, since the bsets are appended one after the other.
-                */
-
-               /* can't happen because of comparison func */
-               BUG_ON(_l->k < _r->k &&
-                      !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k)));
-
-               if (_l->k > _r->k) {
-                       /* l wins, trim r */
-                       if (bkey_cmp(l.k->p, r.k->p) >= 0) {
-                               sort_key_next(iter, b, _r);
-                       } else {
-                               __bch2_cut_front(l.k->p, r);
-                               extent_save(b, NULL, rk, r.k);
-                       }
-
-                       extent_sort_sift(iter, b, _r - iter->data);
-               } else if (bkey_cmp(l.k->p, r.k->p) > 0) {
-                       BKEY_PADDED(k) tmp;
-
-                       /*
-                        * r wins, but it overlaps in the middle of l - split l:
-                        */
-                       bkey_reassemble(&tmp.k, l.s_c);
-                       bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k);
-
-                       __bch2_cut_front(r.k->p, l);
-                       extent_save(b, NULL, lk, l.k);
-
-                       extent_sort_sift(iter, b, 0);
-
-                       extent_sort_append(c, b, &nr, dst->start, &prev,
-                                          bkey_to_packed(&tmp.k));
-               } else {
-                       bch2_cut_back(bkey_start_pos(r.k), l.k);
-                       extent_save(b, NULL, lk, l.k);
-               }
-       }
-
-       if (prev) {
-               bch2_bkey_pack(prev, (void *) prev, f);
-               btree_keys_account_key_add(&nr, 0, prev);
-               out = bkey_next(prev);
-       } else {
-               out = dst->start;
-       }
+       if ((dst_unpacked = packed_to_bkey(dst)))
+               dst_unpacked->k = src->k;
+       else if (bch2_bkey_pack_key(&tmp, &src->k, f))
+               memcpy_u64s(dst, &tmp, f->key_u64s);
+       else
+               return false;
 
-       dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
-       return nr;
+       memcpy_u64s(bkeyp_val(f, dst), &src->v, bkey_val_u64s(&src->k));
+       return true;
 }
 
 struct extent_insert_state {
        struct btree_insert             *trans;
        struct btree_insert_entry       *insert;
        struct bpos                     committed;
-       struct bch_fs_usage             stats;
 
        /* for deleting: */
        struct bkey_i                   whiteout;
-       bool                            do_journal;
+       bool                            update_journal;
+       bool                            update_btree;
        bool                            deleting;
 };
 
-static void bch2_add_sectors(struct extent_insert_state *s,
-                            struct bkey_s_c k, u64 offset, s64 sectors)
-{
-       struct bch_fs *c = s->trans->c;
-       struct btree *b = s->insert->iter->l[0].b;
-
-       EBUG_ON(bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0);
-
-       if (!sectors)
-               return;
-
-       bch2_mark_key(c, k, sectors, false, gc_pos_btree_node(b),
-                     &s->stats, s->trans->journal_res.seq, 0);
-}
-
-static void bch2_subtract_sectors(struct extent_insert_state *s,
-                                struct bkey_s_c k, u64 offset, s64 sectors)
-{
-       bch2_add_sectors(s, k, offset, -sectors);
-}
-
-/* These wrappers subtract exactly the sectors that we're removing from @k */
-static void bch2_cut_subtract_back(struct extent_insert_state *s,
-                                 struct bpos where, struct bkey_s k)
-{
-       bch2_subtract_sectors(s, k.s_c, where.offset,
-                            k.k->p.offset - where.offset);
-       bch2_cut_back(where, k.k);
-}
-
-static void bch2_cut_subtract_front(struct extent_insert_state *s,
-                                  struct bpos where, struct bkey_s k)
-{
-       bch2_subtract_sectors(s, k.s_c, bkey_start_offset(k.k),
-                            where.offset - bkey_start_offset(k.k));
-       __bch2_cut_front(where, k);
-}
-
-static void bch2_drop_subtract(struct extent_insert_state *s, struct bkey_s k)
-{
-       if (k.k->size)
-               bch2_subtract_sectors(s, k.s_c,
-                                    bkey_start_offset(k.k), k.k->size);
-       k.k->size = 0;
-       __set_bkey_deleted(k.k);
-}
-
 static bool bch2_extent_merge_inline(struct bch_fs *,
                                     struct btree_iter *,
                                     struct bkey_packed *,
                                     struct bkey_packed *,
                                     bool);
 
-#define MAX_LOCK_HOLD_TIME     (5 * NSEC_PER_MSEC)
-
-static enum btree_insert_ret
-extent_insert_should_stop(struct extent_insert_state *s)
+static void verify_extent_nonoverlapping(struct btree *b,
+                                        struct btree_node_iter *_iter,
+                                        struct bkey_i *insert)
 {
-       struct btree *b = s->insert->iter->l[0].b;
+#ifdef CONFIG_BCACHEFS_DEBUG
+       struct btree_node_iter iter;
+       struct bkey_packed *k;
+       struct bkey uk;
 
-       /*
-        * Check if we have sufficient space in both the btree node and the
-        * journal reservation:
-        *
-        * Each insert checks for room in the journal entry, but we check for
-        * room in the btree node up-front. In the worst case, bkey_cmpxchg()
-        * will insert two keys, and one iteration of this room will insert one
-        * key, so we need room for three keys.
-        */
-       if (!bch2_btree_node_insert_fits(s->trans->c, b, s->insert->k->k.u64s))
-               return BTREE_INSERT_BTREE_NODE_FULL;
-       else if (!journal_res_insert_fits(s->trans, s->insert))
-               return BTREE_INSERT_JOURNAL_RES_FULL; /* XXX worth tracing */
-       else
-               return BTREE_INSERT_OK;
+       iter = *_iter;
+       k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard);
+       BUG_ON(k &&
+              (uk = bkey_unpack_key(b, k),
+               bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0));
+
+       iter = *_iter;
+       k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard);
+#if 0
+       BUG_ON(k &&
+              (uk = bkey_unpack_key(b, k),
+               bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0);
+#else
+       if (k &&
+           (uk = bkey_unpack_key(b, k),
+            bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) {
+               char buf1[100];
+               char buf2[100];
+
+               bch2_bkey_to_text(&PBUF(buf1), &insert->k);
+               bch2_bkey_to_text(&PBUF(buf2), &uk);
+
+               bch2_dump_btree_node(b);
+               panic("insert > next :\n"
+                     "insert %s\n"
+                     "next   %s\n",
+                     buf1, buf2);
+       }
+#endif
+
+#endif
+}
+
+static void verify_modified_extent(struct btree_iter *iter,
+                                  struct bkey_packed *k)
+{
+       bch2_btree_iter_verify(iter, iter->l[0].b);
+       bch2_verify_insert_pos(iter->l[0].b, k, k, k->u64s);
 }
 
 static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
                               struct bkey_i *insert)
 {
        struct btree_iter_level *l = &iter->l[0];
-       struct bset_tree *t = bset_tree_last(l->b);
-       struct bkey_packed *where =
-               bch2_btree_node_iter_bset_pos(&l->iter, l->b, t);
-       struct bkey_packed *prev = bch2_bkey_prev(l->b, t, where);
-       struct bkey_packed *next_live_key = where;
-       unsigned clobber_u64s;
+       struct btree_node_iter node_iter;
+       struct bkey_packed *k;
 
-       if (prev)
-               where = bkey_next(prev);
+       BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b));
 
-       while (next_live_key != btree_bkey_last(l->b, t) &&
-              bkey_deleted(next_live_key))
-               next_live_key = bkey_next(next_live_key);
+       EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
+       verify_extent_nonoverlapping(l->b, &l->iter, insert);
 
-       /*
-        * Everything between where and next_live_key is now deleted keys, and
-        * is overwritten:
-        */
-       clobber_u64s = (u64 *) next_live_key - (u64 *) where;
+       node_iter = l->iter;
+       k = bch2_btree_node_iter_prev_filter(&node_iter, l->b, KEY_TYPE_discard);
+       if (k && !bkey_written(l->b, k) &&
+           bch2_extent_merge_inline(c, iter, k, bkey_to_packed(insert), true))
+               return;
 
-       if (prev &&
-           bch2_extent_merge_inline(c, iter, prev, bkey_to_packed(insert), true))
-               goto drop_deleted_keys;
+       node_iter = l->iter;
+       k = bch2_btree_node_iter_peek_filter(&node_iter, l->b, KEY_TYPE_discard);
+       if (k && !bkey_written(l->b, k) &&
+           bch2_extent_merge_inline(c, iter, bkey_to_packed(insert), k, false))
+               return;
 
-       if (next_live_key != btree_bkey_last(l->b, t) &&
-           bch2_extent_merge_inline(c, iter, bkey_to_packed(insert),
-                                   next_live_key, false))
-               goto drop_deleted_keys;
+       k = bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b));
 
-       bch2_bset_insert(l->b, &l->iter, where, insert, clobber_u64s);
-       bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, where,
-                               clobber_u64s, where->u64s);
-       return;
-drop_deleted_keys:
-       bch2_bset_delete(l->b, where, clobber_u64s);
-       bch2_btree_node_iter_fix(iter, l->b, &l->iter, t,
-                                where, clobber_u64s, 0);
+       bch2_bset_insert(l->b, &l->iter, k, insert, 0);
+       bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s);
+       bch2_btree_iter_verify(iter, l->b);
 }
 
 static void extent_insert_committed(struct extent_insert_state *s)
 {
        struct bch_fs *c = s->trans->c;
        struct btree_iter *iter = s->insert->iter;
-       struct bkey_i *insert = !s->deleting
-               ? s->insert->k
-               : &s->whiteout;
+       struct bkey_i *insert = s->insert->k;
        BKEY_PADDED(k) split;
 
        EBUG_ON(bkey_cmp(insert->k.p, s->committed) < 0);
        EBUG_ON(bkey_cmp(s->committed, bkey_start_pos(&insert->k)) < 0);
 
-       if (!bkey_cmp(s->committed, bkey_start_pos(&insert->k)))
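+       /* split.k is the portion of @insert being committed now: */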
+       bkey_copy(&split.k, insert);
+       if (s->deleting)
+               split.k.k.type = KEY_TYPE_discard;
+
+       bch2_cut_back(s->committed, &split.k.k);
+
+       if (!bkey_cmp(s->committed, iter->pos))
                return;
 
-       if (s->deleting && !s->do_journal) {
-               bch2_cut_front(s->committed, insert);
-               goto done;
-       }
+       bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
 
-       EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
+       if (s->update_btree) {
+               if (debug_check_bkeys(c))
+                       bch2_bkey_debugcheck(c, iter->l[0].b,
+                                            bkey_i_to_s_c(&split.k));
 
-       bkey_copy(&split.k, insert);
+               EBUG_ON(bkey_deleted(&split.k.k) || !split.k.k.size);
 
-       if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
-           bkey_cmp(s->committed, insert->k.p) &&
-           bch2_extent_is_compressed(bkey_i_to_s_c(insert))) {
-               /* XXX: possibly need to increase our reservation? */
-               bch2_cut_subtract_back(s, s->committed,
-                                     bkey_i_to_s(&split.k));
-               bch2_cut_front(s->committed, insert);
-               bch2_add_sectors(s, bkey_i_to_s_c(insert),
-                               bkey_start_offset(&insert->k),
-                               insert->k.size);
-       } else {
-               bch2_cut_back(s->committed, &split.k.k);
-               bch2_cut_front(s->committed, insert);
+               extent_bset_insert(c, iter, &split.k);
        }
 
-       if (debug_check_bkeys(c))
-               bch2_bkey_debugcheck(c, iter->l[0].b, bkey_i_to_s_c(&split.k));
+       if (s->update_journal) {
+               bkey_copy(&split.k, !s->deleting ? insert : &s->whiteout);
+               if (s->deleting)
+                       split.k.k.type = KEY_TYPE_discard;
 
-       bch2_btree_journal_key(s->trans, iter, &split.k);
+               bch2_cut_back(s->committed, &split.k.k);
 
-       if (!s->deleting)
-               extent_bset_insert(c, iter, &split.k);
-done:
-       bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
+               EBUG_ON(bkey_deleted(&split.k.k) || !split.k.k.size);
+
+               bch2_btree_journal_key(s->trans, iter, &split.k);
+       }
+
+       bch2_cut_front(s->committed, insert);
 
        insert->k.needs_whiteout        = false;
-       s->do_journal                   = false;
-       s->trans->did_work              = true;
 }
 
-static enum btree_insert_ret
-__extent_insert_advance_pos(struct extent_insert_state *s,
-                           struct bpos next_pos,
-                           struct bkey_s_c k)
+void bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter)
 {
-       struct extent_insert_hook *hook = s->trans->hook;
-       enum btree_insert_ret ret;
-
-       if (hook)
-               ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k);
-       else
-               ret = BTREE_INSERT_OK;
+       struct btree *b = iter->l[0].b;
 
-       EBUG_ON(bkey_deleted(&s->insert->k->k) || !s->insert->k->k.size);
+       BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK);
 
-       if (ret == BTREE_INSERT_OK)
-               s->committed = next_pos;
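+       /* can't insert past the end of the btree node - trim to its boundary: */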
+       bch2_cut_back(b->key.k.p, &k->k);
 
-       return ret;
+       BUG_ON(bkey_cmp(bkey_start_pos(&k->k), b->data->min_key) < 0);
 }
 
-/*
- * Update iter->pos, marking how much of @insert we've processed, and call hook
- * fn:
- */
-static enum btree_insert_ret
-extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k)
+enum btree_insert_ret
+bch2_extent_can_insert(struct btree_insert *trans,
+                      struct btree_insert_entry *insert,
+                      unsigned *u64s)
 {
-       struct btree *b = s->insert->iter->l[0].b;
-       struct bpos next_pos = bpos_min(s->insert->k->k.p,
-                                       k.k ? k.k->p : b->key.k.p);
-       enum btree_insert_ret ret;
+       struct btree_iter_level *l = &insert->iter->l[0];
+       struct btree_node_iter node_iter = l->iter;
+       enum bch_extent_overlap overlap;
+       struct bkey_packed *_k;
+       struct bkey unpacked;
+       struct bkey_s_c k;
+       int sectors;
 
-       if (race_fault())
-               return BTREE_INSERT_NEED_TRAVERSE;
+       BUG_ON(trans->flags & BTREE_INSERT_ATOMIC &&
+              !bch2_extent_is_atomic(&insert->k->k, insert->iter));
 
-       /* hole? */
-       if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) {
-               ret = __extent_insert_advance_pos(s, bkey_start_pos(k.k),
-                                                   bkey_s_c_null);
-               if (ret != BTREE_INSERT_OK)
-                       return ret;
-       }
+       /*
+        * We avoid creating whiteouts whenever possible when deleting, but
+        * those optimizations mean we may potentially insert two whiteouts
+        * instead of one (when we overlap with the front of one extent and the
+        * back of another):
+        */
+       if (bkey_whiteout(&insert->k->k))
+               *u64s += BKEY_U64s;
 
-       /* avoid redundant calls to hook fn: */
-       if (!bkey_cmp(s->committed, next_pos))
+       _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b,
+                                             KEY_TYPE_discard);
+       if (!_k)
                return BTREE_INSERT_OK;
 
-       return __extent_insert_advance_pos(s, next_pos, k);
-}
+       k = bkey_disassemble(l->b, _k, &unpacked);
 
-static enum btree_insert_ret
-extent_insert_check_split_compressed(struct extent_insert_state *s,
-                                    struct bkey_s_c k,
-                                    enum bch_extent_overlap overlap)
-{
-       struct bch_fs *c = s->trans->c;
-       unsigned sectors;
+       overlap = bch2_extent_overlap(&insert->k->k, k.k);
+
+       /* account for having to split existing extent: */
+       if (overlap == BCH_EXTENT_OVERLAP_MIDDLE)
+               *u64s += _k->u64s;
 
        if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
            (sectors = bch2_extent_is_compressed(k))) {
                int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
 
-               if (s->trans->flags & BTREE_INSERT_NOFAIL)
+               if (trans->flags & BTREE_INSERT_NOFAIL)
                        flags |= BCH_DISK_RESERVATION_NOFAIL;
 
-               switch (bch2_disk_reservation_add(c,
-                               s->trans->disk_res,
-                               sectors * bch2_extent_nr_dirty_ptrs(k),
-                               flags)) {
+               switch (bch2_disk_reservation_add(trans->c,
+                               trans->disk_res,
+                               sectors, flags)) {
                case 0:
                        break;
                case -ENOSPC:
@@ -1321,78 +1001,61 @@ extent_insert_check_split_compressed(struct extent_insert_state *s,
        return BTREE_INSERT_OK;
 }
 
-static enum btree_insert_ret
+static void
 extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
-             struct bset_tree *t, struct bkey_packed *_k, struct bkey_s k,
+             struct bkey_packed *_k, struct bkey_s k,
              enum bch_extent_overlap overlap)
 {
        struct bch_fs *c = s->trans->c;
        struct btree_iter *iter = s->insert->iter;
        struct btree_iter_level *l = &iter->l[0];
-       struct btree *b = l->b;
-       struct btree_node_iter *node_iter = &l->iter;
-       enum btree_insert_ret ret;
 
        switch (overlap) {
        case BCH_EXTENT_OVERLAP_FRONT:
                /* insert overlaps with start of k: */
-               bch2_cut_subtract_front(s, insert->k.p, k);
+               __bch2_cut_front(insert->k.p, k);
                BUG_ON(bkey_deleted(k.k));
-               extent_save(b, node_iter, _k, k.k);
+               extent_save(l->b, _k, k.k);
+               verify_modified_extent(iter, _k);
                break;
 
        case BCH_EXTENT_OVERLAP_BACK:
                /* insert overlaps with end of k: */
-               bch2_cut_subtract_back(s, bkey_start_pos(&insert->k), k);
+               bch2_cut_back(bkey_start_pos(&insert->k), k.k);
                BUG_ON(bkey_deleted(k.k));
-               extent_save(b, node_iter, _k, k.k);
+               extent_save(l->b, _k, k.k);
 
                /*
                 * As the auxiliary tree is indexed by the end of the
                 * key and we've just changed the end, update the
                 * auxiliary tree.
                 */
-               bch2_bset_fix_invalidated_key(b, t, _k);
-               bch2_btree_node_iter_fix(iter, b, node_iter, t,
-                                       _k, _k->u64s, _k->u64s);
+               bch2_bset_fix_invalidated_key(l->b, _k);
+               bch2_btree_node_iter_fix(iter, l->b, &l->iter,
+                                        _k, _k->u64s, _k->u64s);
+               verify_modified_extent(iter, _k);
                break;
 
        case BCH_EXTENT_OVERLAP_ALL: {
-               struct bpos orig_pos = k.k->p;
-
                /* The insert key completely covers k, invalidate k */
                if (!bkey_whiteout(k.k))
-                       btree_keys_account_key_drop(&b->nr,
-                                               t - b->set, _k);
-
-               bch2_drop_subtract(s, k);
-               k.k->p = bkey_start_pos(&insert->k);
-               if (!__extent_save(b, node_iter, _k, k.k)) {
-                       /*
-                        * Couldn't repack: we aren't necessarily able
-                        * to repack if the new key is outside the range
-                        * of the old extent, so we have to split
-                        * @insert:
-                        */
-                       k.k->p = orig_pos;
-                       extent_save(b, node_iter, _k, k.k);
-
-                       ret = extent_insert_advance_pos(s, k.s_c);
-                       if (ret != BTREE_INSERT_OK)
-                               return ret;
-
-                       extent_insert_committed(s);
-                       /*
-                        * We split and inserted upto at k.k->p - that
-                        * has to coincide with iter->pos, so that we
-                        * don't have anything more we have to insert
-                        * until we recheck our journal reservation:
-                        */
-                       EBUG_ON(bkey_cmp(s->committed, k.k->p));
+                       btree_account_key_drop(l->b, _k);
+
+               k.k->size = 0;
+               k.k->type = KEY_TYPE_deleted;
+
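+               /*
+                * If @_k is in the unwritten bset we can delete it outright;
+                * otherwise we can only overwrite it with a deleted key:
+                */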
+               if (_k >= btree_bset_last(l->b)->start) {
+                       unsigned u64s = _k->u64s;
+
+                       bch2_bset_delete(l->b, _k, _k->u64s);
+                       bch2_btree_node_iter_fix(iter, l->b, &l->iter,
+                                                _k, u64s, 0);
+                       bch2_btree_iter_verify(iter, l->b);
                } else {
-                       bch2_bset_fix_invalidated_key(b, t, _k);
-                       bch2_btree_node_iter_fix(iter, b, node_iter, t,
-                                               _k, _k->u64s, _k->u64s);
+                       extent_save(l->b, _k, k.k);
+                       bch2_btree_node_iter_fix(iter, l->b, &l->iter,
+                                                _k, _k->u64s, _k->u64s);
+                       verify_modified_extent(iter, _k);
                }
 
                break;
@@ -1414,132 +1077,108 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
                 * what k points to)
                 */
                bkey_reassemble(&split.k, k.s_c);
-               split.k.k.needs_whiteout |= bset_written(b, bset(b, t));
+               split.k.k.needs_whiteout |= bkey_written(l->b, _k);
 
                bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k);
                BUG_ON(bkey_deleted(&split.k.k));
 
-               bch2_cut_subtract_front(s, insert->k.p, k);
+               __bch2_cut_front(insert->k.p, k);
                BUG_ON(bkey_deleted(k.k));
-               extent_save(b, node_iter, _k, k.k);
+               extent_save(l->b, _k, k.k);
+               verify_modified_extent(iter, _k);
 
-               bch2_add_sectors(s, bkey_i_to_s_c(&split.k),
-                               bkey_start_offset(&split.k.k),
-                               split.k.k.size);
                extent_bset_insert(c, iter, &split.k);
                break;
        }
        }
-
-       return BTREE_INSERT_OK;
 }
 
-static enum btree_insert_ret
-bch2_delete_fixup_extent(struct extent_insert_state *s)
+static void __bch2_insert_fixup_extent(struct extent_insert_state *s)
 {
-       struct bch_fs *c = s->trans->c;
        struct btree_iter *iter = s->insert->iter;
        struct btree_iter_level *l = &iter->l[0];
-       struct btree *b = l->b;
-       struct btree_node_iter *node_iter = &l->iter;
        struct bkey_packed *_k;
        struct bkey unpacked;
        struct bkey_i *insert = s->insert->k;
-       enum btree_insert_ret ret = BTREE_INSERT_OK;
-
-       EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
-
-       s->whiteout     = *insert;
-       s->do_journal   = false;
 
        while (bkey_cmp(s->committed, insert->k.p) < 0 &&
-              (ret = extent_insert_should_stop(s)) == BTREE_INSERT_OK &&
-              (_k = bch2_btree_node_iter_peek_all(node_iter, b))) {
-               struct bset_tree *t = bch2_bkey_to_bset(b, _k);
-               struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
-               enum bch_extent_overlap overlap;
+              (_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b,
+                                                     KEY_TYPE_discard))) {
+               struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked);
+               enum bch_extent_overlap overlap = bch2_extent_overlap(&insert->k, k.k);
 
-               EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
                EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
 
                if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
                        break;
 
-               if (bkey_whiteout(k.k)) {
-                       s->committed = bpos_min(insert->k.p, k.k->p);
-                       goto next;
-               }
-
-               overlap = bch2_extent_overlap(&insert->k, k.k);
-
-               ret = extent_insert_check_split_compressed(s, k.s_c, overlap);
-               if (ret != BTREE_INSERT_OK)
-                       goto stop;
+               s->committed = bpos_min(s->insert->k->k.p, k.k->p);
 
-               ret = extent_insert_advance_pos(s, k.s_c);
-               if (ret)
-                       goto stop;
-
-               s->do_journal = true;
+               if (!bkey_whiteout(k.k))
+                       s->update_journal = true;
 
-               if (overlap == BCH_EXTENT_OVERLAP_ALL) {
-                       btree_keys_account_key_drop(&b->nr,
-                                               t - b->set, _k);
-                       bch2_subtract_sectors(s, k.s_c,
-                                            bkey_start_offset(k.k), k.k->size);
-                       _k->type = KEY_TYPE_DISCARD;
-                       reserve_whiteout(b, t, _k);
-               } else if (k.k->needs_whiteout ||
-                          bset_written(b, bset(b, t))) {
-                       struct bkey_i discard = *insert;
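+               /*
+                * We're deleting and have only overlapped with whiteouts so
+                * far - nothing to journal, just advance past them:
+                */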
+               if (!s->update_journal) {
+                       bch2_cut_front(s->committed, insert);
+                       bch2_cut_front(s->committed, &s->whiteout);
+                       bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
+                       goto next;
+               }
 
-                       switch (overlap) {
-                       case BCH_EXTENT_OVERLAP_FRONT:
-                               bch2_cut_front(bkey_start_pos(k.k), &discard);
-                               break;
-                       case BCH_EXTENT_OVERLAP_BACK:
-                               bch2_cut_back(k.k->p, &discard.k);
-                               break;
-                       default:
-                               break;
+               /*
+                * When deleting, if possible just do it by switching the type
+                * of the key we're deleting, instead of creating and inserting
+                * a new whiteout:
+                */
+               if (s->deleting &&
+                   !s->update_btree &&
+                   !bkey_cmp(insert->k.p, k.k->p) &&
+                   !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) {
+                       if (!bkey_whiteout(k.k)) {
+                               btree_account_key_drop(l->b, _k);
+                               _k->type = KEY_TYPE_discard;
+                               reserve_whiteout(l->b, _k);
                        }
-
-                       discard.k.needs_whiteout = true;
-
-                       ret = extent_squash(s, insert, t, _k, k, overlap);
-                       BUG_ON(ret != BTREE_INSERT_OK);
-
-                       extent_bset_insert(c, iter, &discard);
-               } else {
-                       ret = extent_squash(s, insert, t, _k, k, overlap);
-                       BUG_ON(ret != BTREE_INSERT_OK);
+                       break;
                }
-next:
-               bch2_cut_front(s->committed, insert);
-               bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
-       }
 
-       if (ret == BTREE_INSERT_OK &&
-           bkey_cmp(s->committed, insert->k.p) < 0)
-               ret = extent_insert_advance_pos(s, bkey_s_c_null);
-stop:
-       extent_insert_committed(s);
+               if (k.k->needs_whiteout || bkey_written(l->b, _k)) {
+                       insert->k.needs_whiteout = true;
+                       s->update_btree = true;
+               }
 
-       bch2_fs_usage_apply(c, &s->stats, s->trans->disk_res,
-                          gc_pos_btree_node(b));
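+               /*
+                * If we're completely overwriting a whiteout that had space
+                * reserved for when the bset is written, release the
+                * reservation:
+                */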
+               if (s->update_btree &&
+                   overlap == BCH_EXTENT_OVERLAP_ALL &&
+                   bkey_whiteout(k.k) &&
+                   k.k->needs_whiteout) {
+                       unreserve_whiteout(l->b, _k);
+                       _k->needs_whiteout = false;
+               }
 
-       EBUG_ON(bkey_cmp(iter->pos, s->committed));
-       EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) !=
-               !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF));
+               extent_squash(s, insert, _k, k, overlap);
 
-       bch2_cut_front(iter->pos, insert);
+               if (!s->update_btree)
+                       bch2_cut_front(s->committed, insert);
+next:
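+       /*
+        * A front or middle overlap means the old key extends past the end
+        * of @insert - nothing further can overlap, so we're done:
+        */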
+               if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
+                   overlap == BCH_EXTENT_OVERLAP_MIDDLE)
+                       break;
+       }
 
-       if (insert->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF))
-               ret = BTREE_INSERT_NEED_TRAVERSE;
+       if (bkey_cmp(s->committed, insert->k.p) < 0)
+               s->committed = bpos_min(s->insert->k->k.p, l->b->key.k.p);
 
-       EBUG_ON(insert->k.size && ret == BTREE_INSERT_OK);
+       /*
+        * We may have skipped past some deleted extents greater than the
+        * insert key before we got to a non-deleted extent and knew we could
+        * bail out - rewind the iterator a bit if necessary:
+        */
+       {
+               struct btree_node_iter node_iter = l->iter;
 
-       return ret;
+               while ((_k = bch2_btree_node_iter_prev_all(&node_iter, l->b)) &&
+                      bkey_cmp_left_packed(l->b, _k, &s->committed) > 0)
+                       l->iter = node_iter;
+       }
 }
 
 /**
@@ -1585,27 +1224,21 @@ enum btree_insert_ret
 bch2_insert_fixup_extent(struct btree_insert *trans,
                         struct btree_insert_entry *insert)
 {
-       struct bch_fs *c = trans->c;
-       struct btree_iter *iter = insert->iter;
-       struct btree_iter_level *l = &iter->l[0];
-       struct btree *b = l->b;
-       struct btree_node_iter *node_iter = &l->iter;
-       struct bkey_packed *_k;
-       struct bkey unpacked;
-       enum btree_insert_ret ret = BTREE_INSERT_OK;
-
+       struct btree_iter *iter = insert->iter;
+       struct btree *b         = iter->l[0].b;
        struct extent_insert_state s = {
                .trans          = trans,
                .insert         = insert,
-               .committed      = insert->iter->pos,
+               .committed      = iter->pos,
+
+               .whiteout       = *insert->k,
+               .update_journal = !bkey_whiteout(&insert->k->k),
+               .update_btree   = !bkey_whiteout(&insert->k->k),
                .deleting       = bkey_whiteout(&insert->k->k),
        };
 
        EBUG_ON(iter->level);
-       EBUG_ON(bkey_deleted(&insert->k->k) || !insert->k->k.size);
-
-       if (s.deleting)
-               return bch2_delete_fixup_extent(&s);
+       EBUG_ON(!insert->k->k.size);
 
        /*
         * As we process overlapping extents, we advance @iter->pos both to
@@ -1615,172 +1248,87 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
         */
        EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
 
-       if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
-               bch2_add_sectors(&s, bkey_i_to_s_c(insert->k),
-                               bkey_start_offset(&insert->k->k),
-                               insert->k->k.size);
+       __bch2_insert_fixup_extent(&s);
 
-       while (bkey_cmp(s.committed, insert->k->k.p) < 0 &&
-              (ret = extent_insert_should_stop(&s)) == BTREE_INSERT_OK &&
-              (_k = bch2_btree_node_iter_peek_all(node_iter, b))) {
-               struct bset_tree *t = bch2_bkey_to_bset(b, _k);
-               struct bkey_s k = __bkey_disassemble(b, _k, &unpacked);
-               enum bch_extent_overlap overlap;
-
-               EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
-               EBUG_ON(bkey_cmp(iter->pos, k.k->p) >= 0);
-
-               if (bkey_cmp(bkey_start_pos(k.k), insert->k->k.p) >= 0)
-                       break;
-
-               overlap = bch2_extent_overlap(&insert->k->k, k.k);
-
-               ret = extent_insert_check_split_compressed(&s, k.s_c, overlap);
-               if (ret != BTREE_INSERT_OK)
-                       goto stop;
-
-               if (!k.k->size)
-                       goto squash;
-
-               /*
-                * Only call advance pos & call hook for nonzero size extents:
-                */
-               ret = extent_insert_advance_pos(&s, k.s_c);
-               if (ret != BTREE_INSERT_OK)
-                       goto stop;
-
-               if (k.k->size &&
-                   (k.k->needs_whiteout || bset_written(b, bset(b, t))))
-                       insert->k->k.needs_whiteout = true;
-
-               if (overlap == BCH_EXTENT_OVERLAP_ALL &&
-                   bkey_whiteout(k.k) &&
-                   k.k->needs_whiteout) {
-                       unreserve_whiteout(b, t, _k);
-                       _k->needs_whiteout = false;
-               }
-squash:
-               ret = extent_squash(&s, insert->k, t, _k, k, overlap);
-               if (ret != BTREE_INSERT_OK)
-                       goto stop;
-       }
-
-       if (ret == BTREE_INSERT_OK &&
-           bkey_cmp(s.committed, insert->k->k.p) < 0)
-               ret = extent_insert_advance_pos(&s, bkey_s_c_null);
-stop:
        extent_insert_committed(&s);
-       /*
-        * Subtract any remaining sectors from @insert, if we bailed out early
-        * and didn't fully insert @insert:
-        */
-       if (insert->k->k.size &&
-           !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
-               bch2_subtract_sectors(&s, bkey_i_to_s_c(insert->k),
-                                    bkey_start_offset(&insert->k->k),
-                                    insert->k->k.size);
-
-       bch2_fs_usage_apply(c, &s.stats, trans->disk_res,
-                          gc_pos_btree_node(b));
 
        EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k->k)));
        EBUG_ON(bkey_cmp(iter->pos, s.committed));
-       EBUG_ON((bkey_cmp(iter->pos, b->key.k.p) == 0) !=
-               !!(iter->flags & BTREE_ITER_AT_END_OF_LEAF));
 
-       if (insert->k->k.size && (iter->flags & BTREE_ITER_AT_END_OF_LEAF))
-               ret = BTREE_INSERT_NEED_TRAVERSE;
-
-       EBUG_ON(insert->k->k.size && ret == BTREE_INSERT_OK);
+       if (insert->k->k.size) {
+               /* got to the end of this leaf node */
+               BUG_ON(bkey_cmp(iter->pos, b->key.k.p));
+               return BTREE_INSERT_NEED_TRAVERSE;
+       }
 
-       return ret;
+       return BTREE_INSERT_OK;
 }
 
-static const char *bch2_extent_invalid(const struct bch_fs *c,
-                                      struct bkey_s_c k)
+const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
-       if (bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX)
-               return "value too big";
-
-       if (!k.k->size)
-               return "zero key size";
+       struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+       const union bch_extent_entry *entry;
+       struct bch_extent_crc_unpacked crc;
+       const struct bch_extent_ptr *ptr;
+       unsigned size_ondisk = e.k->size;
+       const char *reason;
+       unsigned nonce = UINT_MAX;
 
-       switch (k.k->type) {
-       case BCH_EXTENT:
-       case BCH_EXTENT_CACHED: {
-               struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-               const union bch_extent_entry *entry;
-               struct bch_extent_crc_unpacked crc;
-               const struct bch_extent_ptr *ptr;
-               unsigned size_ondisk = e.k->size;
-               const char *reason;
-               unsigned nonce = UINT_MAX;
+       if (bkey_val_u64s(e.k) > BKEY_EXTENT_VAL_U64s_MAX)
+               return "value too big";
 
-               extent_for_each_entry(e, entry) {
-                       if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
-                               return "invalid extent entry type";
+       extent_for_each_entry(e, entry) {
+               if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
+                       return "invalid extent entry type";
 
-                       if (extent_entry_is_crc(entry)) {
-                               crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
+               switch (extent_entry_type(entry)) {
+               case BCH_EXTENT_ENTRY_ptr:
+                       ptr = entry_to_ptr(entry);
 
-                               if (crc.offset + e.k->size >
-                                   crc.uncompressed_size)
-                                       return "checksum offset + key size > uncompressed size";
+                       reason = extent_ptr_invalid(c, e.s_c, &entry->ptr,
+                                                   size_ondisk, false);
+                       if (reason)
+                               return reason;
+                       break;
+               case BCH_EXTENT_ENTRY_crc32:
+               case BCH_EXTENT_ENTRY_crc64:
+               case BCH_EXTENT_ENTRY_crc128:
+                       crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
 
-                               size_ondisk = crc.compressed_size;
+                       if (crc.offset + e.k->size >
+                           crc.uncompressed_size)
+                               return "checksum offset + key size > uncompressed size";
 
-                               if (!bch2_checksum_type_valid(c, crc.csum_type))
-                                       return "invalid checksum type";
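+                       /* pointers after this entry are checked against the compressed size: */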
+                       size_ondisk = crc.compressed_size;
 
-                               if (crc.compression_type >= BCH_COMPRESSION_NR)
-                                       return "invalid compression type";
+                       if (!bch2_checksum_type_valid(c, crc.csum_type))
+                               return "invalid checksum type";
 
-                               if (bch2_csum_type_is_encryption(crc.csum_type)) {
-                                       if (nonce == UINT_MAX)
-                                               nonce = crc.offset + crc.nonce;
-                                       else if (nonce != crc.offset + crc.nonce)
-                                               return "incorrect nonce";
-                               }
-                       } else {
-                               ptr = entry_to_ptr(entry);
+                       if (crc.compression_type >= BCH_COMPRESSION_NR)
+                               return "invalid compression type";
 
-                               reason = extent_ptr_invalid(c, e, &entry->ptr,
-                                                           size_ondisk, false);
-                               if (reason)
-                                       return reason;
+                       if (bch2_csum_type_is_encryption(crc.csum_type)) {
+                               if (nonce == UINT_MAX)
+                                       nonce = crc.offset + crc.nonce;
+                               else if (nonce != crc.offset + crc.nonce)
+                                       return "incorrect nonce";
                        }
+                       break;
+               case BCH_EXTENT_ENTRY_stripe_ptr:
+                       break;
                }
-
-               return NULL;
        }
 
-       case BCH_RESERVATION: {
-               struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
-
-               if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation))
-                       return "incorrect value size";
-
-               if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX)
-                       return "invalid nr_replicas";
-
-               return NULL;
-       }
-
-       default:
-               return "invalid value type";
-       }
+       return NULL;
 }
 
-static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
-                                         struct bkey_s_c_extent e)
+void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b,
+                           struct bkey_s_c k)
 {
-       const struct bch_extent_ptr *ptr;
-       struct bch_dev *ca;
-       struct bucket_mark mark;
-       unsigned seq, stale;
+       struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+       const union bch_extent_entry *entry;
+       struct extent_ptr_decoded p;
        char buf[160];
-       bool bad;
-       unsigned replicas = 0;
 
        /*
         * XXX: we should be doing most/all of these checks at startup time,
@@ -1791,109 +1339,54 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
         * going to get overwritten during replay)
         */
 
-       extent_for_each_ptr(e, ptr) {
-               ca = bch_dev_bkey_exists(c, ptr->dev);
-               replicas++;
-
-               /*
-                * If journal replay hasn't finished, we might be seeing keys
-                * that will be overwritten by the time journal replay is done:
-                */
-               if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
-                       continue;
-
-               stale = 0;
-
-               do {
-                       seq = read_seqcount_begin(&c->gc_pos_lock);
-                       mark = ptr_bucket_mark(ca, ptr);
-
-                       /* between mark and bucket gen */
-                       smp_rmb();
-
-                       stale = ptr_stale(ca, ptr);
-
-                       bch2_fs_bug_on(stale && !ptr->cached, c,
-                                        "stale dirty pointer");
-
-                       bch2_fs_bug_on(stale > 96, c,
-                                        "key too stale: %i",
-                                        stale);
-
-                       if (stale)
-                               break;
-
-                       bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
-                               (mark.data_type != BCH_DATA_USER ||
-                                !(ptr->cached
-                                  ? mark.cached_sectors
-                                  : mark.dirty_sectors));
-               } while (read_seqcount_retry(&c->gc_pos_lock, seq));
+       bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
+                      !bch2_bkey_replicas_marked(c, e.s_c, false), c,
+                      "extent key bad (replicas not marked in superblock):\n%s",
+                      (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf));
 
-               if (bad)
-                       goto bad_ptr;
-       }
-
-       if (replicas > BCH_REPLICAS_MAX) {
-               bch2_bkey_val_to_text(c, btree_node_type(b), buf,
-                                    sizeof(buf), e.s_c);
-               bch2_fs_bug(c,
-                       "extent key bad (too many replicas: %u): %s",
-                       replicas, buf);
-               return;
-       }
-
-       if (!bkey_extent_is_cached(e.k) &&
-           !bch2_bkey_replicas_marked(c, BCH_DATA_USER, e.s_c)) {
-               bch2_bkey_val_to_text(c, btree_node_type(b),
-                                    buf, sizeof(buf), e.s_c);
-               bch2_fs_bug(c,
-                       "extent key bad (replicas not marked in superblock):\n%s",
-                       buf);
+       /*
+        * If journal replay hasn't finished, we might be seeing keys
+        * that will be overwritten by the time journal replay is done:
+        */
+       if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
                return;
-       }
-
-       return;
 
-bad_ptr:
-       bch2_bkey_val_to_text(c, btree_node_type(b), buf,
-                            sizeof(buf), e.s_c);
-       bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu "
-                  "gen %i type %u", buf,
-                  PTR_BUCKET_NR(ca, ptr), mark.gen, mark.data_type);
-       return;
-}
-
-static void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b,
-                                  struct bkey_s_c k)
-{
-       switch (k.k->type) {
-       case BCH_EXTENT:
-       case BCH_EXTENT_CACHED:
-               bch2_extent_debugcheck_extent(c, b, bkey_s_c_to_extent(k));
-               break;
-       case BCH_RESERVATION:
-               break;
-       default:
-               BUG();
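+       /* check each pointer against the bucket marks currently in memory: */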
+       extent_for_each_ptr_decode(e, p, entry) {
+               struct bch_dev *ca      = bch_dev_bkey_exists(c, p.ptr.dev);
+               struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr);
+               unsigned stale          = gen_after(mark.gen, p.ptr.gen);
+               unsigned disk_sectors   = ptr_disk_sectors(p);
+               unsigned mark_sectors   = p.ptr.cached
+                       ? mark.cached_sectors
+                       : mark.dirty_sectors;
+
+               bch2_fs_bug_on(stale && !p.ptr.cached, c,
+                              "stale dirty pointer (ptr gen %u bucket gen %u)",
+                              p.ptr.gen, mark.gen);
+
+               bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale);
+
+               bch2_fs_bug_on(!stale &&
+                              (mark.data_type != BCH_DATA_USER ||
+                               mark_sectors < disk_sectors), c,
+                              "extent pointer not marked: %s:\n"
+                              "type %u sectors %u < %u",
+                              (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf),
+                              mark.data_type,
+                              mark_sectors, disk_sectors);
        }
 }
 
-static void bch2_extent_to_text(struct bch_fs *c, char *buf,
-                               size_t size, struct bkey_s_c k)
+void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c,
+                        struct bkey_s_c k)
 {
-       char *out = buf, *end = buf + size;
        const char *invalid;
 
-#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
-
-       if (bkey_extent_is_data(k.k))
-               out += extent_print_ptrs(c, buf, size, bkey_s_c_to_extent(k));
+       bkey_ptrs_to_text(out, c, k);
 
        invalid = bch2_extent_invalid(c, k);
        if (invalid)
-               p(" invalid: %s", invalid);
-#undef p
+               pr_buf(out, " invalid: %s", invalid);
 }
 
 static void bch2_extent_crc_init(union bch_extent_crc *crc,
@@ -1948,25 +1441,45 @@ static void bch2_extent_crc_init(union bch_extent_crc *crc,
 void bch2_extent_crc_append(struct bkey_i_extent *e,
                            struct bch_extent_crc_unpacked new)
 {
-       struct bch_extent_crc_unpacked crc;
-       const union bch_extent_entry *i;
+       bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new);
+       __extent_entry_push(e);
+}
 
-       BUG_ON(new.compressed_size > new.uncompressed_size);
-       BUG_ON(new.live_size != e->k.size);
-       BUG_ON(!new.compressed_size || !new.uncompressed_size);
+static inline void __extent_entry_insert(struct bkey_i_extent *e,
+                                        union bch_extent_entry *dst,
+                                        union bch_extent_entry *new)
+{
+       union bch_extent_entry *end = extent_entry_last(extent_i_to_s(e));
 
-       /*
-        * Look up the last crc entry, so we can check if we need to add
-        * another:
-        */
-       extent_for_each_crc(extent_i_to_s(e), crc, i)
-               ;
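+       /* shift entries from @dst onwards up to make room for @new: */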
+       memmove_u64s_up((u64 *) dst + extent_entry_u64s(new),
+                       dst, (u64 *) end - (u64 *) dst);
+       e->k.u64s += extent_entry_u64s(new);
+       memcpy(dst, new, extent_entry_bytes(new));
+}
 
-       if (!memcmp(&crc, &new, sizeof(crc)))
-               return;
+void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e,
+                                   struct extent_ptr_decoded *p)
+{
+       struct bch_extent_crc_unpacked crc;
+       union bch_extent_entry *pos;
+       unsigned i;
 
-       bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new);
-       __extent_entry_push(e);
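+       /* look for an existing crc entry that matches, or append a new one: */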
+       extent_for_each_crc(extent_i_to_s(e), crc, pos)
+               if (!bch2_crc_unpacked_cmp(crc, p->crc)) {
+                       pos = extent_entry_next(pos);
+                       goto found;
+               }
+
+       bch2_extent_crc_append(e, p->crc);
+       pos = extent_entry_last(extent_i_to_s(e));
+found:
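+       /* insert the pointer, and any stripe pointers, at @pos: */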
+       p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
+       __extent_entry_insert(e, pos, to_entry(&p->ptr));
+
+       for (i = 0; i < p->ec_nr; i++) {
+               p->ec[i].type = 1 << BCH_EXTENT_ENTRY_stripe_ptr;
+               __extent_entry_insert(e, pos, to_entry(&p->ec[i]));
+       }
 }
 
 /*
@@ -1979,188 +1492,84 @@ void bch2_extent_crc_append(struct bkey_i_extent *e,
  */
 bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
 {
-       struct bkey_s_extent e;
-
-       switch (k.k->type) {
-       case KEY_TYPE_ERROR:
-               return false;
-
-       case KEY_TYPE_DELETED:
-       case KEY_TYPE_COOKIE:
-               return true;
-
-       case KEY_TYPE_DISCARD:
-               return bversion_zero(k.k->version);
+       struct bch_extent_ptr *ptr;
 
-       case BCH_EXTENT:
-       case BCH_EXTENT_CACHED:
-               e = bkey_s_to_extent(k);
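+       /* drop cached pointers that have gone stale: */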
+       bch2_bkey_drop_ptrs(k, ptr,
+               ptr->cached &&
+               ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr));
 
-               bch2_extent_drop_stale(c, e);
+       /* will only happen if all pointers were cached: */
+       if (!bkey_val_u64s(k.k))
+               k.k->type = KEY_TYPE_deleted;
 
-               if (!bkey_val_u64s(e.k)) {
-                       if (bkey_extent_is_cached(e.k)) {
-                               k.k->type = KEY_TYPE_DISCARD;
-                               if (bversion_zero(k.k->version))
-                                       return true;
-                       } else {
-                               k.k->type = KEY_TYPE_ERROR;
-                       }
-               }
-
-               return false;
-       case BCH_RESERVATION:
-               return false;
-       default:
-               BUG();
-       }
+       return false;
 }
 
 void bch2_extent_mark_replicas_cached(struct bch_fs *c,
                                      struct bkey_s_extent e,
-                                     unsigned nr_desired_replicas,
-                                     unsigned target)
+                                     unsigned target,
+                                     unsigned nr_desired_replicas)
 {
-       struct bch_extent_ptr *ptr;
-       int extra = bch2_extent_durability(c, e.c) - nr_desired_replicas;
-
-       if (extra <= 0)
-               return;
+       union bch_extent_entry *entry;
+       struct extent_ptr_decoded p;
+       int extra = bch2_bkey_durability(c, e.s_c) - nr_desired_replicas;
 
-       extent_for_each_ptr(e, ptr) {
-               int n = bch2_extent_ptr_durability(c, ptr);
+       if (target && extra > 0)
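+       /* first, mark pointers on devices outside the target as cached: */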
+               extent_for_each_ptr_decode(e, p, entry) {
+                       int n = bch2_extent_ptr_durability(c, p);
 
-               if (n && n <= extra &&
-                   !dev_in_target(c->devs[ptr->dev], target)) {
-                       ptr->cached = true;
-                       extra -= n;
+                       if (n && n <= extra &&
+                           !bch2_dev_in_target(c, p.ptr.dev, target)) {
+                               entry->ptr.cached = true;
+                               extra -= n;
+                       }
                }
-       }
 
-       extent_for_each_ptr(e, ptr) {
-               int n = bch2_extent_ptr_durability(c, ptr);
+       if (extra > 0)
+               extent_for_each_ptr_decode(e, p, entry) {
+                       int n = bch2_extent_ptr_durability(c, p);
 
-               if (n && n <= extra) {
-                       ptr->cached = true;
-                       extra -= n;
+                       if (n && n <= extra) {
+                               entry->ptr.cached = true;
+                               extra -= n;
+                       }
                }
-       }
-}
-
-/*
- * This picks a non-stale pointer, preferably from a device other than @avoid.
- * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to
- * other devices, it will still pick a pointer from avoid.
- */
-void bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
-                         struct bch_devs_mask *avoid,
-                         struct extent_pick_ptr *ret)
-{
-       struct bkey_s_c_extent e;
-
-       switch (k.k->type) {
-       case KEY_TYPE_DELETED:
-       case KEY_TYPE_DISCARD:
-       case KEY_TYPE_COOKIE:
-               ret->ca = NULL;
-               return;
-
-       case KEY_TYPE_ERROR:
-               ret->ca = ERR_PTR(-EIO);
-               return;
-
-       case BCH_EXTENT:
-       case BCH_EXTENT_CACHED:
-               e = bkey_s_c_to_extent(k);
-               ret->ca = NULL;
-
-               extent_pick_read_device(c, bkey_s_c_to_extent(k), avoid, ret);
-
-               if (!ret->ca && !bkey_extent_is_cached(e.k))
-                       ret->ca = ERR_PTR(-EIO);
-               return;
-
-       case BCH_RESERVATION:
-               ret->ca = NULL;
-               return;
-
-       default:
-               BUG();
-       }
 }
 
-static enum merge_result bch2_extent_merge(struct bch_fs *c,
-                                          struct btree *bk,
-                                          struct bkey_i *l, struct bkey_i *r)
+enum merge_result bch2_extent_merge(struct bch_fs *c,
+                                   struct bkey_i *l, struct bkey_i *r)
 {
-       struct bkey_s_extent el, er;
+       struct bkey_s_extent el = bkey_i_to_s_extent(l);
+       struct bkey_s_extent er = bkey_i_to_s_extent(r);
        union bch_extent_entry *en_l, *en_r;
 
-       if (key_merging_disabled(c))
-               return BCH_MERGE_NOMERGE;
-
-       /*
-        * Generic header checks
-        * Assumes left and right are in order
-        * Left and right must be exactly aligned
-        */
-
-       if (l->k.u64s           != r->k.u64s ||
-           l->k.type           != r->k.type ||
-           bversion_cmp(l->k.version, r->k.version) ||
-           bkey_cmp(l->k.p, bkey_start_pos(&r->k)))
+       if (bkey_val_u64s(&l->k) != bkey_val_u64s(&r->k))
                return BCH_MERGE_NOMERGE;
 
-       switch (l->k.type) {
-       case KEY_TYPE_DELETED:
-       case KEY_TYPE_DISCARD:
-       case KEY_TYPE_ERROR:
-               /* These types are mergeable, and no val to check */
-               break;
-
-       case BCH_EXTENT:
-       case BCH_EXTENT_CACHED:
-               el = bkey_i_to_s_extent(l);
-               er = bkey_i_to_s_extent(r);
+       extent_for_each_entry(el, en_l) {
+               struct bch_extent_ptr *lp, *rp;
+               struct bch_dev *ca;
 
-               extent_for_each_entry(el, en_l) {
-                       struct bch_extent_ptr *lp, *rp;
-                       struct bch_dev *ca;
+               en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data);
 
-                       en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data);
-
-                       if ((extent_entry_type(en_l) !=
-                            extent_entry_type(en_r)) ||
-                           extent_entry_is_crc(en_l))
-                               return BCH_MERGE_NOMERGE;
-
-                       lp = &en_l->ptr;
-                       rp = &en_r->ptr;
-
-                       if (lp->offset + el.k->size     != rp->offset ||
-                           lp->dev                     != rp->dev ||
-                           lp->gen                     != rp->gen)
-                               return BCH_MERGE_NOMERGE;
+               if ((extent_entry_type(en_l) !=
+                    extent_entry_type(en_r)) ||
+                   !extent_entry_is_ptr(en_l))
+                       return BCH_MERGE_NOMERGE;
 
-                       /* We don't allow extents to straddle buckets: */
-                       ca = bch_dev_bkey_exists(c, lp->dev);
+               lp = &en_l->ptr;
+               rp = &en_r->ptr;
 
-                       if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
-                               return BCH_MERGE_NOMERGE;
-               }
+               if (lp->offset + el.k->size     != rp->offset ||
+                   lp->dev                     != rp->dev ||
+                   lp->gen                     != rp->gen)
+                       return BCH_MERGE_NOMERGE;
 
-               break;
-       case BCH_RESERVATION: {
-               struct bkey_i_reservation *li = bkey_i_to_reservation(l);
-               struct bkey_i_reservation *ri = bkey_i_to_reservation(r);
+               /* We don't allow extents to straddle buckets: */
+               ca = bch_dev_bkey_exists(c, lp->dev);
 
-               if (li->v.generation != ri->v.generation ||
-                   li->v.nr_replicas != ri->v.nr_replicas)
+               if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
                        return BCH_MERGE_NOMERGE;
-               break;
-       }
-       default:
-               return BCH_MERGE_NOMERGE;
        }
 
        l->k.needs_whiteout |= r->k.needs_whiteout;
@@ -2179,130 +1588,6 @@ static enum merge_result bch2_extent_merge(struct bch_fs *c,
        return BCH_MERGE_MERGE;
 }
 
-static void extent_i_save(struct btree *b, struct bkey_packed *dst,
-                         struct bkey_i *src)
-{
-       struct bkey_format *f = &b->format;
-       struct bkey_i *dst_unpacked;
-
-       BUG_ON(bkeyp_val_u64s(f, dst) != bkey_val_u64s(&src->k));
-
-       /*
-        * We don't want the bch2_verify_key_order() call in extent_save(),
-        * because we may be out of order with deleted keys that are about to be
-        * removed by extent_bset_insert()
-        */
-
-       if ((dst_unpacked = packed_to_bkey(dst)))
-               bkey_copy(dst_unpacked, src);
-       else
-               BUG_ON(!bch2_bkey_pack(dst, src, f));
-}
-
-static bool extent_merge_one_overlapping(struct btree_iter *iter,
-                                        struct bpos new_pos,
-                                        struct bset_tree *t,
-                                        struct bkey_packed *k, struct bkey uk,
-                                        bool check, bool could_pack)
-{
-       struct btree_iter_level *l = &iter->l[0];
-
-       BUG_ON(!bkey_deleted(k));
-
-       if (check) {
-               return !bkey_packed(k) || could_pack;
-       } else {
-               uk.p = new_pos;
-               extent_save(l->b, &l->iter, k, &uk);
-               bch2_bset_fix_invalidated_key(l->b, t, k);
-               bch2_btree_node_iter_fix(iter, l->b, &l->iter, t,
-                                        k, k->u64s, k->u64s);
-               return true;
-       }
-}
-
-static bool extent_merge_do_overlapping(struct btree_iter *iter,
-                                       struct bkey *m, bool back_merge)
-{
-       struct btree_iter_level *l = &iter->l[0];
-       struct btree *b = l->b;
-       struct btree_node_iter *node_iter = &l->iter;
-       struct bset_tree *t;
-       struct bkey_packed *k;
-       struct bkey uk;
-       struct bpos new_pos = back_merge ? m->p : bkey_start_pos(m);
-       bool could_pack = bkey_pack_pos((void *) &uk, new_pos, b);
-       bool check = true;
-
-       /*
-        * @m is the new merged extent:
-        *
-        * The merge took place in the last bset; we know there can't be any 0
-        * size extents overlapping with m there because if so they would have
-        * been between the two extents we merged.
-        *
-        * But in the other bsets, we have to check for and fix such extents:
-        */
-do_fixup:
-       for_each_bset(b, t) {
-               if (t == bset_tree_last(b))
-                       break;
-
-               /*
-                * if we don't find this bset in the iterator we already got to
-                * the end of that bset, so start searching from the end.
-                */
-               k = bch2_btree_node_iter_bset_pos(node_iter, b, t);
-
-               if (k == btree_bkey_last(b, t))
-                       k = bch2_bkey_prev_all(b, t, k);
-               if (!k)
-                       continue;
-
-               if (back_merge) {
-                       /*
-                        * Back merge: 0 size extents will be before the key
-                        * that was just inserted (and thus the iterator
-                        * position) - walk backwards to find them
-                        */
-                       for (;
-                            k &&
-                            (uk = bkey_unpack_key(b, k),
-                             bkey_cmp(uk.p, bkey_start_pos(m)) > 0);
-                            k = bch2_bkey_prev_all(b, t, k)) {
-                               if (bkey_cmp(uk.p, m->p) >= 0)
-                                       continue;
-
-                               if (!extent_merge_one_overlapping(iter, new_pos,
-                                               t, k, uk, check, could_pack))
-                                       return false;
-                       }
-               } else {
-                       /* Front merge - walk forwards */
-                       for (;
-                            k != btree_bkey_last(b, t) &&
-                            (uk = bkey_unpack_key(b, k),
-                             bkey_cmp(uk.p, m->p) < 0);
-                            k = bkey_next(k)) {
-                               if (bkey_cmp(uk.p,
-                                            bkey_start_pos(m)) <= 0)
-                                       continue;
-
-                               if (!extent_merge_one_overlapping(iter, new_pos,
-                                               t, k, uk, check, could_pack))
-                                       return false;
-                       }
-               }
-       }
-
-       if (check) {
-               check = false;
-               goto do_fixup;
-       }
-
-       return true;
-}
-
 /*
  * When merging an extent that we're inserting into a btree node, the new merged
  * extent could overlap with an existing 0 size extent - if we don't fix that,
@@ -2319,13 +1604,13 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
 {
        struct btree *b = iter->l[0].b;
        struct btree_node_iter *node_iter = &iter->l[0].iter;
-       const struct bkey_format *f = &b->format;
-       struct bset_tree *t = bset_tree_last(b);
-       struct bkey_packed *m;
-       BKEY_PADDED(k) li;
-       BKEY_PADDED(k) ri;
-       struct bkey_i *mi;
-       struct bkey tmp;
+       BKEY_PADDED(k) li, ri;
+       struct bkey_packed *m   = back_merge ? l : r;
+       struct bkey_i *mi       = back_merge ? &li.k : &ri.k;
+       struct bset_tree *t     = bch2_bkey_to_bset(b, m);
+       enum merge_result ret;
+
+       EBUG_ON(bkey_written(b, m));
 
        /*
         * We need to save copies of both l and r, because we might get a
@@ -2334,57 +1619,49 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
        bch2_bkey_unpack(b, &li.k, l);
        bch2_bkey_unpack(b, &ri.k, r);
 
-       m = back_merge ? l : r;
-       mi = back_merge ? &li.k : &ri.k;
+       ret = bch2_bkey_merge(c, &li.k, &ri.k);
+       if (ret == BCH_MERGE_NOMERGE)
+               return false;
 
-       /* l & r should be in last bset: */
-       EBUG_ON(bch2_bkey_to_bset(b, m) != t);
+       /*
+        * check if we overlap with deleted extents - would break the sort
+        * order:
+        */
+       if (back_merge) {
+               struct bkey_packed *n = bkey_next(m);
 
-       switch (bch2_extent_merge(c, b, &li.k, &ri.k)) {
-       case BCH_MERGE_NOMERGE:
-               return false;
-       case BCH_MERGE_PARTIAL:
-               if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &mi->k, f))
+               if (n != btree_bkey_last(b, t) &&
+                   bkey_cmp_left_packed(b, n, &li.k.k.p) <= 0 &&
+                   bkey_deleted(n))
                        return false;
+       } else if (ret == BCH_MERGE_MERGE) {
+               struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
 
-               if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge))
+               if (prev &&
+                   bkey_cmp_left_packed_byval(b, prev,
+                               bkey_start_pos(&li.k.k)) > 0)
                        return false;
+       }
 
-               extent_i_save(b, m, mi);
-               bch2_bset_fix_invalidated_key(b, t, m);
-
-               /*
-                * Update iterator to reflect what we just inserted - otherwise,
-                * the iter_fix() call is going to put us _before_ the key we
-                * just partially merged with:
-                */
-               if (back_merge)
-                       bch2_btree_iter_set_pos_same_leaf(iter, li.k.k.p);
-
-               bch2_btree_node_iter_fix(iter, b, node_iter,
-                                        t, m, m->u64s, m->u64s);
+       if (ret == BCH_MERGE_PARTIAL) {
+               if (!extent_i_save(b, m, mi))
+                       return false;
 
                if (!back_merge)
                        bkey_copy(packed_to_bkey(l), &li.k);
                else
                        bkey_copy(packed_to_bkey(r), &ri.k);
-               return false;
-       case BCH_MERGE_MERGE:
-               if (bkey_packed(m) && !bch2_bkey_pack_key((void *) &tmp, &li.k.k, f))
-                       return false;
-
-               if (!extent_merge_do_overlapping(iter, &li.k.k, back_merge))
+       } else {
+               if (!extent_i_save(b, m, &li.k))
                        return false;
+       }
 
-               extent_i_save(b, m, &li.k);
-               bch2_bset_fix_invalidated_key(b, t, m);
+       bch2_bset_fix_invalidated_key(b, m);
+       bch2_btree_node_iter_fix(iter, b, node_iter,
+                                m, m->u64s, m->u64s);
+       verify_modified_extent(iter, m);
 
-               bch2_btree_node_iter_fix(iter, b, node_iter,
-                                        t, m, m->u64s, m->u64s);
-               return true;
-       default:
-               BUG();
-       }
+       return ret == BCH_MERGE_MERGE;
 }
 
 int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
@@ -2411,12 +1688,53 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
        return ret;
 }
 
-const struct bkey_ops bch2_bkey_extent_ops = {
-       .key_invalid    = bch2_extent_invalid,
-       .key_debugcheck = bch2_extent_debugcheck,
-       .val_to_text    = bch2_extent_to_text,
-       .swab           = bch2_ptr_swab,
-       .key_normalize  = bch2_ptr_normalize,
-       .key_merge      = bch2_extent_merge,
-       .is_extents     = true,
-};
+/* KEY_TYPE_reservation: */
+
+const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+
+       if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation))
+               return "incorrect value size";
+
+       if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX)
+               return "invalid nr_replicas";
+
+       return NULL;
+}
+
+void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
+                             struct bkey_s_c k)
+{
+       struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+
+       pr_buf(out, "generation %u replicas %u",
+              le32_to_cpu(r.v->generation),
+              r.v->nr_replicas);
+}
+
+enum merge_result bch2_reservation_merge(struct bch_fs *c,
+                                        struct bkey_i *l, struct bkey_i *r)
+{
+       struct bkey_i_reservation *li = bkey_i_to_reservation(l);
+       struct bkey_i_reservation *ri = bkey_i_to_reservation(r);
+
+       if (li->v.generation != ri->v.generation ||
+           li->v.nr_replicas != ri->v.nr_replicas)
+               return BCH_MERGE_NOMERGE;
+
+       l->k.needs_whiteout |= r->k.needs_whiteout;
+
+       /*
+        * Keys with no pointers aren't restricted to one bucket and could
+        * exceed KEY_SIZE_MAX:
+        */
+       if ((u64) l->k.size + r->k.size > KEY_SIZE_MAX) {
+               bch2_key_resize(&l->k, KEY_SIZE_MAX);
+               bch2_cut_front(l->k.p, r);
+               return BCH_MERGE_PARTIAL;
+       }
+
+       bch2_key_resize(&l->k, l->k.size + r->k.size);
+
+       return BCH_MERGE_MERGE;
+}