git.sesse.net Git - bcachefs-tools-debian/blobdiff - libbcachefs/extents.c
Update bcachefs sources to bed61fae3b bcachefs: Delete a faulty assertion
[bcachefs-tools-debian] / libbcachefs / extents.c
index 89b5be907eeafbd13bd7cb2ea743ab33ca91110f..1b25f84e4b9cb883fe36dd70bfe43a8df10484aa 100644 (file)
 #include "replicas.h"
 #include "super.h"
 #include "super-io.h"
+#include "trace.h"
 #include "util.h"
 
-#include <trace/events/bcachefs.h>
-
 static unsigned bch2_crc_field_size_max[] = {
        [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
        [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
@@ -115,6 +114,13 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
                return -EIO;
 
        bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+               /*
+                * Unwritten extent: no need to actually read, treat it as a
+                * hole and return 0s:
+                */
+               if (p.ptr.unwritten)
+                       return 0;
+
                ca = bch_dev_bkey_exists(c, p.ptr.dev);
 
                /*
@@ -156,12 +162,17 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
 
 /* KEY_TYPE_btree_ptr: */
 
-const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k,  /* returns 0, or -BCH_ERR_invalid_bkey with the reason in @err */
+                          enum bkey_invalid_flags flags,  /* only forwarded to bch2_bkey_ptrs_invalid() */
+                          struct printbuf *err)  /* receives the failure message */
 {
-       if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX)
-               return "value too big";
+       if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) {  /* value is larger than BCH_REPLICAS_MAX u64s */
+               prt_printf(err, "value too big (%zu > %u)",
+                      bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
+               return -BCH_ERR_invalid_bkey;
+       }
 
-       return bch2_bkey_ptrs_invalid(c, k);
+       return bch2_bkey_ptrs_invalid(c, k, flags, err);  /* size is acceptable: validate the individual extent entries */
 }
 
 void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
@@ -170,35 +181,31 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
        bch2_bkey_ptrs_to_text(out, c, k);
 }
 
-const char *bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,  /* returns 0, or -BCH_ERR_invalid_bkey with the reason in @err */
+                             enum bkey_invalid_flags flags,  /* only forwarded to bch2_bkey_ptrs_invalid() */
+                             struct printbuf *err)  /* receives the failure message */
 {
-       struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
-
-       if (bkey_val_bytes(k.k) <= sizeof(*bp.v))
-               return "value too small";
-
-       if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
-               return "value too big";
-
-       if (c->sb.version < bcachefs_metadata_version_snapshot &&
-           bp.v->min_key.snapshot)
-               return "invalid min_key.snapshot";
+       if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) {  /* only the upper size bound is checked here */
+               prt_printf(err, "value too big (%zu > %zu)",  /* NOTE(review): the removed "value too small" and min_key.snapshot checks have no replacement in this function - confirm they are enforced elsewhere */
+                      bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
+               return -BCH_ERR_invalid_bkey;
+       }
 
-       return bch2_bkey_ptrs_invalid(c, k);
+       return bch2_bkey_ptrs_invalid(c, k, flags, err);  /* size is acceptable: validate the individual extent entries */
 }
 
 void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
-                           struct bkey_s_c k)
+                              struct bkey_s_c k)  /* prints seq, sectors_written and min_key of @k to @out, followed by its pointer list */
 {
        struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
 
-       pr_buf(out, "seq %llx written %u min_key %s",
+       prt_printf(out, "seq %llx written %u min_key %s",  /* the %s slot prints "R " when BTREE_PTR_RANGE_UPDATED(bp.v) is set */
               le64_to_cpu(bp.v->seq),
               le16_to_cpu(bp.v->sectors_written),
               BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : "");
 
        bch2_bpos_to_text(out, bp.v->min_key);
-       pr_buf(out, " ");
+       prt_printf(out, " ");  /* separator between min_key and the pointer list */
        bch2_bkey_ptrs_to_text(out, c, k);
 }
 
@@ -211,8 +218,8 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
        compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key);
 
        if (version < bcachefs_metadata_version_inode_btree_change &&
-           btree_node_type_is_extents(btree_id) &&
-           bkey_cmp(bp.v->min_key, POS_MIN))
+           btree_id_is_extents(btree_id) &&
+           !bkey_eq(bp.v->min_key, POS_MIN))
                bp.v->min_key = write
                        ? bpos_nosnap_predecessor(bp.v->min_key)
                        : bpos_nosnap_successor(bp.v->min_key);
@@ -220,17 +227,6 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
 
 /* KEY_TYPE_extent: */
 
-const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
-{
-       return bch2_bkey_ptrs_invalid(c, k);
-}
-
-void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c,
-                        struct bkey_s_c k)
-{
-       bch2_bkey_ptrs_to_text(out, c, k);
-}
-
 bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
 {
        struct bkey_ptrs   l_ptrs = bch2_bkey_ptrs(l);
@@ -265,6 +261,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
                    rp.ptr.offset + rp.crc.offset ||
                    lp.ptr.dev                  != rp.ptr.dev ||
                    lp.ptr.gen                  != rp.ptr.gen ||
+                   lp.ptr.unwritten            != rp.ptr.unwritten ||
                    lp.has_ec                   != rp.has_ec)
                        return false;
 
@@ -287,7 +284,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
                if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <=
                    lp.crc.uncompressed_size) {
                        /* can use left extent's crc entry */
-               } else if (lp.crc.live_size <= rp.crc.offset ) {
+               } else if (lp.crc.live_size <= rp.crc.offset) {
                        /* can use right extent's crc entry */
                } else {
                        /* check if checksums can be merged: */
@@ -303,10 +300,22 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
 
                        if (lp.crc.csum_type &&
                            lp.crc.uncompressed_size +
-                           rp.crc.uncompressed_size > c->sb.encoded_extent_max)
+                           rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9))
                                return false;
+               }
 
-                       if (lp.crc.uncompressed_size + rp.crc.uncompressed_size >
+               en_l = extent_entry_next(en_l);
+               en_r = extent_entry_next(en_r);
+       }
+
+       en_l = l_ptrs.start;
+       en_r = r_ptrs.start;
+       while (en_l < l_ptrs.end && en_r < r_ptrs.end) {
+               if (extent_entry_is_crc(en_l)) {
+                       struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
+                       struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
+
+                       if (crc_l.uncompressed_size + crc_r.uncompressed_size >
                            bch2_crc_field_size_max[extent_entry_type(en_l)])
                                return false;
                }
@@ -334,7 +343,7 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
                        if (crc_l.offset + crc_l.live_size + crc_r.live_size <=
                            crc_l.uncompressed_size) {
                                /* can use left extent's crc entry */
-                       } else if (crc_l.live_size <= crc_r.offset ) {
+                       } else if (crc_l.live_size <= crc_r.offset) {
                                /* can use right extent's crc entry */
                                crc_r.offset -= crc_l.live_size;
                                bch2_extent_crc_pack(entry_to_crc(en_l), crc_r,
@@ -363,17 +372,19 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
 
 /* KEY_TYPE_reservation: */
 
-const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k,  /* returns 0, or -BCH_ERR_invalid_bkey with the reason in @err */
+                            enum bkey_invalid_flags flags,  /* not referenced in this function */
+                            struct printbuf *err)  /* receives the failure message */
 {
        struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
 
-       if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation))
-               return "incorrect value size";
-
-       if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX)
-               return "invalid nr_replicas";
+       if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) {  /* nr_replicas must be in 1..BCH_REPLICAS_MAX */
+               prt_printf(err, "invalid nr_replicas (%u)",  /* NOTE(review): the removed value-size check has no replacement in this function - confirm it is enforced elsewhere */
+                      r.v->nr_replicas);
+               return -BCH_ERR_invalid_bkey;
+       }
 
-       return NULL;
+       return 0;  /* no check failed */
 }
 
 void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
@@ -381,7 +392,7 @@ void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
 {
        struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
 
-       pr_buf(out, "generation %u replicas %u",
+       prt_printf(out, "generation %u replicas %u",
               le32_to_cpu(r.v->generation),
               r.v->nr_replicas);
 }
@@ -480,7 +491,7 @@ restart_narrow_pointers:
 
        bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
                if (can_narrow_crc(p.crc, n)) {
-                       __bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr);
+                       bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr);
                        p.ptr.offset += p.crc.offset;
                        p.crc = n;
                        bch2_extent_ptr_decoded_append(k, &p);
@@ -506,13 +517,13 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst,
        switch (type) {
        case BCH_EXTENT_ENTRY_crc32:
                set_common_fields(dst->crc32, src);
-               dst->crc32.csum  = *((__le32 *) &src.csum.lo);
+               dst->crc32.csum         = (u32 __force) *((__le32 *) &src.csum.lo);
                break;
        case BCH_EXTENT_ENTRY_crc64:
                set_common_fields(dst->crc64, src);
                dst->crc64.nonce        = src.nonce;
-               dst->crc64.csum_lo      = src.csum.lo;
-               dst->crc64.csum_hi      = *((__le16 *) &src.csum.hi);
+               dst->crc64.csum_lo      = (u64 __force) src.csum.lo;
+               dst->crc64.csum_hi      = (u64 __force) *((__le16 *) &src.csum.hi);
                break;
        case BCH_EXTENT_ENTRY_crc128:
                set_common_fields(dst->crc128, src);
@@ -633,24 +644,37 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
        return replicas;
 }
 
-static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
-                                          struct extent_ptr_decoded p)
+unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)  /* durability of @p without looking at the device state */
 {
-       unsigned durability = 0;
        struct bch_dev *ca;
 
-       if (p.ptr.cached)
+       if (p->ptr.cached)  /* cached pointers contribute no durability */
                return 0;
 
-       ca = bch_dev_bkey_exists(c, p.ptr.dev);
+       ca = bch_dev_bkey_exists(c, p->ptr.dev);
 
-       if (ca->mi.state != BCH_MEMBER_STATE_failed)
-               durability = max_t(unsigned, durability, ca->mi.durability);
+       return ca->mi.durability +  /* device durability plus stripe redundancy; unlike bch2_extent_ptr_durability(), ca->mi.state is not consulted */
+               (p->has_ec
+                ? p->ec.redundancy
+                : 0);
+}
 
-       if (p.has_ec)
-               durability += p.ec.redundancy;
+unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)  /* durability currently provided by @p; 0 for cached pointers and failed devices */
+{
+       struct bch_dev *ca;
 
-       return durability;
+       if (p->ptr.cached)  /* cached pointers contribute no durability */
+               return 0;
+
+       ca = bch_dev_bkey_exists(c, p->ptr.dev);
+
+       if (ca->mi.state == BCH_MEMBER_STATE_failed)  /* behaviour change: the removed code still added p.ec.redundancy for a failed device */
+               return 0;
+
+       return ca->mi.durability +  /* device durability plus stripe redundancy when the pointer is erasure coded */
+               (p->has_ec
+                ? p->ec.redundancy
+                : 0);
 }
 
 unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
@@ -661,40 +685,23 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
        unsigned durability = 0;
 
        bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-               durability += bch2_extent_ptr_durability(c, p);
+               durability += bch2_extent_ptr_durability(c, &p);
 
        return durability;
 }
 
-void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k,
-                                   unsigned target,
-                                   unsigned nr_desired_replicas)
+static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k)  /* sum of per-pointer durability, skipping pointers whose device cannot be looked up */
 {
-       struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
-       union bch_extent_entry *entry;
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
-       int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas;
-
-       if (target && extra > 0)
-               bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-                       int n = bch2_extent_ptr_durability(c, p);
-
-                       if (n && n <= extra &&
-                           !bch2_dev_in_target(c, p.ptr.dev, target)) {
-                               entry->ptr.cached = true;
-                               extra -= n;
-                       }
-               }
+       unsigned durability = 0;
 
-       if (extra > 0)
-               bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-                       int n = bch2_extent_ptr_durability(c, p);
+       bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+               if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev])  /* only count pointers whose device index is in range and present in c->devs */
+                       durability += bch2_extent_ptr_durability(c, &p);
 
-                       if (n && n <= extra) {
-                               entry->ptr.cached = true;
-                               extra -= n;
-                       }
-               }
+       return durability;
 }
 
 void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry)
@@ -706,41 +713,6 @@ void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry
        k->k.u64s -= extent_entry_u64s(entry);
 }
 
-void bch2_bkey_append_ptr(struct bkey_i *k,
-                         struct bch_extent_ptr ptr)
-{
-       EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev));
-
-       switch (k->k.type) {
-       case KEY_TYPE_btree_ptr:
-       case KEY_TYPE_btree_ptr_v2:
-       case KEY_TYPE_extent:
-               EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
-
-               ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
-
-               memcpy((void *) &k->v + bkey_val_bytes(&k->k),
-                      &ptr,
-                      sizeof(ptr));
-               k->u64s++;
-               break;
-       default:
-               BUG();
-       }
-}
-
-static inline void __extent_entry_insert(struct bkey_i *k,
-                                        union bch_extent_entry *dst,
-                                        union bch_extent_entry *new)
-{
-       union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k));
-
-       memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new),
-                             dst, (u64 *) end - (u64 *) dst);
-       k->k.u64s += extent_entry_u64s(new);
-       memcpy(dst, new, extent_entry_bytes(new));
-}
-
 void bch2_extent_ptr_decoded_append(struct bkey_i *k,
                                    struct extent_ptr_decoded *p)
 {
@@ -800,8 +772,8 @@ static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
 /*
  * Returns pointer to the next entry after the one being dropped:
  */
-union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k,
-                                            struct bch_extent_ptr *ptr)
+union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k,
+                                                  struct bch_extent_ptr *ptr)
 {
        struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
        union bch_extent_entry *entry = to_entry(ptr), *next;
@@ -844,7 +816,7 @@ union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
 {
        bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
        union bch_extent_entry *ret =
-               __bch2_bkey_drop_ptr(k, ptr);
+               bch2_bkey_drop_ptr_noerror(k, ptr);
 
        /*
         * If we deleted all the dirty pointers and there's still cached
@@ -873,8 +845,15 @@ void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
        bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
 }
 
-const struct bch_extent_ptr *
-bch2_bkey_has_device(struct bkey_s_c k, unsigned dev)
+void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)  /* drop @k's pointer to device @dev, if it has one */
+{
+       struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev);  /* NULL-checked below before use */
+
+       if (ptr)
+               bch2_bkey_drop_ptr_noerror(k, ptr);  /* NOTE(review): presumably skips the post-drop handling done by bch2_bkey_drop_ptr() - confirm */
+}
+
+const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const struct bch_extent_ptr *ptr;
@@ -917,6 +896,78 @@ bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
        return false;
 }
 
+/* Returns true if two extents refer to the same data: the key types must be
+ * equal and, when bkey_extent_is_direct_data(k1.k) holds, so must the unwritten
+ * state, and some pointer pair must agree on dev, gen and key-relative offset. */
+bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
+{
+       if (k1.k->type != k2.k->type)
+               return false;
+
+       if (bkey_extent_is_direct_data(k1.k)) {
+               struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1);
+               struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2);
+               const union bch_extent_entry *entry1, *entry2;
+               struct extent_ptr_decoded p1, p2;
+
+               if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2))
+                       return false;
+
+               bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)  /* compare every pointer of k1 against every pointer of k2 */
+                       bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+                               if (p1.ptr.dev          == p2.ptr.dev &&
+                                   p1.ptr.gen          == p2.ptr.gen &&
+                                   (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==  /* ptr offset + crc offset, relative to the key's start offset */
+                                   (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+                                       return true;  /* one matching pair is enough */
+
+               return false;
+       } else {
+               /* KEY_TYPE_deleted, etc. */
+               return true;
+       }
+}
+
+struct bch_extent_ptr *
+bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2)  /* returns the pointer in @k2 that matches @p1, or NULL if none does */
+{
+       struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2);
+       union bch_extent_entry *entry2;
+       struct extent_ptr_decoded p2;
+
+       bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
+               if (p1.ptr.dev          == p2.ptr.dev &&  /* same criteria as bch2_extents_match(): dev, gen and key-relative offset */
+                   p1.ptr.gen          == p2.ptr.gen &&
+                   (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==  /* @k1 only supplies the key start offset for @p1 */
+                   (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+                       return &entry2->ptr;
+
+       return NULL;
+}
+
+void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr)  /* mark @ptr, which must be an entry of @k, as cached */
+{
+       struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+       union bch_extent_entry *entry;
+       union bch_extent_entry *ec = NULL;  /* stripe_ptr entry seen since the last ptr entry, if any */
+
+       bkey_extent_entry_for_each(ptrs, entry) {
+               if (&entry->ptr == ptr) {
+                       ptr->cached = true;
+                       if (ec)  /* bch2_bkey_ptrs_invalid() rejects a cached ptr that follows a stripe entry, so drop it */
+                               extent_entry_drop(k, ec);
+                       return;
+               }
+
+               if (extent_entry_is_stripe_ptr(entry))
+                       ec = entry;
+               else if (extent_entry_is_ptr(entry))
+                       ec = NULL;  /* a different ptr entry intervened; forget the stripe entry */
+       }
+
+       BUG();  /* reached only when @ptr is not one of @k's entries */
+}
+
 /*
  * bch_extent_normalize - clean up an extent, dropping stale pointers etc.
  *
@@ -947,29 +998,44 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
        struct bch_dev *ca;
        bool first = true;
 
+       if (c)
+               prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k));
+
        bkey_extent_entry_for_each(ptrs, entry) {
                if (!first)
-                       pr_buf(out, " ");
+                       prt_printf(out, " ");
 
                switch (__extent_entry_type(entry)) {
                case BCH_EXTENT_ENTRY_ptr:
                        ptr = entry_to_ptr(entry);
-                       ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+                       ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
                                ? bch_dev_bkey_exists(c, ptr->dev)
                                : NULL;
 
-                       pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev,
-                              (u64) ptr->offset, ptr->gen,
-                              ptr->cached ? " cached" : "",
-                              ca && ptr_stale(ca, ptr)
-                              ? " stale" : "");
+                       if (!ca) {
+                               prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
+                                      (u64) ptr->offset, ptr->gen,
+                                      ptr->cached ? " cached" : "");
+                       } else {
+                               u32 offset;
+                               u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+
+                               prt_printf(out, "ptr: %u:%llu:%u gen %u",
+                                          ptr->dev, b, offset, ptr->gen);
+                               if (ptr->cached)
+                                       prt_str(out, " cached");
+                               if (ptr->unwritten)
+                                       prt_str(out, " unwritten");
+                               if (ca && ptr_stale(ca, ptr))
+                                       prt_printf(out, " stale");
+                       }
                        break;
                case BCH_EXTENT_ENTRY_crc32:
                case BCH_EXTENT_ENTRY_crc64:
                case BCH_EXTENT_ENTRY_crc128:
                        crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
 
-                       pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
+                       prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
                               crc.compressed_size,
                               crc.uncompressed_size,
                               crc.offset, crc.nonce,
@@ -979,11 +1045,11 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
                case BCH_EXTENT_ENTRY_stripe_ptr:
                        ec = &entry->stripe_ptr;
 
-                       pr_buf(out, "ec: idx %llu block %u",
+                       prt_printf(out, "ec: idx %llu block %u",
                               (u64) ec->idx, ec->block);
                        break;
                default:
-                       pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
+                       prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
                        return;
                }
 
@@ -991,69 +1057,118 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
        }
 }
 
-static const char *extent_ptr_invalid(const struct bch_fs *c,
-                                     struct bkey_s_c k,
-                                     const struct bch_extent_ptr *ptr,
-                                     unsigned size_ondisk,
-                                     bool metadata)
+static int extent_ptr_invalid(const struct bch_fs *c,  /* validates one pointer of @k; returns 0, or -BCH_ERR_invalid_bkey with the reason in @err */
+                             struct bkey_s_c k,
+                             enum bkey_invalid_flags flags,
+                             const struct bch_extent_ptr *ptr,
+                             unsigned size_ondisk,
+                             bool metadata,  /* not referenced in the body below */
+                             struct printbuf *err)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const struct bch_extent_ptr *ptr2;
+       u64 bucket;
+       u32 bucket_offset;
        struct bch_dev *ca;
 
-       if (!bch2_dev_exists2(c, ptr->dev))
-               return "pointer to invalid device";
+       if (!bch2_dev_exists2(c, ptr->dev)) {
+               /*
+                * If we're in the write path this key might have already been
+                * overwritten, and we could be seeing a device that doesn't
+                * exist anymore due to racing with device removal:
+                */
+               if (flags & BKEY_INVALID_WRITE)
+                       return 0;
 
-       ca = bch_dev_bkey_exists(c, ptr->dev);
-       if (!ca)
-               return "pointer to invalid device";
+               prt_printf(err, "pointer to invalid device (%u)", ptr->dev);
+               return -BCH_ERR_invalid_bkey;
+       }
 
+       ca = bch_dev_bkey_exists(c, ptr->dev);  /* looked up only after the existence check above */
        bkey_for_each_ptr(ptrs, ptr2)
-               if (ptr != ptr2 && ptr->dev == ptr2->dev)
-                       return "multiple pointers to same device";
+               if (ptr != ptr2 && ptr->dev == ptr2->dev) {  /* another pointer of the same key names this device */
+                       prt_printf(err, "multiple pointers to same device (%u)", ptr->dev);
+                       return -BCH_ERR_invalid_bkey;
+               }
 
-       if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets))
-               return "offset past end of device";
+       bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);  /* NOTE(review): presumably splits ptr->offset into (bucket, offset within bucket) - confirm */
 
-       if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket))
-               return "offset before first bucket";
+       if (bucket >= ca->mi.nbuckets) {  /* the message below prints ">" although the test is ">=" */
+               prt_printf(err, "pointer past last bucket (%llu > %llu)",
+                      bucket, ca->mi.nbuckets);
+               return -BCH_ERR_invalid_bkey;
+       }
 
-       if (bucket_remainder(ca, ptr->offset) +
-           size_ondisk > ca->mi.bucket_size)
-               return "spans multiple buckets";
+       if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) {
+               prt_printf(err, "pointer before first bucket (%llu < %u)",
+                      bucket, ca->mi.first_bucket);
+               return -BCH_ERR_invalid_bkey;
+       }
 
-       return NULL;
+       if (bucket_offset + size_ondisk > ca->mi.bucket_size) {  /* bucket_offset + size_ondisk must not exceed the bucket size */
+               prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)",
+                      bucket_offset, size_ondisk, ca->mi.bucket_size);
+               return -BCH_ERR_invalid_bkey;
+       }
+
+       return 0;
 }
 
-const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k)
+int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
+                          enum bkey_invalid_flags flags,
+                          struct printbuf *err)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       struct bch_devs_list devs;
        const union bch_extent_entry *entry;
        struct bch_extent_crc_unpacked crc;
        unsigned size_ondisk = k.k->size;
-       const char *reason;
        unsigned nonce = UINT_MAX;
-       unsigned i;
+       unsigned nr_ptrs = 0;
+       bool unwritten = false, have_ec = false, crc_since_last_ptr = false;
+       int ret;
 
-       if (k.k->type == KEY_TYPE_btree_ptr ||
-           k.k->type == KEY_TYPE_btree_ptr_v2)
-               size_ondisk = c->opts.btree_node_size;
+       if (bkey_is_btree_ptr(k.k))
+               size_ondisk = btree_sectors(c);
 
        bkey_extent_entry_for_each(ptrs, entry) {
-               if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
-                       return "invalid extent entry type";
+               if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) {
+                       prt_printf(err, "invalid extent entry type (got %u, max %u)",
+                              __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
+                       return -BCH_ERR_invalid_bkey;
+               }
 
-               if (k.k->type == KEY_TYPE_btree_ptr &&
-                   !extent_entry_is_ptr(entry))
-                       return "has non ptr field";
+               if (bkey_is_btree_ptr(k.k) &&
+                   !extent_entry_is_ptr(entry)) {
+                       prt_printf(err, "has non ptr field");
+                       return -BCH_ERR_invalid_bkey;
+               }
 
                switch (extent_entry_type(entry)) {
                case BCH_EXTENT_ENTRY_ptr:
-                       reason = extent_ptr_invalid(c, k, &entry->ptr,
-                                                   size_ondisk, false);
-                       if (reason)
-                               return reason;
+                       ret = extent_ptr_invalid(c, k, flags, &entry->ptr,
+                                                size_ondisk, false, err);
+                       if (ret)
+                               return ret;
+
+                       if (nr_ptrs && unwritten != entry->ptr.unwritten) {
+                               prt_printf(err, "extent with unwritten and written ptrs");
+                               return -BCH_ERR_invalid_bkey;
+                       }
+
+                       if (k.k->type != KEY_TYPE_extent && entry->ptr.unwritten) {
+                               prt_printf(err, "has unwritten ptrs");
+                               return -BCH_ERR_invalid_bkey;
+                       }
+
+                       if (entry->ptr.cached && have_ec) {
+                               prt_printf(err, "cached, erasure coded ptr");
+                               return -BCH_ERR_invalid_bkey;
+                       }
+
+                       unwritten = entry->ptr.unwritten;
+                       have_ec = false;
+                       crc_since_last_ptr = false;
+                       nr_ptrs++;
                        break;
                case BCH_EXTENT_ENTRY_crc32:
                case BCH_EXTENT_ENTRY_crc64:
@@ -1061,36 +1176,71 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k)
                        crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
 
                        if (crc.offset + crc.live_size >
-                           crc.uncompressed_size)
-                               return "checksum offset + key size > uncompressed size";
+                           crc.uncompressed_size) {
+                               prt_printf(err, "checksum offset + key size > uncompressed size");
+                               return -BCH_ERR_invalid_bkey;
+                       }
 
                        size_ondisk = crc.compressed_size;
 
-                       if (!bch2_checksum_type_valid(c, crc.csum_type))
-                               return "invalid checksum type";
+                       if (!bch2_checksum_type_valid(c, crc.csum_type)) {
+                               prt_printf(err, "invalid checksum type");
+                               return -BCH_ERR_invalid_bkey;
+                       }
 
-                       if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR)
-                               return "invalid compression type";
+                       if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) {
+                               prt_printf(err, "invalid compression type");
+                               return -BCH_ERR_invalid_bkey;
+                       }
 
                        if (bch2_csum_type_is_encryption(crc.csum_type)) {
                                if (nonce == UINT_MAX)
                                        nonce = crc.offset + crc.nonce;
-                               else if (nonce != crc.offset + crc.nonce)
-                                       return "incorrect nonce";
+                               else if (nonce != crc.offset + crc.nonce) {
+                                       prt_printf(err, "incorrect nonce");
+                                       return -BCH_ERR_invalid_bkey;
+                               }
                        }
+
+                       if (crc_since_last_ptr) {
+                               prt_printf(err, "redundant crc entry");
+                               return -BCH_ERR_invalid_bkey;
+                       }
+                       crc_since_last_ptr = true;
                        break;
                case BCH_EXTENT_ENTRY_stripe_ptr:
+                       if (have_ec) {
+                               prt_printf(err, "redundant stripe entry");
+                               return -BCH_ERR_invalid_bkey;
+                       }
+                       have_ec = true;
+                       break;
+               case BCH_EXTENT_ENTRY_rebalance:
                        break;
                }
        }
 
-       devs = bch2_bkey_devs(k);
-       bubble_sort(devs.devs, devs.nr, u8_cmp);
-       for (i = 0; i + 1 < devs.nr; i++)
-               if (devs.devs[i] == devs.devs[i + 1])
-                       return "multiple ptrs to same device";
+       if (!nr_ptrs) {
+               prt_str(err, "no ptrs");
+               return -BCH_ERR_invalid_bkey;
+       }
+
+       if (nr_ptrs >= BCH_BKEY_PTRS_MAX) {
+               prt_str(err, "too many ptrs");
+               return -BCH_ERR_invalid_bkey;
+       }
 
-       return NULL;
+       if (crc_since_last_ptr) {
+               prt_printf(err, "redundant crc entry");
+               return -BCH_ERR_invalid_bkey;
+       }
+
+       if (have_ec) {
+               prt_printf(err, "redundant stripe entry");
+               return -BCH_ERR_invalid_bkey;
+       }
+
+       return 0;
 }
 
 void bch2_ptr_swab(struct bkey_s k)
@@ -1125,6 +1275,8 @@ void bch2_ptr_swab(struct bkey_s k)
                        break;
                case BCH_EXTENT_ENTRY_stripe_ptr:
                        break;
+               case BCH_EXTENT_ENTRY_rebalance:
+                       break;
                }
        }
 }
@@ -1137,10 +1289,10 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k)
        int val_u64s_delta;
        u64 sub;
 
-       if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0)
+       if (bkey_le(where, bkey_start_pos(k.k)))
                return 0;
 
-       EBUG_ON(bkey_cmp(where, k.k->p) > 0);
+       EBUG_ON(bkey_gt(where, k.k->p));
 
        sub = where.offset - bkey_start_offset(k.k);
 
@@ -1175,6 +1327,8 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k)
                                break;
                        case BCH_EXTENT_ENTRY_stripe_ptr:
                                break;
+                       case BCH_EXTENT_ENTRY_rebalance:
+                               break;
                        }
 
                        if (extent_entry_is_crc(entry))
@@ -1217,10 +1371,10 @@ int bch2_cut_back_s(struct bpos where, struct bkey_s k)
        int val_u64s_delta;
        u64 len = 0;
 
-       if (bkey_cmp(where, k.k->p) >= 0)
+       if (bkey_ge(where, k.k->p))
                return 0;
 
-       EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0);
+       EBUG_ON(bkey_lt(where, bkey_start_pos(k.k)));
 
        len = where.offset - bkey_start_offset(k.k);