git.sesse.net Git - bcachefs-tools-debian/blobdiff - libbcachefs/extents.c
Update bcachefs sources to 2a6125decb43 bcachefs: bch_sb_field_downgrade
[bcachefs-tools-debian] / libbcachefs / extents.c
index 7e00550980de3f4c9a8c1eef3079677f8177c502..82ec056f4cdbb1f4e4234fce274939b61b7a5015 100644 (file)
@@ -13,6 +13,7 @@
 #include "btree_iter.h"
 #include "buckets.h"
 #include "checksum.h"
+#include "compress.h"
 #include "debug.h"
 #include "disk_groups.h"
 #include "error.h"
@@ -162,16 +163,19 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
 
 /* KEY_TYPE_btree_ptr: */
 
-int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                          unsigned flags, struct printbuf *err)
+int bch2_btree_ptr_invalid(struct bch_fs *c, struct bkey_s_c k,
+                          enum bkey_invalid_flags flags,
+                          struct printbuf *err)
 {
-       if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) {
-               prt_printf(err, "value too big (%zu > %u)",
-                      bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
-               return -BCH_ERR_invalid_bkey;
-       }
+       int ret = 0;
+
+       bkey_fsck_err_on(bkey_val_u64s(k.k) > BCH_REPLICAS_MAX, c, err,
+                        btree_ptr_val_too_big,
+                        "value too big (%zu > %u)", bkey_val_u64s(k.k), BCH_REPLICAS_MAX);
 
-       return bch2_bkey_ptrs_invalid(c, k, flags, err);
+       ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+fsck_err:
+       return ret;
 }
 
 void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
@@ -180,16 +184,20 @@ void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
        bch2_bkey_ptrs_to_text(out, c, k);
 }
 
-int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                             unsigned flags, struct printbuf *err)
+int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
+                             enum bkey_invalid_flags flags,
+                             struct printbuf *err)
 {
-       if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) {
-               prt_printf(err, "value too big (%zu > %zu)",
-                      bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
-               return -BCH_ERR_invalid_bkey;
-       }
+       int ret = 0;
+
+       bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, err,
+                        btree_ptr_v2_val_too_big,
+                        "value too big (%zu > %zu)",
+                        bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
 
-       return bch2_bkey_ptrs_invalid(c, k, flags, err);
+       ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
+fsck_err:
+       return ret;
 }
 
 void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
@@ -216,7 +224,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version,
        compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key);
 
        if (version < bcachefs_metadata_version_inode_btree_change &&
-           btree_node_type_is_extents(btree_id) &&
+           btree_id_is_extents(btree_id) &&
            !bkey_eq(bp.v->min_key, POS_MIN))
                bp.v->min_key = write
                        ? bpos_nosnap_predecessor(bp.v->min_key)
@@ -370,18 +378,18 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r)
 
 /* KEY_TYPE_reservation: */
 
-int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                            unsigned flags, struct printbuf *err)
+int bch2_reservation_invalid(struct bch_fs *c, struct bkey_s_c k,
+                            enum bkey_invalid_flags flags,
+                            struct printbuf *err)
 {
        struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+       int ret = 0;
 
-       if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) {
-               prt_printf(err, "invalid nr_replicas (%u)",
-                      r.v->nr_replicas);
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       return 0;
+       bkey_fsck_err_on(!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX, c, err,
+                        reservation_key_nr_replicas_invalid,
+                        "invalid nr_replicas (%u)", r.v->nr_replicas);
+fsck_err:
+       return ret;
 }
 
 void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
@@ -514,13 +522,13 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst,
        switch (type) {
        case BCH_EXTENT_ENTRY_crc32:
                set_common_fields(dst->crc32, src);
-               dst->crc32.csum  = *((__le32 *) &src.csum.lo);
+               dst->crc32.csum         = (u32 __force) *((__le32 *) &src.csum.lo);
                break;
        case BCH_EXTENT_ENTRY_crc64:
                set_common_fields(dst->crc64, src);
                dst->crc64.nonce        = src.nonce;
-               dst->crc64.csum_lo      = src.csum.lo;
-               dst->crc64.csum_hi      = *((__le16 *) &src.csum.hi);
+               dst->crc64.csum_lo      = (u64 __force) src.csum.lo;
+               dst->crc64.csum_hi      = (u64 __force) *((__le16 *) &src.csum.hi);
                break;
        case BCH_EXTENT_ENTRY_crc128:
                set_common_fields(dst->crc128, src);
@@ -641,37 +649,31 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
        return replicas;
 }
 
-unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
+static inline unsigned __extent_ptr_durability(struct bch_dev *ca, struct extent_ptr_decoded *p)
 {
-       struct bch_dev *ca;
-
        if (p->ptr.cached)
                return 0;
 
-       ca = bch_dev_bkey_exists(c, p->ptr.dev);
-
-       return ca->mi.durability +
-               (p->has_ec
-                ? p->ec.redundancy
-                : 0);
+       return p->has_ec
+               ? p->ec.redundancy + 1
+               : ca->mi.durability;
 }
 
-unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
+unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
 {
-       struct bch_dev *ca;
+       struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev);
 
-       if (p->ptr.cached)
-               return 0;
+       return __extent_ptr_durability(ca, p);
+}
 
-       ca = bch_dev_bkey_exists(c, p->ptr.dev);
+unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p)
+{
+       struct bch_dev *ca = bch_dev_bkey_exists(c, p->ptr.dev);
 
        if (ca->mi.state == BCH_MEMBER_STATE_failed)
                return 0;
 
-       return ca->mi.durability +
-               (p->has_ec
-                ? p->ec.redundancy
-                : 0);
+       return __extent_ptr_durability(ca, p);
 }
 
 unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
@@ -754,18 +756,6 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
        return i;
 }
 
-static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
-{
-       union bch_extent_entry *next = extent_entry_next(entry);
-
-       /* stripes have ptrs, but their layout doesn't work with this code */
-       BUG_ON(k.k->type == KEY_TYPE_stripe);
-
-       memmove_u64s_down(entry, next,
-                         (u64 *) bkey_val_end(k) - (u64 *) next);
-       k.k->u64s -= (u64 *) next - (u64 *) entry;
-}
-
 /*
  * Returns pointer to the next entry after the one being dropped:
  */
@@ -853,7 +843,6 @@ void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
 const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const struct bch_extent_ptr *ptr;
 
        bkey_for_each_ptr(ptrs, ptr)
                if (ptr->dev == dev)
@@ -865,7 +854,6 @@ const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned
 bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const struct bch_extent_ptr *ptr;
 
        bkey_for_each_ptr(ptrs, ptr)
                if (bch2_dev_in_target(c, ptr->dev, target) &&
@@ -912,11 +900,11 @@ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2)
 
                bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1)
                        bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2)
-                       if (p1.ptr.dev          == p2.ptr.dev &&
-                           p1.ptr.gen          == p2.ptr.gen &&
-                           (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
-                           (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
-                               return true;
+                               if (p1.ptr.dev          == p2.ptr.dev &&
+                                   p1.ptr.gen          == p2.ptr.gen &&
+                                   (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) ==
+                                   (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k))
+                                       return true;
 
                return false;
        } else {
@@ -989,10 +977,6 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
-       struct bch_extent_crc_unpacked crc;
-       const struct bch_extent_ptr *ptr;
-       const struct bch_extent_stripe_ptr *ec;
-       struct bch_dev *ca;
        bool first = true;
 
        if (c)
@@ -1003,9 +987,9 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
                        prt_printf(out, " ");
 
                switch (__extent_entry_type(entry)) {
-               case BCH_EXTENT_ENTRY_ptr:
-                       ptr = entry_to_ptr(entry);
-                       ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+               case BCH_EXTENT_ENTRY_ptr: {
+                       const struct bch_extent_ptr *ptr = entry_to_ptr(entry);
+                       struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
                                ? bch_dev_bkey_exists(c, ptr->dev)
                                : NULL;
 
@@ -1027,10 +1011,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
                                        prt_printf(out, " stale");
                        }
                        break;
+               }
                case BCH_EXTENT_ENTRY_crc32:
                case BCH_EXTENT_ENTRY_crc64:
-               case BCH_EXTENT_ENTRY_crc128:
-                       crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
+               case BCH_EXTENT_ENTRY_crc128: {
+                       struct bch_extent_crc_unpacked crc =
+                               bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
 
                        prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
                               crc.compressed_size,
@@ -1039,12 +1025,26 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
                               bch2_csum_types[crc.csum_type],
                               bch2_compression_types[crc.compression_type]);
                        break;
-               case BCH_EXTENT_ENTRY_stripe_ptr:
-                       ec = &entry->stripe_ptr;
+               }
+               case BCH_EXTENT_ENTRY_stripe_ptr: {
+                       const struct bch_extent_stripe_ptr *ec = &entry->stripe_ptr;
 
                        prt_printf(out, "ec: idx %llu block %u",
                               (u64) ec->idx, ec->block);
                        break;
+               }
+               case BCH_EXTENT_ENTRY_rebalance: {
+                       const struct bch_extent_rebalance *r = &entry->rebalance;
+
+                       prt_str(out, "rebalance: target ");
+                       if (c)
+                               bch2_target_to_text(out, c, r->target);
+                       else
+                               prt_printf(out, "%u", r->target);
+                       prt_str(out, " compression ");
+                       bch2_compression_opt_to_text(out, r->compression);
+                       break;
+               }
                default:
                        prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
                        return;
@@ -1054,56 +1054,58 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
        }
 }
 
-static int extent_ptr_invalid(const struct bch_fs *c,
+static int extent_ptr_invalid(struct bch_fs *c,
                              struct bkey_s_c k,
+                             enum bkey_invalid_flags flags,
                              const struct bch_extent_ptr *ptr,
                              unsigned size_ondisk,
                              bool metadata,
                              struct printbuf *err)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const struct bch_extent_ptr *ptr2;
        u64 bucket;
        u32 bucket_offset;
        struct bch_dev *ca;
+       int ret = 0;
 
        if (!bch2_dev_exists2(c, ptr->dev)) {
-               prt_printf(err, "pointer to invalid device (%u)", ptr->dev);
-               return -BCH_ERR_invalid_bkey;
+               /*
+                * If we're in the write path this key might have already been
+                * overwritten, and we could be seeing a device that doesn't
+                * exist anymore due to racing with device removal:
+                */
+               if (flags & BKEY_INVALID_WRITE)
+                       return 0;
+
+               bkey_fsck_err(c, err, ptr_to_invalid_device,
+                          "pointer to invalid device (%u)", ptr->dev);
        }
 
        ca = bch_dev_bkey_exists(c, ptr->dev);
        bkey_for_each_ptr(ptrs, ptr2)
-               if (ptr != ptr2 && ptr->dev == ptr2->dev) {
-                       prt_printf(err, "multiple pointers to same device (%u)", ptr->dev);
-                       return -BCH_ERR_invalid_bkey;
-               }
+               bkey_fsck_err_on(ptr != ptr2 && ptr->dev == ptr2->dev, c, err,
+                                ptr_to_duplicate_device,
+                                "multiple pointers to same device (%u)", ptr->dev);
 
        bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset);
 
-       if (bucket >= ca->mi.nbuckets) {
-               prt_printf(err, "pointer past last bucket (%llu > %llu)",
-                      bucket, ca->mi.nbuckets);
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) {
-               prt_printf(err, "pointer before first bucket (%llu < %u)",
-                      bucket, ca->mi.first_bucket);
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       if (bucket_offset + size_ondisk > ca->mi.bucket_size) {
-               prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)",
+       bkey_fsck_err_on(bucket >= ca->mi.nbuckets, c, err,
+                        ptr_after_last_bucket,
+                        "pointer past last bucket (%llu > %llu)", bucket, ca->mi.nbuckets);
+       bkey_fsck_err_on(ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket), c, err,
+                        ptr_before_first_bucket,
+                        "pointer before first bucket (%llu < %u)", bucket, ca->mi.first_bucket);
+       bkey_fsck_err_on(bucket_offset + size_ondisk > ca->mi.bucket_size, c, err,
+                        ptr_spans_multiple_buckets,
+                        "pointer spans multiple buckets (%u + %u > %u)",
                       bucket_offset, size_ondisk, ca->mi.bucket_size);
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       return 0;
+fsck_err:
+       return ret;
 }
 
-int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
-                          unsigned flags, struct printbuf *err)
+int bch2_bkey_ptrs_invalid(struct bch_fs *c, struct bkey_s_c k,
+                          enum bkey_invalid_flags flags,
+                          struct printbuf *err)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
@@ -1111,48 +1113,39 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
        unsigned size_ondisk = k.k->size;
        unsigned nonce = UINT_MAX;
        unsigned nr_ptrs = 0;
-       bool unwritten = false, have_ec = false, crc_since_last_ptr = false;
-       int ret;
+       bool have_written = false, have_unwritten = false, have_ec = false, crc_since_last_ptr = false;
+       int ret = 0;
 
        if (bkey_is_btree_ptr(k.k))
                size_ondisk = btree_sectors(c);
 
        bkey_extent_entry_for_each(ptrs, entry) {
-               if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) {
-                       prt_printf(err, "invalid extent entry type (got %u, max %u)",
-                              __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
-                       return -BCH_ERR_invalid_bkey;
-               }
+               bkey_fsck_err_on(__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX, c, err,
+                       extent_ptrs_invalid_entry,
+                       "invalid extent entry type (got %u, max %u)",
+                       __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX);
 
-               if (bkey_is_btree_ptr(k.k) &&
-                   !extent_entry_is_ptr(entry)) {
-                       prt_printf(err, "has non ptr field");
-                       return -BCH_ERR_invalid_bkey;
-               }
+               bkey_fsck_err_on(bkey_is_btree_ptr(k.k) &&
+                                !extent_entry_is_ptr(entry), c, err,
+                                btree_ptr_has_non_ptr,
+                                "has non ptr field");
 
                switch (extent_entry_type(entry)) {
                case BCH_EXTENT_ENTRY_ptr:
-                       ret = extent_ptr_invalid(c, k, &entry->ptr, size_ondisk,
-                                                false, err);
+                       ret = extent_ptr_invalid(c, k, flags, &entry->ptr,
+                                                size_ondisk, false, err);
                        if (ret)
                                return ret;
 
-                       if (nr_ptrs && unwritten != entry->ptr.unwritten) {
-                               prt_printf(err, "extent with unwritten and written ptrs");
-                               return -BCH_ERR_invalid_bkey;
-                       }
-
-                       if (k.k->type != KEY_TYPE_extent && entry->ptr.unwritten) {
-                               prt_printf(err, "has unwritten ptrs");
-                               return -BCH_ERR_invalid_bkey;
-                       }
+                       bkey_fsck_err_on(entry->ptr.cached && have_ec, c, err,
+                                        ptr_cached_and_erasure_coded,
+                                        "cached, erasure coded ptr");
 
-                       if (entry->ptr.cached && have_ec) {
-                               prt_printf(err, "cached, erasure coded ptr");
-                               return -BCH_ERR_invalid_bkey;
-                       }
+                       if (!entry->ptr.unwritten)
+                               have_written = true;
+                       else
+                               have_unwritten = true;
 
-                       unwritten = entry->ptr.unwritten;
                        have_ec = false;
                        crc_since_last_ptr = false;
                        nr_ptrs++;
@@ -1162,70 +1155,77 @@ int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k,
                case BCH_EXTENT_ENTRY_crc128:
                        crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
 
-                       if (crc.offset + crc.live_size >
-                           crc.uncompressed_size) {
-                               prt_printf(err, "checksum offset + key size > uncompressed size");
-                               return -BCH_ERR_invalid_bkey;
-                       }
-
-                       size_ondisk = crc.compressed_size;
-
-                       if (!bch2_checksum_type_valid(c, crc.csum_type)) {
-                               prt_printf(err, "invalid checksum type");
-                               return -BCH_ERR_invalid_bkey;
-                       }
-
-                       if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) {
-                               prt_printf(err, "invalid compression type");
-                               return -BCH_ERR_invalid_bkey;
-                       }
+                       bkey_fsck_err_on(crc.offset + crc.live_size > crc.uncompressed_size, c, err,
+                                        ptr_crc_uncompressed_size_too_small,
+                                        "checksum offset + key size > uncompressed size");
+                       bkey_fsck_err_on(!bch2_checksum_type_valid(c, crc.csum_type), c, err,
+                                        ptr_crc_csum_type_unknown,
+                                        "invalid checksum type");
+                       bkey_fsck_err_on(crc.compression_type >= BCH_COMPRESSION_TYPE_NR, c, err,
+                                        ptr_crc_compression_type_unknown,
+                                        "invalid compression type");
 
                        if (bch2_csum_type_is_encryption(crc.csum_type)) {
                                if (nonce == UINT_MAX)
                                        nonce = crc.offset + crc.nonce;
-                               else if (nonce != crc.offset + crc.nonce) {
-                                       prt_printf(err, "incorrect nonce");
-                                       return -BCH_ERR_invalid_bkey;
-                               }
+                               else if (nonce != crc.offset + crc.nonce)
+                                       bkey_fsck_err(c, err, ptr_crc_nonce_mismatch,
+                                                     "incorrect nonce");
                        }
 
-                       if (crc_since_last_ptr) {
-                               prt_printf(err, "redundant crc entry");
-                               return -BCH_ERR_invalid_bkey;
-                       }
+                       bkey_fsck_err_on(crc_since_last_ptr, c, err,
+                                        ptr_crc_redundant,
+                                        "redundant crc entry");
                        crc_since_last_ptr = true;
+
+                       bkey_fsck_err_on(crc_is_encoded(crc) &&
+                                        (crc.uncompressed_size > c->opts.encoded_extent_max >> 9) &&
+                                        (flags & (BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT)), c, err,
+                                        ptr_crc_uncompressed_size_too_big,
+                                        "too large encoded extent");
+
+                       size_ondisk = crc.compressed_size;
                        break;
                case BCH_EXTENT_ENTRY_stripe_ptr:
-                       if (have_ec) {
-                               prt_printf(err, "redundant stripe entry");
+                       bkey_fsck_err_on(have_ec, c, err,
+                                        ptr_stripe_redundant,
+                                        "redundant stripe entry");
+                       have_ec = true;
+                       break;
+               case BCH_EXTENT_ENTRY_rebalance: {
+                       const struct bch_extent_rebalance *r = &entry->rebalance;
+
+                       if (!bch2_compression_opt_valid(r->compression)) {
+                               struct bch_compression_opt opt = __bch2_compression_decode(r->compression);
+                               prt_printf(err, "invalid compression opt %u:%u",
+                                          opt.type, opt.level);
                                return -BCH_ERR_invalid_bkey;
                        }
-                       have_ec = true;
                        break;
                }
+               }
        }
 
-       if (!nr_ptrs) {
-               prt_str(err, "no ptrs");
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       if (nr_ptrs >= BCH_BKEY_PTRS_MAX) {
-               prt_str(err, "too many ptrs");
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       if (crc_since_last_ptr) {
-               prt_printf(err, "redundant crc entry");
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       if (have_ec) {
-               prt_printf(err, "redundant stripe entry");
-               return -BCH_ERR_invalid_bkey;
-       }
-
-       return 0;
+       bkey_fsck_err_on(!nr_ptrs, c, err,
+                        extent_ptrs_no_ptrs,
+                        "no ptrs");
+       bkey_fsck_err_on(nr_ptrs > BCH_BKEY_PTRS_MAX, c, err,
+                        extent_ptrs_too_many_ptrs,
+                        "too many ptrs: %u > %u", nr_ptrs, BCH_BKEY_PTRS_MAX);
+       bkey_fsck_err_on(have_written && have_unwritten, c, err,
+                        extent_ptrs_written_and_unwritten,
+                        "extent with unwritten and written ptrs");
+       bkey_fsck_err_on(k.k->type != KEY_TYPE_extent && have_unwritten, c, err,
+                        extent_ptrs_unwritten,
+                        "has unwritten ptrs");
+       bkey_fsck_err_on(crc_since_last_ptr, c, err,
+                        extent_ptrs_redundant_crc,
+                        "redundant crc entry");
+       bkey_fsck_err_on(have_ec, c, err,
+                        extent_ptrs_redundant_stripe,
+                        "redundant stripe entry");
+fsck_err:
+       return ret;
 }
 
 void bch2_ptr_swab(struct bkey_s k)
@@ -1260,10 +1260,131 @@ void bch2_ptr_swab(struct bkey_s k)
                        break;
                case BCH_EXTENT_ENTRY_stripe_ptr:
                        break;
+               case BCH_EXTENT_ENTRY_rebalance:
+                       break;
                }
        }
 }
 
+const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const union bch_extent_entry *entry;
+
+       bkey_extent_entry_for_each(ptrs, entry)
+               if (__extent_entry_type(entry) == BCH_EXTENT_ENTRY_rebalance)
+                       return &entry->rebalance;
+
+       return NULL;
+}
+
+unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
+                                      unsigned target, unsigned compression)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       unsigned rewrite_ptrs = 0;
+
+       if (compression) {
+               unsigned compression_type = bch2_compression_opt_to_type(compression);
+               const union bch_extent_entry *entry;
+               struct extent_ptr_decoded p;
+               unsigned i = 0;
+
+               bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+                       if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible ||
+                           p.ptr.unwritten) {
+                               rewrite_ptrs = 0;
+                               goto incompressible;
+                       }
+
+                       if (!p.ptr.cached && p.crc.compression_type != compression_type)
+                               rewrite_ptrs |= 1U << i;
+                       i++;
+               }
+       }
+incompressible:
+       if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
+               unsigned i = 0;
+
+               bkey_for_each_ptr(ptrs, ptr) {
+                       if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, target))
+                               rewrite_ptrs |= 1U << i;
+                       i++;
+               }
+       }
+
+       return rewrite_ptrs;
+}
+
+bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
+{
+       const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
+
+       /*
+        * If it's an indirect extent, we don't delete the rebalance entry when
+        * done so that we know what options were applied - check if it still
+        * needs work done:
+        */
+       if (r &&
+           k.k->type == KEY_TYPE_reflink_v &&
+           !bch2_bkey_ptrs_need_rebalance(c, k, r->target, r->compression))
+               r = NULL;
+
+       return r != NULL;
+}
+
+int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
+                                 unsigned target, unsigned compression)
+{
+       struct bkey_s k = bkey_i_to_s(_k);
+       struct bch_extent_rebalance *r;
+       bool needs_rebalance;
+
+       if (!bkey_extent_is_direct_data(k.k))
+               return 0;
+
+       /* get existing rebalance entry: */
+       r = (struct bch_extent_rebalance *) bch2_bkey_rebalance_opts(k.s_c);
+       if (r) {
+               if (k.k->type == KEY_TYPE_reflink_v) {
+                       /*
+                        * indirect extents: existing options take precedence,
+                        * so that we don't move extents back and forth if
+                        * they're referenced by different inodes with different
+                        * options:
+                        */
+                       if (r->target)
+                               target = r->target;
+                       if (r->compression)
+                               compression = r->compression;
+               }
+
+               r->target       = target;
+               r->compression  = compression;
+       }
+
+       needs_rebalance = bch2_bkey_ptrs_need_rebalance(c, k.s_c, target, compression);
+
+       if (needs_rebalance && !r) {
+               union bch_extent_entry *new = bkey_val_end(k);
+
+               new->rebalance.type             = 1U << BCH_EXTENT_ENTRY_rebalance;
+               new->rebalance.compression      = compression;
+               new->rebalance.target           = target;
+               new->rebalance.unused           = 0;
+               k.k->u64s += extent_entry_u64s(new);
+       } else if (!needs_rebalance && r && k.k->type != KEY_TYPE_reflink_v) {
+               /*
+                * For indirect extents, don't delete the rebalance entry when
+                * we're finished so that we know we specifically moved it or
+                * compressed it to its current location/compression type
+                */
+               extent_entry_drop(k, (union bch_extent_entry *) r);
+       }
+
+       return 0;
+}
+
 /* Generic extent code: */
 
 int bch2_cut_front_s(struct bpos where, struct bkey_s k)
@@ -1310,6 +1431,8 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k)
                                break;
                        case BCH_EXTENT_ENTRY_stripe_ptr:
                                break;
+                       case BCH_EXTENT_ENTRY_rebalance:
+                               break;
                        }
 
                        if (extent_entry_is_crc(entry))