Update bcachefs sources to f7ccf51390 bcachefs: durability

diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 74d54ab172a696e4d9d9500f1d623bd879c7d72b..ed33f9bf528a9628c9f1d3c5f1b0e2ed47f79a1d 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -9,6 +9,8 @@
 #include "bkey_methods.h"
 #include "btree_gc.h"
 #include "btree_update.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
 #include "checksum.h"
 #include "debug.h"
 #include "dirent.h"
@@ -16,7 +18,9 @@
 #include "extents.h"
 #include "inode.h"
 #include "journal.h"
+#include "super.h"
 #include "super-io.h"
+#include "util.h"
 #include "xattr.h"
 
 #include <trace/events/bcachefs.h>
@@ -24,7 +28,7 @@
 static enum merge_result bch2_extent_merge(struct bch_fs *, struct btree *,
                                           struct bkey_i *, struct bkey_i *);
 
-static void sort_key_next(struct btree_node_iter *iter,
+static void sort_key_next(struct btree_node_iter_large *iter,
                          struct btree *b,
                          struct btree_node_iter_set *i)
 {
@@ -50,7 +54,7 @@ static void sort_key_next(struct btree_node_iter *iter,
        ?: (l).k - (r).k;                                               \
 })
 
-static inline bool should_drop_next_key(struct btree_node_iter *iter,
+static inline bool should_drop_next_key(struct btree_node_iter_large *iter,
                                        struct btree *b)
 {
        struct btree_node_iter_set *l = iter->data, *r = iter->data + 1;
@@ -77,8 +81,8 @@ static inline bool should_drop_next_key(struct btree_node_iter *iter,
 }
 
 struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst,
-                                                 struct btree *b,
-                                                 struct btree_node_iter *iter)
+                                       struct btree *b,
+                                       struct btree_node_iter_large *iter)
 {
        struct bkey_packed *out = dst->start;
        struct btree_nr_keys nr;
@@ -87,7 +91,7 @@ struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *dst,
 
        heap_resort(iter, key_sort_cmp);
 
-       while (!bch2_btree_node_iter_end(iter)) {
+       while (!bch2_btree_node_iter_large_end(iter)) {
                if (!should_drop_next_key(iter, b)) {
                        struct bkey_packed *k =
                                __btree_node_offset_to_key(b, iter->data->k);
@@ -119,6 +123,50 @@ bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev)
        return NULL;
 }
 
+bool bch2_extent_drop_device(struct bkey_s_extent e, unsigned dev)
+{
+       struct bch_extent_ptr *ptr;
+       bool dropped = false;
+
+       extent_for_each_ptr_backwards(e, ptr)
+               if (ptr->dev == dev) {
+                       __bch2_extent_drop_ptr(e, ptr);
+                       dropped = true;
+               }
+
+       if (dropped)
+               bch2_extent_drop_redundant_crcs(e);
+       return dropped;
+}
+
+const struct bch_extent_ptr *
+bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group)
+{
+       const struct bch_extent_ptr *ptr;
+
+       extent_for_each_ptr(e, ptr) {
+               struct bch_dev *ca = c->devs[ptr->dev];
+
+               if (ca->mi.group &&
+                   ca->mi.group - 1 == group)
+                       return ptr;
+       }
+
+       return NULL;
+}
+
+const struct bch_extent_ptr *
+bch2_extent_has_target(struct bch_fs *c, struct bkey_s_c_extent e, unsigned target)
+{
+       const struct bch_extent_ptr *ptr;
+
+       extent_for_each_ptr(e, ptr)
+               if (dev_in_target(c->devs[ptr->dev], target))
+                       return ptr;
+
+       return NULL;
+}
+
 unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent e)
 {
        const struct bch_extent_ptr *ptr;
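
The hunk above adds three small pointer-list helpers. bch2_extent_drop_device() walks the
pointers backwards so that dropping an entry (which shifts the remaining entries down) never
skips the next pointer to be examined. A minimal standalone sketch of that backwards-removal
pattern, using a plain array instead of the real bch_extent_ptr list (names here are
illustrative, not from the tree):

    #include <stdbool.h>
    #include <string.h>

    /* Drop every element equal to 'dev', iterating from the end so the
     * memmove() never moves an element we still need to visit. */
    static bool drop_dev(unsigned *devs, unsigned *nr, unsigned dev)
    {
            bool dropped = false;
            unsigned i;

            for (i = *nr; i-- > 0;)
                    if (devs[i] == dev) {
                            memmove(&devs[i], &devs[i + 1],
                                    (*nr - i - 1) * sizeof(*devs));
                            (*nr)--;
                            dropped = true;
                    }

            return dropped;
    }
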
@@ -153,24 +201,112 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k)
        return nr_ptrs;
 }
 
-/* returns true if equal */
-static bool crc_cmp(union bch_extent_crc *l, union bch_extent_crc *r)
+unsigned bch2_extent_ptr_durability(struct bch_fs *c,
+                                   const struct bch_extent_ptr *ptr)
 {
-       return extent_crc_type(l) == extent_crc_type(r) &&
-               !memcmp(l, r, extent_entry_bytes(to_entry(l)));
+       struct bch_dev *ca;
+
+       if (ptr->cached)
+               return 0;
+
+       ca = bch_dev_bkey_exists(c, ptr->dev);
+
+       if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
+               return 0;
+
+       return ca->mi.durability;
 }
 
-/* Increment pointers after @crc by crc's offset until the next crc entry: */
-void bch2_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_crc *crc)
+unsigned bch2_extent_durability(struct bch_fs *c, struct bkey_s_c_extent e)
 {
-       union bch_extent_entry *entry;
+       const struct bch_extent_ptr *ptr;
+       unsigned durability = 0;
+
+       extent_for_each_ptr(e, ptr)
+               durability += bch2_extent_ptr_durability(c, ptr);
 
-       extent_for_each_entry_from(e, entry, extent_entry_next(to_entry(crc))) {
-               if (!extent_entry_is_ptr(entry))
-                       return;
+       return durability;
+}
 
-               entry->ptr.offset += crc_offset(crc);
+unsigned bch2_extent_is_compressed(struct bkey_s_c k)
+{
+       struct bkey_s_c_extent e;
+       const struct bch_extent_ptr *ptr;
+       struct bch_extent_crc_unpacked crc;
+       unsigned ret = 0;
+
+       switch (k.k->type) {
+       case BCH_EXTENT:
+       case BCH_EXTENT_CACHED:
+               e = bkey_s_c_to_extent(k);
+
+               extent_for_each_ptr_crc(e, ptr, crc)
+                       if (!ptr->cached &&
+                           crc.compression_type != BCH_COMPRESSION_NONE &&
+                           crc.compressed_size < crc.live_size)
+                               ret = max_t(unsigned, ret, crc.compressed_size);
        }
+
+       return ret;
+}
+
+bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e,
+                            struct bch_extent_ptr m, u64 offset)
+{
+       const struct bch_extent_ptr *ptr;
+       struct bch_extent_crc_unpacked crc;
+
+       extent_for_each_ptr_crc(e, ptr, crc)
+               if (ptr->dev    == m.dev &&
+                   ptr->gen    == m.gen &&
+                   (s64) ptr->offset + crc.offset - bkey_start_offset(e.k) ==
+                   (s64) m.offset  - offset)
+                       return ptr;
+
+       return NULL;
+}
+
+/* Doesn't cleanup redundant crcs */
+void __bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
+{
+       EBUG_ON(ptr < &e.v->start->ptr ||
+               ptr >= &extent_entry_last(e)->ptr);
+       EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
+       memmove_u64s_down(ptr, ptr + 1,
+                         (u64 *) extent_entry_last(e) - (u64 *) (ptr + 1));
+       e.k->u64s -= sizeof(*ptr) / sizeof(u64);
+}
+
+void bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr)
+{
+       __bch2_extent_drop_ptr(e, ptr);
+       bch2_extent_drop_redundant_crcs(e);
+}
+
+static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
+                                 struct bch_extent_crc_unpacked n)
+{
+       return !u.compression_type &&
+               u.csum_type &&
+               u.uncompressed_size > u.live_size &&
+               bch2_csum_type_is_encryption(u.csum_type) ==
+               bch2_csum_type_is_encryption(n.csum_type);
+}
+
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e,
+                                struct bch_extent_crc_unpacked n)
+{
+       struct bch_extent_crc_unpacked crc;
+       const union bch_extent_entry *i;
+
+       if (!n.csum_type)
+               return false;
+
+       extent_for_each_crc(e, crc, i)
+               if (can_narrow_crc(crc, n))
+                       return true;
+
+       return false;
 }
 
 /*
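
This hunk introduces the durability accounting the commit message refers to: an extent's
durability is the sum, over its non-cached pointers, of the durability advertised by each
pointer's device, with failed devices contributing nothing. A rough standalone model of that
calculation (struct ptr_info and its field names are hypothetical stand-ins for the real
bch_extent_ptr / member fields):

    #include <stdbool.h>

    struct ptr_info {
            bool            cached;         /* cached copies add no durability */
            bool            dev_failed;     /* device in the FAILED state      */
            unsigned        dev_durability; /* per-device durability setting   */
    };

    static unsigned extent_durability(const struct ptr_info *ptrs, unsigned nr)
    {
            unsigned i, durability = 0;

            for (i = 0; i < nr; i++)
                    if (!ptrs[i].cached && !ptrs[i].dev_failed)
                            durability += ptrs[i].dev_durability;

            return durability;
    }
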
@@ -181,96 +317,50 @@ void bch2_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_cr
  * not compressed, we can modify them to point to only the data that is
  * currently live (so that readers won't have to bounce) while we've got the
  * checksum we need:
- *
- * XXX: to guard against data being corrupted while in memory, instead of
- * recomputing the checksum here, it would be better in the read path to instead
- * of computing the checksum of the entire extent:
- *
- * | extent                              |
- *
- * compute the checksums of the live and dead data separately
- * | dead data || live data || dead data |
- *
- * and then verify that crc_dead1 + crc_live + crc_dead2 == orig_crc, and then
- * use crc_live here (that we verified was correct earlier)
- *
- * note: doesn't work with encryption
  */
-void bch2_extent_narrow_crcs(struct bkey_s_extent e)
+bool bch2_extent_narrow_crcs(struct bkey_i_extent *e,
+                            struct bch_extent_crc_unpacked n)
 {
-       union bch_extent_crc *crc;
-       bool have_wide = false, have_narrow = false;
-       struct bch_csum csum = { 0 };
-       unsigned csum_type = 0;
-
-       extent_for_each_crc(e, crc) {
-               if (crc_compression_type(crc) ||
-                   bch2_csum_type_is_encryption(crc_csum_type(crc)))
-                       continue;
-
-               if (crc_uncompressed_size(e.k, crc) != e.k->size) {
-                       have_wide = true;
-               } else {
-                       have_narrow = true;
-                       csum = crc_csum(crc);
-                       csum_type = crc_csum_type(crc);
-               }
-       }
-
-       if (!have_wide || !have_narrow)
-               return;
-
-       extent_for_each_crc(e, crc) {
-               if (crc_compression_type(crc))
-                       continue;
-
-               if (crc_uncompressed_size(e.k, crc) != e.k->size) {
-                       switch (extent_crc_type(crc)) {
-                       case BCH_EXTENT_CRC_NONE:
-                               BUG();
-                       case BCH_EXTENT_CRC32:
-                               if (bch_crc_bytes[csum_type] > 4)
-                                       continue;
-
-                               bch2_extent_crc_narrow_pointers(e, crc);
-                               crc->crc32._compressed_size     = e.k->size - 1;
-                               crc->crc32._uncompressed_size   = e.k->size - 1;
-                               crc->crc32.offset               = 0;
-                               crc->crc32.csum_type            = csum_type;
-                               crc->crc32.csum                 = csum.lo;
+       struct bch_extent_crc_unpacked u;
+       struct bch_extent_ptr *ptr;
+       union bch_extent_entry *i;
+
+       /* Find a checksum entry that covers only live data: */
+       if (!n.csum_type)
+               extent_for_each_crc(extent_i_to_s(e), u, i)
+                       if (!u.compression_type &&
+                           u.csum_type &&
+                           u.live_size == u.uncompressed_size) {
+                               n = u;
                                break;
-                       case BCH_EXTENT_CRC64:
-                               if (bch_crc_bytes[csum_type] > 10)
-                                       continue;
+                       }
 
-                               bch2_extent_crc_narrow_pointers(e, crc);
-                               crc->crc64._compressed_size     = e.k->size - 1;
-                               crc->crc64._uncompressed_size   = e.k->size - 1;
-                               crc->crc64.offset               = 0;
-                               crc->crc64.csum_type            = csum_type;
-                               crc->crc64.csum_lo              = csum.lo;
-                               crc->crc64.csum_hi              = csum.hi;
-                               break;
-                       case BCH_EXTENT_CRC128:
-                               if (bch_crc_bytes[csum_type] > 16)
-                                       continue;
+       if (!bch2_can_narrow_extent_crcs(extent_i_to_s_c(e), n))
+               return false;
 
-                               bch2_extent_crc_narrow_pointers(e, crc);
-                               crc->crc128._compressed_size    = e.k->size - 1;
-                               crc->crc128._uncompressed_size  = e.k->size - 1;
-                               crc->crc128.offset              = 0;
-                               crc->crc128.csum_type           = csum_type;
-                               crc->crc128.csum                = csum;
-                               break;
-                       }
+       BUG_ON(n.compression_type);
+       BUG_ON(n.offset);
+       BUG_ON(n.live_size != e->k.size);
+
+       bch2_extent_crc_append(e, n);
+restart_narrow_pointers:
+       extent_for_each_ptr_crc(extent_i_to_s(e), ptr, u)
+               if (can_narrow_crc(u, n)) {
+                       ptr->offset += u.offset;
+                       extent_ptr_append(e, *ptr);
+                       __bch2_extent_drop_ptr(extent_i_to_s(e), ptr);
+                       goto restart_narrow_pointers;
                }
-       }
+
+       bch2_extent_drop_redundant_crcs(extent_i_to_s(e));
+       return true;
 }
 
 void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
 {
        union bch_extent_entry *entry = e.v->start;
        union bch_extent_crc *crc, *prev = NULL;
+       struct bch_extent_crc_unpacked u, prev_u;
 
        while (entry != extent_entry_last(e)) {
                union bch_extent_entry *next = extent_entry_next(entry);
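
bch2_extent_narrow_crcs() is rewritten around struct bch_extent_crc_unpacked: given a checksum
n that covers exactly the live data, it appends the narrow crc entry and then re-homes each
narrowable pointer behind it, advancing the pointer's offset past the dead prefix the old,
wider checksum still covered. A toy illustration of that offset arithmetic (types and names are
invented for the example):

    struct toy_ptr { unsigned long long offset; };     /* on-disk sector */
    struct toy_crc { unsigned offset, live_size, uncompressed_size; };

    /* After narrowing, the checksum starts at the live data, so the pointer
     * must skip the crc->offset dead sectors it used to address. */
    static void narrow_ptr(struct toy_ptr *ptr, struct toy_crc *crc)
    {
            ptr->offset             += crc->offset;
            crc->uncompressed_size   = crc->live_size;
            crc->offset              = 0;
    }
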
@@ -280,6 +370,7 @@ void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
                        goto next;
 
                crc = entry_to_crc(entry);
+               u = bch2_extent_crc_unpack(e.k, crc);
 
                if (next == extent_entry_last(e)) {
                        /* crc entry with no pointers after it: */
@@ -291,20 +382,28 @@ void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e)
                        goto drop;
                }
 
-               if (prev && crc_cmp(crc, prev)) {
+               if (prev && !memcmp(&u, &prev_u, sizeof(u))) {
                        /* identical to previous crc entry: */
                        goto drop;
                }
 
                if (!prev &&
-                   !crc_csum_type(crc) &&
-                   !crc_compression_type(crc)) {
+                   !u.csum_type &&
+                   !u.compression_type) {
                        /* null crc entry: */
-                       bch2_extent_crc_narrow_pointers(e, crc);
+                       union bch_extent_entry *e2;
+
+                       extent_for_each_entry_from(e, e2, extent_entry_next(entry)) {
+                               if (!extent_entry_is_ptr(e2))
+                                       break;
+
+                               e2->ptr.offset += u.offset;
+                       }
                        goto drop;
                }
 
                prev = crc;
+               prev_u = u;
 next:
                entry = next;
                continue;
@@ -321,7 +420,7 @@ static bool should_drop_ptr(const struct bch_fs *c,
                            struct bkey_s_c_extent e,
                            const struct bch_extent_ptr *ptr)
 {
-       return ptr->cached && ptr_stale(c->devs[ptr->dev], ptr);
+       return ptr->cached && ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr);
 }
 
 static void bch2_extent_drop_stale(struct bch_fs *c, struct bkey_s_extent e)
@@ -370,8 +469,10 @@ static void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
                                entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
                                break;
                        case BCH_EXTENT_ENTRY_crc128:
-                               entry->crc128.csum.hi = swab64(entry->crc64.csum_hi);
-                               entry->crc128.csum.lo = swab64(entry->crc64.csum_lo);
+                               entry->crc128.csum.hi = (__force __le64)
+                                       swab64((__force u64) entry->crc128.csum.hi);
+                               entry->crc128.csum.lo = (__force __le64)
+                                       swab64((__force u64) entry->crc128.csum.lo);
                                break;
                        case BCH_EXTENT_ENTRY_ptr:
                                break;
@@ -391,10 +492,11 @@ static const char *extent_ptr_invalid(const struct bch_fs *c,
        const struct bch_extent_ptr *ptr2;
        struct bch_dev *ca;
 
-       if (ptr->dev >= c->sb.nr_devices)
+       if (ptr->dev >= c->sb.nr_devices ||
+           !c->devs[ptr->dev])
                return "pointer to invalid device";
 
-       ca = c->devs[ptr->dev];
+       ca = bch_dev_bkey_exists(c, ptr->dev);
        if (!ca)
                return "pointer to invalid device";
 
@@ -402,13 +504,13 @@ static const char *extent_ptr_invalid(const struct bch_fs *c,
                if (ptr != ptr2 && ptr->dev == ptr2->dev)
                        return "multiple pointers to same device";
 
-       if (ptr->offset + size_ondisk > ca->mi.bucket_size * ca->mi.nbuckets)
+       if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets))
                return "offset past end of device";
 
-       if (ptr->offset < ca->mi.bucket_size * ca->mi.first_bucket)
+       if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket))
                return "offset before first bucket";
 
-       if ((ptr->offset & (ca->mi.bucket_size - 1)) +
+       if (bucket_remainder(ca, ptr->offset) +
            size_ondisk > ca->mi.bucket_size)
                return "spans multiple buckets";
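
The open-coded bucket arithmetic is replaced with helpers. Judging only from the expressions
they replace, bucket_to_sector() converts a bucket number to a sector offset and
bucket_remainder() gives the offset within a bucket; a plain-C statement of those assumed
semantics (the real helpers live in the bcachefs headers, not here):

    static unsigned long long bucket_to_sector_model(unsigned bucket_size,
                                                     unsigned long long bucket)
    {
            return bucket * bucket_size;
    }

    static unsigned long long bucket_remainder_model(unsigned bucket_size,
                                                     unsigned long long sector)
    {
            return sector % bucket_size;
    }
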
 
@@ -420,7 +522,7 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
 {
        char *out = buf, *end = buf + size;
        const union bch_extent_entry *entry;
-       const union bch_extent_crc *crc;
+       struct bch_extent_crc_unpacked crc;
        const struct bch_extent_ptr *ptr;
        struct bch_dev *ca;
        bool first = true;
@@ -435,17 +537,20 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf,
                case BCH_EXTENT_ENTRY_crc32:
                case BCH_EXTENT_ENTRY_crc64:
                case BCH_EXTENT_ENTRY_crc128:
-                       crc = entry_to_crc(entry);
-
-                       p("crc: c_size %u size %u offset %u csum %u compress %u",
-                         crc_compressed_size(e.k, crc),
-                         crc_uncompressed_size(e.k, crc),
-                         crc_offset(crc), crc_csum_type(crc),
-                         crc_compression_type(crc));
+                       crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
+
+                       p("crc: c_size %u size %u offset %u nonce %u csum %u compress %u",
+                         crc.compressed_size,
+                         crc.uncompressed_size,
+                         crc.offset, crc.nonce,
+                         crc.csum_type,
+                         crc.compression_type);
                        break;
                case BCH_EXTENT_ENTRY_ptr:
                        ptr = entry_to_ptr(entry);
-                       ca = c->devs[ptr->dev];
+                       ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+                               ? bch_dev_bkey_exists(c, ptr->dev)
+                               : NULL;
 
                        p("ptr: %u:%llu gen %u%s", ptr->dev,
                          (u64) ptr->offset, ptr->gen,
@@ -466,6 +571,60 @@ out:
        return out - buf;
 }
 
+static inline bool dev_latency_better(struct bch_dev *dev1,
+                                     struct bch_dev *dev2)
+{
+       unsigned l1 = atomic_read(&dev1->latency[READ]);
+       unsigned l2 = atomic_read(&dev2->latency[READ]);
+
+       /* Pick at random, biased in favor of the faster device: */
+
+       return bch2_rand_range(l1 + l2) > l1;
+}
+
+static void extent_pick_read_device(struct bch_fs *c,
+                                   struct bkey_s_c_extent e,
+                                   struct bch_devs_mask *avoid,
+                                   struct extent_pick_ptr *pick)
+{
+       const struct bch_extent_ptr *ptr;
+       struct bch_extent_crc_unpacked crc;
+
+       extent_for_each_ptr_crc(e, ptr, crc) {
+               struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+               if (ptr->cached && ptr_stale(ca, ptr))
+                       continue;
+
+               if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
+                       continue;
+
+               if (avoid) {
+                       if (test_bit(ca->dev_idx, avoid->d))
+                               continue;
+
+                       if (pick->ca &&
+                           test_bit(pick->ca->dev_idx, avoid->d))
+                               goto use;
+               }
+
+               if (pick->ca && !dev_latency_better(ca, pick->ca))
+                       continue;
+use:
+               if (!percpu_ref_tryget(&ca->io_ref))
+                       continue;
+
+               if (pick->ca)
+                       percpu_ref_put(&pick->ca->io_ref);
+
+               *pick = (struct extent_pick_ptr) {
+                       .ptr    = *ptr,
+                       .crc    = crc,
+                       .ca     = ca,
+               };
+       }
+}
+
 /* Btree ptrs */
 
 static const char *bch2_btree_ptr_invalid(const struct bch_fs *c,
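
extent_pick_read_device() and dev_latency_better() replace the old tier-based read-device
choice: among non-stale, non-failed pointers outside the avoid mask, a candidate replaces the
current pick with probability roughly l2 / (l1 + l2), where l1 is the candidate's measured read
latency and l2 the current pick's. A self-contained sketch of that biased coin flip (rand()
stands in for bch2_rand_range(), assumed to return a uniform value below its argument):

    #include <stdbool.h>
    #include <stdlib.h>

    /* Returns true when the candidate (latency l1) should replace the current
     * pick (latency l2): the slower the current pick, the more likely we
     * switch.  Assumes l1 + l2 > 0. */
    static bool candidate_is_better(unsigned l1, unsigned l2)
    {
            return (unsigned) rand() % (l1 + l2) > l1;
    }
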
@@ -485,24 +644,24 @@ static const char *bch2_btree_ptr_invalid(const struct bch_fs *c,
                struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
                const union bch_extent_entry *entry;
                const struct bch_extent_ptr *ptr;
-               const union bch_extent_crc *crc;
                const char *reason;
 
-               extent_for_each_entry(e, entry)
+               extent_for_each_entry(e, entry) {
                        if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
                                return "invalid extent entry type";
 
-               extent_for_each_ptr_crc(e, ptr, crc) {
+                       if (extent_entry_is_crc(entry))
+                               return "has crc field";
+               }
+
+               extent_for_each_ptr(e, ptr) {
                        reason = extent_ptr_invalid(c, e, ptr,
-                                                   c->sb.btree_node_size,
+                                                   c->opts.btree_node_size,
                                                    true);
                        if (reason)
                                return reason;
                }
 
-               if (crc)
-                       return "has crc field";
-
                return NULL;
        }
 
@@ -519,24 +678,29 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
        unsigned seq;
        const char *err;
        char buf[160];
-       struct bucket *g;
+       struct bucket_mark mark;
        struct bch_dev *ca;
        unsigned replicas = 0;
        bool bad;
 
        extent_for_each_ptr(e, ptr) {
-               ca = c->devs[ptr->dev];
-               g = PTR_BUCKET(ca, ptr);
+               ca = bch_dev_bkey_exists(c, ptr->dev);
                replicas++;
 
+               if (!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags))
+                       continue;
+
                err = "stale";
                if (ptr_stale(ca, ptr))
                        goto err;
 
                do {
                        seq = read_seqcount_begin(&c->gc_pos_lock);
+                       mark = ptr_bucket_mark(ca, ptr);
+
                        bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
-                               g->mark.data_type != BUCKET_BTREE;
+                               (mark.data_type != BCH_DATA_BTREE ||
+                                mark.dirty_sectors < c->opts.btree_node_size);
                } while (read_seqcount_retry(&c->gc_pos_lock, seq));
 
                err = "inconsistent";
@@ -544,7 +708,7 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
                        goto err;
        }
 
-       if (!bch2_sb_has_replicas(c, e, BCH_DATA_BTREE)) {
+       if (!bch2_bkey_replicas_marked(c, BCH_DATA_BTREE, e.s_c)) {
                bch2_bkey_val_to_text(c, btree_node_type(b),
                                     buf, sizeof(buf), k);
                bch2_fs_bug(c,
@@ -557,11 +721,9 @@ static void btree_ptr_debugcheck(struct bch_fs *c, struct btree *b,
 err:
        bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), k);
        bch2_fs_bug(c, "%s btree pointer %s: bucket %zi "
-                     "gen %i last_gc %i mark %08x",
+                     "gen %i mark %08x",
                      err, buf, PTR_BUCKET_NR(ca, ptr),
-                     PTR_BUCKET(ca, ptr)->mark.gen,
-                     ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)],
-                     (unsigned) g->mark.counter);
+                     mark.gen, (unsigned) mark.counter);
 }
 
 static void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
@@ -582,38 +744,13 @@ static void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf,
 }
 
 struct extent_pick_ptr
-bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b)
+bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b,
+                   struct bch_devs_mask *avoid)
 {
-       struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key);
-       const union bch_extent_crc *crc;
-       const struct bch_extent_ptr *ptr;
        struct extent_pick_ptr pick = { .ca = NULL };
 
-       extent_for_each_ptr_crc(e, ptr, crc) {
-               struct bch_dev *ca = c->devs[ptr->dev];
-               struct btree *root = btree_node_root(c, b);
-
-               if (bch2_fs_inconsistent_on(crc, c,
-                               "btree node pointer with crc at btree %u level %u/%u bucket %zu",
-                               b->btree_id, b->level, root ? root->level : -1,
-                               PTR_BUCKET_NR(ca, ptr)))
-                       break;
-
-               if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
-                       continue;
-
-               if (pick.ca && pick.ca->mi.tier < ca->mi.tier)
-                       continue;
-
-               if (!percpu_ref_tryget(&ca->io_ref))
-                       continue;
-
-               if (pick.ca)
-                       percpu_ref_put(&pick.ca->io_ref);
-
-               pick.ca         = ca;
-               pick.ptr        = *ptr;
-       }
+       extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key),
+                               avoid, &pick);
 
        return pick;
 }
@@ -648,28 +785,28 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k)
                __set_bkey_deleted(k.k);
        else if (bkey_extent_is_data(k.k)) {
                struct bkey_s_extent e = bkey_s_to_extent(k);
-               struct bch_extent_ptr *ptr;
-               union bch_extent_crc *crc, *prev_crc = NULL;
+               union bch_extent_entry *entry;
+               bool seen_crc = false;
 
-               extent_for_each_ptr_crc(e, ptr, crc) {
-                       switch (extent_crc_type(crc)) {
-                       case BCH_EXTENT_CRC_NONE:
-                               ptr->offset += e.k->size - len;
+               extent_for_each_entry(e, entry) {
+                       switch (extent_entry_type(entry)) {
+                       case BCH_EXTENT_ENTRY_ptr:
+                               if (!seen_crc)
+                                       entry->ptr.offset += e.k->size - len;
                                break;
-                       case BCH_EXTENT_CRC32:
-                               if (prev_crc != crc)
-                                       crc->crc32.offset += e.k->size - len;
+                       case BCH_EXTENT_ENTRY_crc32:
+                               entry->crc32.offset += e.k->size - len;
                                break;
-                       case BCH_EXTENT_CRC64:
-                               if (prev_crc != crc)
-                                       crc->crc64.offset += e.k->size - len;
+                       case BCH_EXTENT_ENTRY_crc64:
+                               entry->crc64.offset += e.k->size - len;
                                break;
-                       case BCH_EXTENT_CRC128:
-                               if (prev_crc != crc)
-                                       crc->crc128.offset += e.k->size - len;
+                       case BCH_EXTENT_ENTRY_crc128:
+                               entry->crc128.offset += e.k->size - len;
                                break;
                        }
-                       prev_crc = crc;
+
+                       if (extent_entry_is_crc(entry))
+                               seen_crc = true;
                }
        }
 
@@ -767,13 +904,13 @@ static void extent_save(struct btree *b, struct btree_node_iter *iter,
                 bkey_start_pos(&_ur)) ?: (r).k - (l).k;                \
 })
 
-static inline void extent_sort_sift(struct btree_node_iter *iter,
+static inline void extent_sort_sift(struct btree_node_iter_large *iter,
                                    struct btree *b, size_t i)
 {
        heap_sift_down(iter, i, extent_sort_cmp);
 }
 
-static inline void extent_sort_next(struct btree_node_iter *iter,
+static inline void extent_sort_next(struct btree_node_iter_large *iter,
                                    struct btree *b,
                                    struct btree_node_iter_set *i)
 {
@@ -815,7 +952,7 @@ static void extent_sort_append(struct bch_fs *c,
 struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
                                        struct bset *dst,
                                        struct btree *b,
-                                       struct btree_node_iter *iter)
+                                       struct btree_node_iter_large *iter)
 {
        struct bkey_format *f = &b->format;
        struct btree_node_iter_set *_l = iter->data, *_r;
@@ -828,7 +965,7 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
 
        heap_resort(iter, extent_sort_cmp);
 
-       while (!bch2_btree_node_iter_end(iter)) {
+       while (!bch2_btree_node_iter_large_end(iter)) {
                lk = __btree_node_offset_to_key(b, _l->k);
 
                if (iter->used == 1) {
@@ -927,10 +1064,10 @@ struct extent_insert_state {
 };
 
 static void bch2_add_sectors(struct extent_insert_state *s,
-                           struct bkey_s_c k, u64 offset, s64 sectors)
+                            struct bkey_s_c k, u64 offset, s64 sectors)
 {
        struct bch_fs *c = s->trans->c;
-       struct btree *b = s->insert->iter->nodes[0];
+       struct btree *b = s->insert->iter->l[0].b;
 
        EBUG_ON(bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0);
 
@@ -938,7 +1075,7 @@ static void bch2_add_sectors(struct extent_insert_state *s,
                return;
 
        bch2_mark_key(c, k, sectors, false, gc_pos_btree_node(b),
-                    &s->stats, s->trans->journal_res.seq);
+                     &s->stats, s->trans->journal_res.seq, 0);
 }
 
 static void bch2_subtract_sectors(struct extent_insert_state *s,
@@ -984,7 +1121,7 @@ static bool bch2_extent_merge_inline(struct bch_fs *,
 static enum btree_insert_ret
 extent_insert_should_stop(struct extent_insert_state *s)
 {
-       struct btree *b = s->insert->iter->nodes[0];
+       struct btree *b = s->insert->iter->l[0].b;
 
        /*
         * Check if we have sufficient space in both the btree node and the
@@ -1006,19 +1143,18 @@ extent_insert_should_stop(struct extent_insert_state *s)
 static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
                               struct bkey_i *insert)
 {
-       struct btree *b = iter->nodes[0];
-       struct btree_node_iter *node_iter = &iter->node_iters[0];
-       struct bset_tree *t = bset_tree_last(b);
+       struct btree_iter_level *l = &iter->l[0];
+       struct bset_tree *t = bset_tree_last(l->b);
        struct bkey_packed *where =
-               bch2_btree_node_iter_bset_pos(node_iter, b, t);
-       struct bkey_packed *prev = bch2_bkey_prev(b, t, where);
+               bch2_btree_node_iter_bset_pos(&l->iter, l->b, t);
+       struct bkey_packed *prev = bch2_bkey_prev(l->b, t, where);
        struct bkey_packed *next_live_key = where;
        unsigned clobber_u64s;
 
        if (prev)
                where = bkey_next(prev);
 
-       while (next_live_key != btree_bkey_last(b, t) &&
+       while (next_live_key != btree_bkey_last(l->b, t) &&
               bkey_deleted(next_live_key))
                next_live_key = bkey_next(next_live_key);
 
@@ -1032,18 +1168,19 @@ static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
            bch2_extent_merge_inline(c, iter, prev, bkey_to_packed(insert), true))
                goto drop_deleted_keys;
 
-       if (next_live_key != btree_bkey_last(b, t) &&
+       if (next_live_key != btree_bkey_last(l->b, t) &&
            bch2_extent_merge_inline(c, iter, bkey_to_packed(insert),
                                    next_live_key, false))
                goto drop_deleted_keys;
 
-       bch2_bset_insert(b, node_iter, where, insert, clobber_u64s);
-       bch2_btree_node_iter_fix(iter, b, node_iter, t, where,
+       bch2_bset_insert(l->b, &l->iter, where, insert, clobber_u64s);
+       bch2_btree_node_iter_fix(iter, l->b, &l->iter, t, where,
                                clobber_u64s, where->u64s);
        return;
 drop_deleted_keys:
-       bch2_bset_delete(b, where, clobber_u64s);
-       bch2_btree_node_iter_fix(iter, b, node_iter, t, where, clobber_u64s, 0);
+       bch2_bset_delete(l->b, where, clobber_u64s);
+       bch2_btree_node_iter_fix(iter, l->b, &l->iter, t,
+                                where, clobber_u64s, 0);
 }
 
 static void extent_insert_committed(struct extent_insert_state *s)
@@ -1072,7 +1209,7 @@ static void extent_insert_committed(struct extent_insert_state *s)
 
        if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) &&
            bkey_cmp(s->committed, insert->k.p) &&
-           bkey_extent_is_compressed(bkey_i_to_s_c(insert))) {
+           bch2_extent_is_compressed(bkey_i_to_s_c(insert))) {
                /* XXX: possibly need to increase our reservation? */
                bch2_cut_subtract_back(s, s->committed,
                                      bkey_i_to_s(&split.k));
@@ -1086,8 +1223,7 @@ static void extent_insert_committed(struct extent_insert_state *s)
        }
 
        if (debug_check_bkeys(c))
-               bch2_bkey_debugcheck(c, iter->nodes[iter->level],
-                               bkey_i_to_s_c(&split.k));
+               bch2_bkey_debugcheck(c, iter->l[0].b, bkey_i_to_s_c(&split.k));
 
        bch2_btree_journal_key(s->trans, iter, &split.k);
 
@@ -1101,46 +1237,24 @@ done:
        s->trans->did_work              = true;
 }
 
-static enum extent_insert_hook_ret
+static enum btree_insert_ret
 __extent_insert_advance_pos(struct extent_insert_state *s,
                            struct bpos next_pos,
                            struct bkey_s_c k)
 {
        struct extent_insert_hook *hook = s->trans->hook;
-       enum extent_insert_hook_ret ret;
-#if 0
-       /*
-        * Currently disabled for encryption - broken with fcollapse. Will have
-        * to reenable when versions are exposed for send/receive - versions
-        * will have to be monotonic then:
-        */
-       if (k.k && k.k->size &&
-           !bversion_zero(s->insert->k->k.version) &&
-           bversion_cmp(k.k->version, s->insert->k->k.version) > 0) {
-               ret = BTREE_HOOK_NO_INSERT;
-       } else
-#endif
+       enum btree_insert_ret ret;
+
        if (hook)
                ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k);
        else
-               ret = BTREE_HOOK_DO_INSERT;
+               ret = BTREE_INSERT_OK;
 
        EBUG_ON(bkey_deleted(&s->insert->k->k) || !s->insert->k->k.size);
 
-       switch (ret) {
-       case BTREE_HOOK_DO_INSERT:
-               break;
-       case BTREE_HOOK_NO_INSERT:
-               extent_insert_committed(s);
-               bch2_cut_subtract_front(s, next_pos, bkey_i_to_s(s->insert->k));
-
-               bch2_btree_iter_set_pos_same_leaf(s->insert->iter, next_pos);
-               break;
-       case BTREE_HOOK_RESTART_TRANS:
-               return ret;
-       }
+       if (ret == BTREE_INSERT_OK)
+               s->committed = next_pos;
 
-       s->committed = next_pos;
        return ret;
 }
 
@@ -1148,39 +1262,28 @@ __extent_insert_advance_pos(struct extent_insert_state *s,
  * Update iter->pos, marking how much of @insert we've processed, and call hook
  * fn:
  */
-static enum extent_insert_hook_ret
+static enum btree_insert_ret
 extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k)
 {
-       struct btree *b = s->insert->iter->nodes[0];
+       struct btree *b = s->insert->iter->l[0].b;
        struct bpos next_pos = bpos_min(s->insert->k->k.p,
                                        k.k ? k.k->p : b->key.k.p);
+       enum btree_insert_ret ret;
+
+       if (race_fault())
+               return BTREE_INSERT_NEED_TRAVERSE;
 
        /* hole? */
        if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) {
-               bool have_uncommitted = bkey_cmp(s->committed,
-                               bkey_start_pos(&s->insert->k->k)) > 0;
-
-               switch (__extent_insert_advance_pos(s, bkey_start_pos(k.k),
-                                                   bkey_s_c_null)) {
-               case BTREE_HOOK_DO_INSERT:
-                       break;
-               case BTREE_HOOK_NO_INSERT:
-                       /*
-                        * we had to split @insert and insert the committed
-                        * part - need to bail out and recheck journal
-                        * reservation/btree node before we advance pos past @k:
-                        */
-                       if (have_uncommitted)
-                               return BTREE_HOOK_NO_INSERT;
-                       break;
-               case BTREE_HOOK_RESTART_TRANS:
-                       return BTREE_HOOK_RESTART_TRANS;
-               }
+               ret = __extent_insert_advance_pos(s, bkey_start_pos(k.k),
+                                                   bkey_s_c_null);
+               if (ret != BTREE_INSERT_OK)
+                       return ret;
        }
 
        /* avoid redundant calls to hook fn: */
        if (!bkey_cmp(s->committed, next_pos))
-               return BTREE_HOOK_DO_INSERT;
+               return BTREE_INSERT_OK;
 
        return __extent_insert_advance_pos(s, next_pos, k);
 }
@@ -1194,7 +1297,7 @@ extent_insert_check_split_compressed(struct extent_insert_state *s,
        unsigned sectors;
 
        if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
-           (sectors = bkey_extent_is_compressed(k))) {
+           (sectors = bch2_extent_is_compressed(k))) {
                int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD;
 
                if (s->trans->flags & BTREE_INSERT_NOFAIL)
@@ -1202,7 +1305,8 @@ extent_insert_check_split_compressed(struct extent_insert_state *s,
 
                switch (bch2_disk_reservation_add(c,
                                s->trans->disk_res,
-                               sectors, flags)) {
+                               sectors * bch2_extent_nr_dirty_ptrs(k),
+                               flags)) {
                case 0:
                        break;
                case -ENOSPC:
@@ -1224,8 +1328,10 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
 {
        struct bch_fs *c = s->trans->c;
        struct btree_iter *iter = s->insert->iter;
-       struct btree *b = iter->nodes[0];
-       struct btree_node_iter *node_iter = &iter->node_iters[0];
+       struct btree_iter_level *l = &iter->l[0];
+       struct btree *b = l->b;
+       struct btree_node_iter *node_iter = &l->iter;
+       enum btree_insert_ret ret;
 
        switch (overlap) {
        case BCH_EXTENT_OVERLAP_FRONT:
@@ -1271,9 +1377,9 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert,
                        k.k->p = orig_pos;
                        extent_save(b, node_iter, _k, k.k);
 
-                       if (extent_insert_advance_pos(s, k.s_c) ==
-                           BTREE_HOOK_RESTART_TRANS)
-                               return BTREE_INSERT_NEED_TRAVERSE;
+                       ret = extent_insert_advance_pos(s, k.s_c);
+                       if (ret != BTREE_INSERT_OK)
+                               return ret;
 
                        extent_insert_committed(s);
                        /*
@@ -1333,8 +1439,9 @@ bch2_delete_fixup_extent(struct extent_insert_state *s)
 {
        struct bch_fs *c = s->trans->c;
        struct btree_iter *iter = s->insert->iter;
-       struct btree *b = iter->nodes[0];
-       struct btree_node_iter *node_iter = &iter->node_iters[0];
+       struct btree_iter_level *l = &iter->l[0];
+       struct btree *b = l->b;
+       struct btree_node_iter *node_iter = &l->iter;
        struct bkey_packed *_k;
        struct bkey unpacked;
        struct bkey_i *insert = s->insert->k;
@@ -1369,15 +1476,9 @@ bch2_delete_fixup_extent(struct extent_insert_state *s)
                if (ret != BTREE_INSERT_OK)
                        goto stop;
 
-               switch (extent_insert_advance_pos(s, k.s_c)) {
-               case BTREE_HOOK_DO_INSERT:
-                       break;
-               case BTREE_HOOK_NO_INSERT:
-                       continue;
-               case BTREE_HOOK_RESTART_TRANS:
-                       ret = BTREE_INSERT_NEED_TRAVERSE;
+               ret = extent_insert_advance_pos(s, k.s_c);
+               if (ret)
                        goto stop;
-               }
 
                s->do_journal = true;
 
@@ -1418,10 +1519,9 @@ next:
                bch2_btree_iter_set_pos_same_leaf(iter, s->committed);
        }
 
-       if (bkey_cmp(s->committed, insert->k.p) < 0 &&
-           ret == BTREE_INSERT_OK &&
-           extent_insert_advance_pos(s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS)
-               ret = BTREE_INSERT_NEED_TRAVERSE;
+       if (ret == BTREE_INSERT_OK &&
+           bkey_cmp(s->committed, insert->k.p) < 0)
+               ret = extent_insert_advance_pos(s, bkey_s_c_null);
 stop:
        extent_insert_committed(s);
 
@@ -1487,8 +1587,9 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
 {
        struct bch_fs *c = trans->c;
        struct btree_iter *iter = insert->iter;
-       struct btree *b = iter->nodes[0];
-       struct btree_node_iter *node_iter = &iter->node_iters[0];
+       struct btree_iter_level *l = &iter->l[0];
+       struct btree *b = l->b;
+       struct btree_node_iter *node_iter = &l->iter;
        struct bkey_packed *_k;
        struct bkey unpacked;
        enum btree_insert_ret ret = BTREE_INSERT_OK;
@@ -1543,18 +1644,10 @@ bch2_insert_fixup_extent(struct btree_insert *trans,
 
                /*
                 * Only call advance pos & call hook for nonzero size extents:
-                * If hook returned BTREE_HOOK_NO_INSERT, @insert->k no longer
-                * overlaps with @k:
                 */
-               switch (extent_insert_advance_pos(&s, k.s_c)) {
-               case BTREE_HOOK_DO_INSERT:
-                       break;
-               case BTREE_HOOK_NO_INSERT:
-                       continue;
-               case BTREE_HOOK_RESTART_TRANS:
-                       ret = BTREE_INSERT_NEED_TRAVERSE;
+               ret = extent_insert_advance_pos(&s, k.s_c);
+               if (ret != BTREE_INSERT_OK)
                        goto stop;
-               }
 
                if (k.k->size &&
                    (k.k->needs_whiteout || bset_written(b, bset(b, t))))
@@ -1572,10 +1665,9 @@ squash:
                        goto stop;
        }
 
-       if (bkey_cmp(s.committed, insert->k->k.p) < 0 &&
-           ret == BTREE_INSERT_OK &&
-           extent_insert_advance_pos(&s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS)
-               ret = BTREE_INSERT_NEED_TRAVERSE;
+       if (ret == BTREE_INSERT_OK &&
+           bkey_cmp(s.committed, insert->k->k.p) < 0)
+               ret = extent_insert_advance_pos(&s, bkey_s_c_null);
 stop:
        extent_insert_committed(&s);
        /*
@@ -1618,29 +1710,37 @@ static const char *bch2_extent_invalid(const struct bch_fs *c,
        case BCH_EXTENT_CACHED: {
                struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
                const union bch_extent_entry *entry;
-               const union bch_extent_crc *crc;
+               struct bch_extent_crc_unpacked crc;
                const struct bch_extent_ptr *ptr;
                unsigned size_ondisk = e.k->size;
                const char *reason;
+               unsigned nonce = UINT_MAX;
 
                extent_for_each_entry(e, entry) {
                        if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
                                return "invalid extent entry type";
 
                        if (extent_entry_is_crc(entry)) {
-                               crc = entry_to_crc(entry);
+                               crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry));
 
-                               if (crc_offset(crc) + e.k->size >
-                                   crc_uncompressed_size(e.k, crc))
+                               if (crc.offset + e.k->size >
+                                   crc.uncompressed_size)
                                        return "checksum offset + key size > uncompressed size";
 
-                               size_ondisk = crc_compressed_size(e.k, crc);
+                               size_ondisk = crc.compressed_size;
 
-                               if (!bch2_checksum_type_valid(c, crc_csum_type(crc)))
+                               if (!bch2_checksum_type_valid(c, crc.csum_type))
                                        return "invalid checksum type";
 
-                               if (crc_compression_type(crc) >= BCH_COMPRESSION_NR)
+                               if (crc.compression_type >= BCH_COMPRESSION_NR)
                                        return "invalid compression type";
+
+                               if (bch2_csum_type_is_encryption(crc.csum_type)) {
+                                       if (nonce == UINT_MAX)
+                                               nonce = crc.offset + crc.nonce;
+                                       else if (nonce != crc.offset + crc.nonce)
+                                               return "incorrect nonce";
+                               }
                        } else {
                                ptr = entry_to_ptr(entry);
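
bch2_extent_invalid() now also checks nonce consistency for encrypted extents: every crc entry
carrying an encryption-capable checksum must agree on offset + nonce, since all of them describe
the same underlying data. A standalone sketch of that rule (plain arrays stand in for the
unpacked crc entries):

    #include <stdbool.h>

    static bool nonces_consistent(const unsigned *offset, const unsigned *nonce,
                                  unsigned nr_encrypted_crcs)
    {
            unsigned i, expected = 0;
            bool have_expected = false;

            for (i = 0; i < nr_encrypted_crcs; i++) {
                    if (!have_expected) {
                            expected = offset[i] + nonce[i];
                            have_expected = true;
                    } else if (offset[i] + nonce[i] != expected) {
                            return false;
                    }
            }

            return true;
    }
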
 
@@ -1676,11 +1776,10 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
 {
        const struct bch_extent_ptr *ptr;
        struct bch_dev *ca;
-       struct bucket *g;
+       struct bucket_mark mark;
        unsigned seq, stale;
        char buf[160];
        bool bad;
-       unsigned ptrs_per_tier[BCH_TIER_MAX];
        unsigned replicas = 0;
 
        /*
@@ -1692,13 +1791,9 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
         * going to get overwritten during replay)
         */
 
-       memset(ptrs_per_tier, 0, sizeof(ptrs_per_tier));
-
        extent_for_each_ptr(e, ptr) {
-               ca = c->devs[ptr->dev];
-               g = PTR_BUCKET(ca, ptr);
+               ca = bch_dev_bkey_exists(c, ptr->dev);
                replicas++;
-               ptrs_per_tier[ca->mi.tier]++;
 
                /*
                 * If journal replay hasn't finished, we might be seeing keys
@@ -1710,10 +1805,8 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
                stale = 0;
 
                do {
-                       struct bucket_mark mark;
-
                        seq = read_seqcount_begin(&c->gc_pos_lock);
-                       mark = READ_ONCE(g->mark);
+                       mark = ptr_bucket_mark(ca, ptr);
 
                        /* between mark and bucket gen */
                        smp_rmb();
@@ -1730,12 +1823,11 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
                        if (stale)
                                break;
 
-                       bad = (mark.data_type != BUCKET_DATA ||
-                              (gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
-                               !mark.owned_by_allocator &&
-                               !(ptr->cached
-                                 ? mark.cached_sectors
-                                 : mark.dirty_sectors)));
+                       bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 &&
+                               (mark.data_type != BCH_DATA_USER ||
+                                !(ptr->cached
+                                  ? mark.cached_sectors
+                                  : mark.dirty_sectors));
                } while (read_seqcount_retry(&c->gc_pos_lock, seq));
 
                if (bad)
@@ -1752,7 +1844,7 @@ static void bch2_extent_debugcheck_extent(struct bch_fs *c, struct btree *b,
        }
 
        if (!bkey_extent_is_cached(e.k) &&
-           !bch2_sb_has_replicas(c, e, BCH_DATA_USER)) {
+           !bch2_bkey_replicas_marked(c, BCH_DATA_USER, e.s_c)) {
                bch2_bkey_val_to_text(c, btree_node_type(b),
                                     buf, sizeof(buf), e.s_c);
                bch2_fs_bug(c,
@@ -1767,10 +1859,8 @@ bad_ptr:
        bch2_bkey_val_to_text(c, btree_node_type(b), buf,
                             sizeof(buf), e.s_c);
        bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu "
-                  "gen %i last_gc %i mark 0x%08x",
-                  buf, PTR_BUCKET_NR(ca, ptr), PTR_BUCKET(ca, ptr)->mark.gen,
-                  ca->oldest_gens[PTR_BUCKET_NR(ca, ptr)],
-                  (unsigned) g->mark.counter);
+                  "gen %i type %u", buf,
+                  PTR_BUCKET_NR(ca, ptr), mark.gen, mark.data_type);
        return;
 }
 
@@ -1806,109 +1896,76 @@ static void bch2_extent_to_text(struct bch_fs *c, char *buf,
 #undef p
 }
 
-static unsigned PTR_TIER(struct bch_fs *c,
-                        const struct bch_extent_ptr *ptr)
-{
-       return c->devs[ptr->dev]->mi.tier;
-}
-
 static void bch2_extent_crc_init(union bch_extent_crc *crc,
-                                unsigned compressed_size,
-                                unsigned uncompressed_size,
-                                unsigned compression_type,
-                                unsigned nonce,
-                                struct bch_csum csum, unsigned csum_type)
-{
-       if (bch_crc_bytes[csum_type]    <= 4 &&
-           uncompressed_size           <= CRC32_SIZE_MAX &&
-           nonce                       <= CRC32_NONCE_MAX) {
+                                struct bch_extent_crc_unpacked new)
+{
+#define common_fields(_crc)                                            \
+               .csum_type              = _crc.csum_type,               \
+               .compression_type       = _crc.compression_type,        \
+               ._compressed_size       = _crc.compressed_size - 1,     \
+               ._uncompressed_size     = _crc.uncompressed_size - 1,   \
+               .offset                 = _crc.offset
+
+       if (bch_crc_bytes[new.csum_type]        <= 4 &&
+           new.uncompressed_size               <= CRC32_SIZE_MAX &&
+           new.nonce                           <= CRC32_NONCE_MAX) {
                crc->crc32 = (struct bch_extent_crc32) {
                        .type = 1 << BCH_EXTENT_ENTRY_crc32,
-                       ._compressed_size       = compressed_size - 1,
-                       ._uncompressed_size     = uncompressed_size - 1,
-                       .offset                 = 0,
-                       .compression_type       = compression_type,
-                       .csum_type              = csum_type,
-                       .csum                   = *((__le32 *) &csum.lo),
+                       common_fields(new),
+                       .csum                   = *((__le32 *) &new.csum.lo),
                };
                return;
        }
 
-       if (bch_crc_bytes[csum_type]    <= 10 &&
-           uncompressed_size           <= CRC64_SIZE_MAX &&
-           nonce                       <= CRC64_NONCE_MAX) {
+       if (bch_crc_bytes[new.csum_type]        <= 10 &&
+           new.uncompressed_size               <= CRC64_SIZE_MAX &&
+           new.nonce                           <= CRC64_NONCE_MAX) {
                crc->crc64 = (struct bch_extent_crc64) {
                        .type = 1 << BCH_EXTENT_ENTRY_crc64,
-                       ._compressed_size       = compressed_size - 1,
-                       ._uncompressed_size     = uncompressed_size - 1,
-                       .offset                 = 0,
-                       .nonce                  = nonce,
-                       .compression_type       = compression_type,
-                       .csum_type              = csum_type,
-                       .csum_lo                = csum.lo,
-                       .csum_hi                = *((__le16 *) &csum.hi),
+                       common_fields(new),
+                       .nonce                  = new.nonce,
+                       .csum_lo                = new.csum.lo,
+                       .csum_hi                = *((__le16 *) &new.csum.hi),
                };
                return;
        }
 
-       if (bch_crc_bytes[csum_type]    <= 16 &&
-           uncompressed_size           <= CRC128_SIZE_MAX &&
-           nonce                       <= CRC128_NONCE_MAX) {
+       if (bch_crc_bytes[new.csum_type]        <= 16 &&
+           new.uncompressed_size               <= CRC128_SIZE_MAX &&
+           new.nonce                           <= CRC128_NONCE_MAX) {
                crc->crc128 = (struct bch_extent_crc128) {
                        .type = 1 << BCH_EXTENT_ENTRY_crc128,
-                       ._compressed_size       = compressed_size - 1,
-                       ._uncompressed_size     = uncompressed_size - 1,
-                       .offset                 = 0,
-                       .nonce                  = nonce,
-                       .compression_type       = compression_type,
-                       .csum_type              = csum_type,
-                       .csum                   = csum,
+                       common_fields(new),
+                       .nonce                  = new.nonce,
+                       .csum                   = new.csum,
                };
                return;
        }
-
+#undef common_fields
        BUG();
 }
 
 void bch2_extent_crc_append(struct bkey_i_extent *e,
-                           unsigned compressed_size,
-                           unsigned uncompressed_size,
-                           unsigned compression_type,
-                           unsigned nonce,
-                           struct bch_csum csum, unsigned csum_type)
+                           struct bch_extent_crc_unpacked new)
 {
-       union bch_extent_crc *crc;
+       struct bch_extent_crc_unpacked crc;
+       const union bch_extent_entry *i;
 
-       BUG_ON(compressed_size > uncompressed_size);
-       BUG_ON(uncompressed_size != e->k.size);
-       BUG_ON(!compressed_size || !uncompressed_size);
+       BUG_ON(new.compressed_size > new.uncompressed_size);
+       BUG_ON(new.live_size != e->k.size);
+       BUG_ON(!new.compressed_size || !new.uncompressed_size);
 
        /*
         * Look up the last crc entry, so we can check if we need to add
         * another:
         */
-       extent_for_each_crc(extent_i_to_s(e), crc)
+       extent_for_each_crc(extent_i_to_s(e), crc, i)
                ;
 
-       if (!crc && !csum_type && !compression_type)
-               return;
-
-       if (crc &&
-           crc_compressed_size(&e->k, crc)     == compressed_size &&
-           crc_uncompressed_size(&e->k, crc)   == uncompressed_size &&
-           crc_offset(crc)                     == 0 &&
-           crc_nonce(crc)                      == nonce &&
-           crc_csum_type(crc)                  == csum_type &&
-           crc_compression_type(crc)           == compression_type &&
-           crc_csum(crc).lo                    == csum.lo &&
-           crc_csum(crc).hi                    == csum.hi)
+       if (!memcmp(&crc, &new, sizeof(crc)))
                return;
 
-       bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)),
-                           compressed_size,
-                           uncompressed_size,
-                           compression_type,
-                           nonce, csum, csum_type);
+       bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new);
        __extent_entry_push(e);
 }
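bch2_extent_crc_append() now compares the last existing crc entry against the new one as whole struct bch_extent_crc_unpacked values with a single memcmp(), instead of going field by field through the old per-variant accessors (crc_compressed_size(), crc_offset(), crc_csum(), ...). Going only by the members this diff touches, the unpacked struct presumably looks roughly like the sketch below; the memcmp() is sound as long as both values are produced by the same unpack/initialization path, so that any padding bytes compare equal.

/*
 * Assumed shape of struct bch_extent_crc_unpacked; field order and widths
 * are guesses, and only the members visible in this diff are listed.
 */
struct bch_extent_crc_unpacked_sketch {
	unsigned		compressed_size;
	unsigned		uncompressed_size;
	unsigned		live_size;	/* portion referenced by the key; checked against e->k.size above */
	unsigned		offset;
	unsigned		nonce;
	unsigned		csum_type;
	unsigned		compression_type;
	struct bch_csum		csum;
};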
 
@@ -1961,50 +2018,45 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
 
 void bch2_extent_mark_replicas_cached(struct bch_fs *c,
                                      struct bkey_s_extent e,
-                                     unsigned nr_cached)
+                                     unsigned nr_desired_replicas,
+                                     unsigned target)
 {
        struct bch_extent_ptr *ptr;
-       bool have_higher_tier;
-       unsigned tier = 0;
+       int extra = bch2_extent_durability(c, e.c) - nr_desired_replicas;
 
-       if (!nr_cached)
+       if (extra <= 0)
                return;
 
-       do {
-               have_higher_tier = false;
-
-               extent_for_each_ptr(e, ptr) {
-                       if (!ptr->cached &&
-                           PTR_TIER(c, ptr) == tier) {
-                               ptr->cached = true;
-                               nr_cached--;
-                               if (!nr_cached)
-                                       return;
-                       }
+       extent_for_each_ptr(e, ptr) {
+               int n = bch2_extent_ptr_durability(c, ptr);
 
-                       if (PTR_TIER(c, ptr) > tier)
-                               have_higher_tier = true;
+               if (n && n <= extra &&
+                   !dev_in_target(c->devs[ptr->dev], target)) {
+                       ptr->cached = true;
+                       extra -= n;
                }
+       }
+
+       extent_for_each_ptr(e, ptr) {
+               int n = bch2_extent_ptr_durability(c, ptr);
 
-               tier++;
-       } while (have_higher_tier);
+               if (n && n <= extra) {
+                       ptr->cached = true;
+                       extra -= n;
+               }
+       }
 }
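The rewritten bch2_extent_mark_replicas_cached() reasons in terms of durability rather than tiers: extra is how much durability the extent carries beyond nr_desired_replicas, and pointers are greedily flagged cached (i.e. droppable) until that excess is used up, preferring devices outside @target in the first pass and taking anything in the second. Below is a self-contained toy model of that two-pass walk, with made-up struct and helper names; it is an illustration, not bcachefs code.

#include <stdbool.h>
#include <stdio.h>

struct toy_ptr { int durability; bool in_target; bool cached; };

/* Mark pointers cached until only ~desired durability remains dirty. */
static void toy_mark_replicas_cached(struct toy_ptr *p, int n, int desired)
{
	int total = 0, extra, i;

	for (i = 0; i < n; i++)
		total += p[i].durability;

	extra = total - desired;
	if (extra <= 0)
		return;

	/* Pass 1: prefer demoting pointers on devices outside the target. */
	for (i = 0; i < n; i++)
		if (p[i].durability && p[i].durability <= extra && !p[i].in_target) {
			p[i].cached = true;
			extra -= p[i].durability;
		}

	/* Pass 2: if still over, demote remaining pointers anywhere. */
	for (i = 0; i < n; i++)
		if (!p[i].cached && p[i].durability && p[i].durability <= extra) {
			p[i].cached = true;
			extra -= p[i].durability;
		}
}

int main(void)
{
	/* Three single-durability replicas, two desired, one on the target: */
	struct toy_ptr ptrs[] = {
		{ .durability = 1, .in_target = true  },
		{ .durability = 1, .in_target = false },
		{ .durability = 1, .in_target = false },
	};
	int i;

	toy_mark_replicas_cached(ptrs, 3, 2);

	for (i = 0; i < 3; i++)		/* expect: exactly one non-target ptr cached */
		printf("ptr %d: cached=%d\n", i, ptrs[i].cached);
	return 0;
}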
 
 /*
- * This picks a non-stale pointer, preferabbly from a device other than
- * avoid.  Avoid can be NULL, meaning pick any.  If there are no non-stale
- * pointers to other devices, it will still pick a pointer from avoid.
- * Note that it prefers lowered-numbered pointers to higher-numbered pointers
- * as the pointers are sorted by tier, hence preferring pointers to tier 0
- * rather than pointers to tier 1.
+ * This picks a non-stale pointer, preferably from a device not in @avoid.
+ * @avoid may be NULL, meaning pick from any device. If every non-stale
+ * pointer is on an avoided device, one of those is still picked.
  */
-void bch2_extent_pick_ptr_avoiding(struct bch_fs *c, struct bkey_s_c k,
-                                  struct bch_dev *avoid,
-                                  struct extent_pick_ptr *ret)
+void bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k,
+                         struct bch_devs_mask *avoid,
+                         struct extent_pick_ptr *ret)
 {
        struct bkey_s_c_extent e;
-       const union bch_extent_crc *crc;
-       const struct bch_extent_ptr *ptr;
 
        switch (k.k->type) {
        case KEY_TYPE_DELETED:
@@ -2022,32 +2074,7 @@ void bch2_extent_pick_ptr_avoiding(struct bch_fs *c, struct bkey_s_c k,
                e = bkey_s_c_to_extent(k);
                ret->ca = NULL;
 
-               extent_for_each_ptr_crc(e, ptr, crc) {
-                       struct bch_dev *ca = c->devs[ptr->dev];
-
-                       if (ptr->cached && ptr_stale(ca, ptr))
-                               continue;
-
-                       if (ca->mi.state == BCH_MEMBER_STATE_FAILED)
-                               continue;
-
-                       if (ret->ca &&
-                           (ca == avoid ||
-                            ret->ca->mi.tier < ca->mi.tier))
-                               continue;
-
-                       if (!percpu_ref_tryget(&ca->io_ref))
-                               continue;
-
-                       if (ret->ca)
-                               percpu_ref_put(&ret->ca->io_ref);
-
-                       *ret = (struct extent_pick_ptr) {
-                               .crc = crc_to_128(e.k, crc),
-                               .ptr = *ptr,
-                               .ca = ca,
-                       };
-               }
+               extent_pick_read_device(c, bkey_s_c_to_extent(k), avoid, ret);
 
                if (!ret->ca && !bkey_extent_is_cached(e.k))
                        ret->ca = ERR_PTR(-EIO);
@@ -2098,7 +2125,7 @@ static enum merge_result bch2_extent_merge(struct bch_fs *c,
 
                extent_for_each_entry(el, en_l) {
                        struct bch_extent_ptr *lp, *rp;
-                       unsigned bucket_size;
+                       struct bch_dev *ca;
 
                        en_r = vstruct_idx(er.v, (u64 *) en_l - el.v->_data);
 
@@ -2116,10 +2143,9 @@ static enum merge_result bch2_extent_merge(struct bch_fs *c,
                                return BCH_MERGE_NOMERGE;
 
                        /* We don't allow extents to straddle buckets: */
-                       bucket_size = c->devs[lp->dev]->mi.bucket_size;
+                       ca = bch_dev_bkey_exists(c, lp->dev);
 
-                       if ((lp->offset & ~((u64) bucket_size - 1)) !=
-                           (rp->offset & ~((u64) bucket_size - 1)))
+                       if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
                                return BCH_MERGE_NOMERGE;
                }
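The old straddle check masked both offsets with bucket_size - 1, which only yields the containing bucket when bucket_size is a power of two. Comparing PTR_BUCKET_NR() values avoids that assumption; assuming the macro boils down to dividing the pointer's sector offset by the device's bucket size, the check is roughly equivalent to this toy sketch (illustrative names, not the bcachefs helpers):

#include <stdbool.h>
#include <stdint.h>

/* Toy stand-in for PTR_BUCKET_NR(): which bucket a sector offset falls in. */
static inline uint64_t toy_bucket_nr(uint64_t sector, uint64_t bucket_size)
{
	return sector / bucket_size;	/* no power-of-two requirement */
}

/* Extents may only merge if both pointers land in the same bucket: */
static inline bool toy_same_bucket(uint64_t l_offset, uint64_t r_offset,
				   uint64_t bucket_size)
{
	return toy_bucket_nr(l_offset, bucket_size) ==
	       toy_bucket_nr(r_offset, bucket_size);
}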
 
@@ -2179,8 +2205,7 @@ static bool extent_merge_one_overlapping(struct btree_iter *iter,
                                         struct bkey_packed *k, struct bkey uk,
                                         bool check, bool could_pack)
 {
-       struct btree *b = iter->nodes[0];
-       struct btree_node_iter *node_iter = &iter->node_iters[0];
+       struct btree_iter_level *l = &iter->l[0];
 
        BUG_ON(!bkey_deleted(k));
 
@@ -2188,10 +2213,10 @@ static bool extent_merge_one_overlapping(struct btree_iter *iter,
                return !bkey_packed(k) || could_pack;
        } else {
                uk.p = new_pos;
-               extent_save(b, node_iter, k, &uk);
-               bch2_bset_fix_invalidated_key(b, t, k);
-               bch2_btree_node_iter_fix(iter, b, node_iter, t,
-                                       k, k->u64s, k->u64s);
+               extent_save(l->b, &l->iter, k, &uk);
+               bch2_bset_fix_invalidated_key(l->b, t, k);
+               bch2_btree_node_iter_fix(iter, l->b, &l->iter, t,
+                                        k, k->u64s, k->u64s);
                return true;
        }
 }
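This hunk and the ones below track a btree_iter layout change: the per-level node pointer and node iterator that used to live in parallel arrays (iter->nodes[], iter->node_iters[]) are now bundled per level in iter->l[]. Going by the accesses in this diff (l->b, &l->iter), the level struct presumably looks something like:

/* Assumed shape of the per-level iterator state; l[0] is the leaf level. */
struct btree_iter_level {
	struct btree		*b;	/* btree node held at this level */
	struct btree_node_iter	iter;	/* position within that node */
};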
@@ -2199,8 +2224,9 @@ static bool extent_merge_one_overlapping(struct btree_iter *iter,
 static bool extent_merge_do_overlapping(struct btree_iter *iter,
                                        struct bkey *m, bool back_merge)
 {
-       struct btree *b = iter->nodes[0];
-       struct btree_node_iter *node_iter = &iter->node_iters[0];
+       struct btree_iter_level *l = &iter->l[0];
+       struct btree *b = l->b;
+       struct btree_node_iter *node_iter = &l->iter;
        struct bset_tree *t;
        struct bkey_packed *k;
        struct bkey uk;
@@ -2291,8 +2317,8 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
                                     struct bkey_packed *r,
                                     bool back_merge)
 {
-       struct btree *b = iter->nodes[0];
-       struct btree_node_iter *node_iter = &iter->node_iters[0];
+       struct btree *b = iter->l[0].b;
+       struct btree_node_iter *node_iter = &iter->l[0].iter;
        const struct bkey_format *f = &b->format;
        struct bset_tree *t = bset_tree_last(b);
        struct bkey_packed *m;
@@ -2335,8 +2361,8 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
                if (back_merge)
                        bch2_btree_iter_set_pos_same_leaf(iter, li.k.k.p);
 
-               bch2_btree_node_iter_fix(iter, iter->nodes[0], node_iter,
-                                       t, m, m->u64s, m->u64s);
+               bch2_btree_node_iter_fix(iter, b, node_iter,
+                                        t, m, m->u64s, m->u64s);
 
                if (!back_merge)
                        bkey_copy(packed_to_bkey(l), &li.k);
@@ -2353,14 +2379,38 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
                extent_i_save(b, m, &li.k);
                bch2_bset_fix_invalidated_key(b, t, m);
 
-               bch2_btree_node_iter_fix(iter, iter->nodes[0], node_iter,
-                                       t, m, m->u64s, m->u64s);
+               bch2_btree_node_iter_fix(iter, b, node_iter,
+                                        t, m, m->u64s, m->u64s);
                return true;
        default:
                BUG();
        }
 }
 
+int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
+{
+       struct btree_iter iter;
+       struct bpos end = pos;
+       struct bkey_s_c k;
+       int ret = 0;
+
+       end.offset += size;
+
+       for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos,
+                            BTREE_ITER_SLOTS, k) {
+               if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
+                       break;
+
+               if (!bch2_extent_is_fully_allocated(k)) {
+                       ret = -ENOSPC;
+                       break;
+               }
+       }
+       bch2_btree_iter_unlock(&iter);
+
+       return ret;
+}
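The new bch2_check_range_allocated() walks the extents btree with BTREE_ITER_SLOTS, so holes come back as unallocated slot keys rather than being skipped, and it returns -ENOSPC as soon as anything in [pos, pos + size) is not fully allocated. A hypothetical caller might use it as below; the function and variable names are illustrative, and pos.offset/size are assumed to be in 512-byte sectors as elsewhere in this file.

/*
 * Illustrative only: before an overwrite that must not allocate new space,
 * verify that the target range of an inode is already fully allocated.
 */
static int can_overwrite_in_place(struct bch_fs *c, u64 inum,
				  u64 sector, u64 nr_sectors)
{
	/* 0 if every key in the range is fully allocated, -ENOSPC otherwise */
	return bch2_check_range_allocated(c, POS(inum, sector), nr_sectors);
}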
+
 const struct bkey_ops bch2_bkey_extent_ops = {
        .key_invalid    = bch2_extent_invalid,
        .key_debugcheck = bch2_extent_debugcheck,