Update bcachefs sources to d372ddcbfa bcachefs: Reorganize extents.c
author     Kent Overstreet <kent.overstreet@gmail.com>
           Mon, 18 Nov 2019 01:36:59 +0000 (20:36 -0500)
committer  Kent Overstreet <kent.overstreet@gmail.com>
           Mon, 18 Nov 2019 01:36:59 +0000 (20:36 -0500)
31 files changed:
.bcachefs_revision
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bkey.h
libbcachefs/bkey_methods.c
libbcachefs/bkey_on_stack.h [new file with mode: 0644]
libbcachefs/bkey_sort.c
libbcachefs/bset.c
libbcachefs/bset.h
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_leaf.c
libbcachefs/ec.c
libbcachefs/extent_update.c [new file with mode: 0644]
libbcachefs/extent_update.h [new file with mode: 0644]
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/fs-io.c
libbcachefs/fs.c
libbcachefs/io.c
libbcachefs/io.h
libbcachefs/journal.c
libbcachefs/journal_io.c
libbcachefs/migrate.c
libbcachefs/move.c
libbcachefs/recovery.c
libbcachefs/reflink.c
libbcachefs/super.c
libbcachefs/util.c
libbcachefs/util.h

diff --git a/.bcachefs_revision b/.bcachefs_revision
index e0172a413c68c3be1fa04b6ad6fb15f58ca2d44f..9543a55003c2b710d8e94bd7fc8d94535939c484 100644
@@ -1 +1 @@
-b1a4dc53be10a4c3132fccaaf604d73861a52d2d
+d372ddcbfabef5fcfd29bad150865cccc3faf172
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 323b663da278ccc1ddc1da8e833c90798b93f3a3..a6b9b0e61b3d43a0dd37083eddff10deb8e13432 100644
@@ -725,6 +725,8 @@ struct bch_fs {
 
        atomic64_t              key_version;
 
+       mempool_t               large_bkey_pool;
+
        /* REBALANCE */
        struct bch_fs_rebalance rebalance;
 
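Note: the new large_bkey_pool backs the bkey_on_stack helpers added later in this commit (libbcachefs/bkey_on_stack.h), which fall back to it when a key outgrows the on-stack buffer. A minimal sketch of how such a pool could be set up with the standard kernel mempool API follows; the element size (2048 bytes) and minimum reserve (1) are illustrative assumptions, since the actual initialization in super.c is not part of this excerpt.

/*
 * Hedged sketch, not the code from super.c: a kmalloc-backed mempool whose
 * elements are big enough for any oversized bkey. Size/reserve are guesses.
 */
static int large_bkey_pool_init(struct bch_fs *c)
{
	return mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048);
}

static void large_bkey_pool_exit(struct bch_fs *c)
{
	mempool_exit(&c->large_bkey_pool);
}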
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index d619e5caf09b11fd330617b2e7db552d738af6da..3d85012a15fdcda704310de7666da26183641d64 100644
@@ -338,7 +338,8 @@ static inline void bkey_init(struct bkey *k)
        x(quota,                13)                     \
        x(stripe,               14)                     \
        x(reflink_p,            15)                     \
-       x(reflink_v,            16)
+       x(reflink_v,            16)                     \
+       x(inline_data,          17)
 
 enum bch_bkey_type {
 #define x(name, nr) KEY_TYPE_##name    = nr,
@@ -911,6 +912,13 @@ struct bch_reflink_v {
        __u64                   _data[0];
 };
 
+/* Inline data */
+
+struct bch_inline_data {
+       struct bch_val          v;
+       u8                      data[0];
+};
+
 /* Optional/variable size superblock sections: */
 
 struct bch_sb_field {
@@ -1315,6 +1323,7 @@ enum bch_sb_features {
        BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5,
        BCH_FEATURE_REFLINK             = 6,
        BCH_FEATURE_NEW_SIPHASH         = 7,
+       BCH_FEATURE_INLINE_DATA         = 8,
        BCH_FEATURE_NR,
 };
 
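Note: KEY_TYPE_inline_data appears to let small amounts of file data live directly in the key's value (struct bch_inline_data is just an empty bch_val header followed by raw bytes), gated by the new BCH_FEATURE_INLINE_DATA superblock feature bit. A hedged sketch of reading such a key; bkey_s_c_to_inline_data is assumed to be the accessor generated by the BKEY_VAL_ACCESSORS(inline_data) line added in bkey.h below, everything else here is illustrative:

/*
 * Hedged sketch: copy out the payload of an inline_data key. The payload
 * length is simply bkey_val_bytes(), the same number the new
 * key_type_inline_data_to_text() handler prints.
 */
static unsigned inline_data_copy(struct bkey_s_c k, void *dst, unsigned dst_len)
{
	struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
	unsigned bytes = min_t(unsigned, dst_len, bkey_val_bytes(d.k));

	memcpy(dst, d.v->data, bytes);
	return bytes;
}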
diff --git a/libbcachefs/bkey.h b/libbcachefs/bkey.h
index b26f4934b264fa68f094396eacb3b201ccb86da5..f2d5f3009b210e440b639d27b3abd996720e4869 100644
@@ -33,6 +33,16 @@ struct bkey_s {
 
 #define bkey_next(_k)          vstruct_next(_k)
 
+static inline struct bkey_packed *bkey_next_skip_noops(struct bkey_packed *k,
+                                                      struct bkey_packed *end)
+{
+       k = bkey_next(k);
+
+       while (k != end && !k->u64s)
+               k = (void *) ((u64 *) k + 1);
+       return k;
+}
+
 #define bkey_val_u64s(_k)      ((_k)->u64s - BKEY_U64s)
 
 static inline size_t bkey_val_bytes(const struct bkey *k)
@@ -554,6 +564,7 @@ BKEY_VAL_ACCESSORS(quota);
 BKEY_VAL_ACCESSORS(stripe);
 BKEY_VAL_ACCESSORS(reflink_p);
 BKEY_VAL_ACCESSORS(reflink_v);
+BKEY_VAL_ACCESSORS(inline_data);
 
 /* byte order helpers */
 
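Note: the "noops" being skipped are keys whose u64s field is zero; since bkey_next() advances by k->u64s, a plain iterator would never get past one, which is why the helper crawls forward one u64 at a time and needs an explicit end pointer. A standalone illustration with generic stand-in types (not bcachefs API):

/*
 * Illustrative only: a length-prefixed record whose length can be zero.
 * A zero-length record carries no advance information, so the walker must
 * step one word at a time and be bounded by 'end'.
 */
struct rec {
	unsigned char u64s;	/* length in 64-bit words; 0 means padding */
};

static struct rec *rec_next_skip_pad(struct rec *r, struct rec *end)
{
	r = (struct rec *) ((unsigned long long *) r + r->u64s);

	while (r != end && !r->u64s)
		r = (struct rec *) ((unsigned long long *) r + 1);

	return r;
}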
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index f01405dd502bb64612f44303a0ca5daad58810f5..5312184c37f71105327af13bd9036c16c0f6f1f2 100644
@@ -63,6 +63,23 @@ static const char *key_type_cookie_invalid(const struct bch_fs *c,
        .key_invalid = empty_val_key_invalid,           \
 }
 
+static const char *key_type_inline_data_invalid(const struct bch_fs *c,
+                                          struct bkey_s_c k)
+{
+       return NULL;
+}
+
+static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c,
+                                        struct bkey_s_c k)
+{
+       pr_buf(out, "(%zu bytes)", bkey_val_bytes(k.k));
+}
+
+static const struct bkey_ops bch2_bkey_ops_inline_data = {
+       .key_invalid    = key_type_inline_data_invalid,
+       .val_to_text    = key_type_inline_data_to_text,
+};
+
 static const struct bkey_ops bch2_bkey_ops[] = {
 #define x(name, nr) [KEY_TYPE_##name]  = bch2_bkey_ops_##name,
        BCH_BKEY_TYPES()
@@ -83,9 +100,8 @@ const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
        if (k.k->u64s < BKEY_U64s)
                return "u64s too small";
 
-       if ((btree_node_type_is_extents(type) ||
-            type == BKEY_TYPE_BTREE) &&
-           bkey_val_u64s(k.k) > BKEY_EXTENT_VAL_U64s_MAX)
+       if (type == BKEY_TYPE_BTREE &&
+           bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
                return "value too big";
 
        if (btree_node_type_is_extents(type)) {
diff --git a/libbcachefs/bkey_on_stack.h b/libbcachefs/bkey_on_stack.h
new file mode 100644
index 0000000..d473903
--- /dev/null
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_ON_STACK_H
+#define _BCACHEFS_BKEY_ON_STACK_H
+
+#include "bcachefs.h"
+
+struct bkey_on_stack {
+       struct bkey_i   *k;
+       u64             onstack[12];
+};
+
+static inline void bkey_on_stack_realloc(struct bkey_on_stack *s,
+                                        struct bch_fs *c, unsigned u64s)
+{
+       if (s->k == (void *) s->onstack &&
+           u64s > ARRAY_SIZE(s->onstack)) {
+               s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS);
+               memcpy(s->k, s->onstack, sizeof(s->onstack));
+       }
+}
+
+static inline void bkey_on_stack_init(struct bkey_on_stack *s)
+{
+       s->k = (void *) s->onstack;
+}
+
+static inline void bkey_on_stack_exit(struct bkey_on_stack *s,
+                                     struct bch_fs *c)
+{
+       if (s->k != (void *) s->onstack)
+               mempool_free(s->k, &c->large_bkey_pool);
+       s->k = NULL;
+}
+
+#endif /* _BCACHEFS_BKEY_ON_STACK_H */
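Note: the usage pattern for bkey_on_stack is visible later in this commit (bkey_sort.c, ec.c, extent_update.c): init, grow to the size of the key about to be copied, reassemble into it, then exit. A sketch of that pattern; 'c' and 'k' are assumed to come from the caller:

/*
 * Hedged sketch of the bkey_on_stack pattern this commit introduces in
 * place of fixed-size BKEY_PADDED buffers (compare ec_stripe_update_ptrs()).
 */
static void bkey_on_stack_example(struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_on_stack sk;

	bkey_on_stack_init(&sk);			/* sk.k points at the 12-u64 on-stack buffer */
	bkey_on_stack_realloc(&sk, c, k.k->u64s);	/* switch to the mempool if the key is bigger */
	bkey_reassemble(sk.k, k);			/* copy the key into the buffer */

	/* ... modify sk.k, e.g. bch2_cut_front()/bch2_cut_back() ... */

	bkey_on_stack_exit(&sk, c);			/* free the mempool buffer, if one was used */
}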
diff --git a/libbcachefs/bkey_sort.c b/libbcachefs/bkey_sort.c
index 2cac269b386f96ec9774b8dab64eb9823fd19336..daef8e5c599f25a7b864f9710d15c2e6928f5c07 100644
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
+#include "bkey_on_stack.h"
 #include "bkey_sort.h"
 #include "bset.h"
 #include "extents.h"
@@ -74,6 +75,10 @@ static void sort_key_next(struct btree_node_iter_large *iter,
 {
        i->k += __btree_node_offset_to_key(b, i->k)->u64s;
 
+       while (i->k != i->end &&
+              !__btree_node_offset_to_key(b, i->k)->u64s)
+               i->k++;
+
        if (i->k == i->end)
                *i = iter->data[--iter->used];
 }
@@ -118,7 +123,7 @@ static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter)
 
 static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp)
 {
-       iter->data->k = bkey_next(iter->data->k);
+       iter->data->k = bkey_next_skip_noops(iter->data->k, iter->data->end);
 
        BUG_ON(iter->data->k > iter->data->end);
 
@@ -292,8 +297,10 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
        struct bkey l_unpacked, r_unpacked;
        struct bkey_s l, r;
        struct btree_nr_keys nr;
+       struct bkey_on_stack split;
 
        memset(&nr, 0, sizeof(nr));
+       bkey_on_stack_init(&split);
 
        heap_resort(iter, extent_sort_cmp, NULL);
 
@@ -343,29 +350,29 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
                        if (bkey_cmp(l.k->p, r.k->p) >= 0) {
                                sort_key_next(iter, b, _r);
                        } else {
-                               __bch2_cut_front(l.k->p, r);
+                               bch2_cut_front_s(l.k->p, r);
                                extent_save(b, rk, r.k);
                        }
 
                        extent_sort_sift(iter, b, _r - iter->data);
                } else if (bkey_cmp(l.k->p, r.k->p) > 0) {
-                       BKEY_PADDED(k) tmp;
+                       bkey_on_stack_realloc(&split, c, l.k->u64s);
 
                        /*
                         * r wins, but it overlaps in the middle of l - split l:
                         */
-                       bkey_reassemble(&tmp.k, l.s_c);
-                       bch2_cut_back(bkey_start_pos(r.k), &tmp.k.k);
+                       bkey_reassemble(split.k, l.s_c);
+                       bch2_cut_back(bkey_start_pos(r.k), split.k);
 
-                       __bch2_cut_front(r.k->p, l);
+                       bch2_cut_front_s(r.k->p, l);
                        extent_save(b, lk, l.k);
 
                        extent_sort_sift(iter, b, 0);
 
                        extent_sort_append(c, f, &nr, dst->start,
-                                          &prev, bkey_i_to_s(&tmp.k));
+                                          &prev, bkey_i_to_s(split.k));
                } else {
-                       bch2_cut_back(bkey_start_pos(r.k), l.k);
+                       bch2_cut_back_s(bkey_start_pos(r.k), l);
                        extent_save(b, lk, l.k);
                }
        }
@@ -373,6 +380,8 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c,
        extent_sort_advance_prev(f, &nr, dst->start, &prev);
 
        dst->u64s = cpu_to_le16((u64 *) prev - dst->_data);
+
+       bkey_on_stack_exit(&split, c);
        return nr;
 }
 
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index b7618e2b28f797998dca2fc394ee2d129b20adda..a0f0b0eadffb07339a9b0196952df8dbce6078e8 100644
@@ -76,7 +76,7 @@ void bch2_dump_bset(struct btree *b, struct bset *i, unsigned set)
        for (_k = i->start, k = bkey_unpack_key(b, _k);
             _k < vstruct_last(i);
             _k = _n, k = n) {
-               _n = bkey_next(_k);
+               _n = bkey_next_skip_noops(_k, vstruct_last(i));
 
                bch2_bkey_to_text(&PBUF(buf), &k);
                printk(KERN_ERR "block %u key %5u: %s\n", set,
@@ -144,9 +144,7 @@ void __bch2_verify_btree_nr_keys(struct btree *b)
        struct btree_nr_keys nr = { 0 };
 
        for_each_bset(b, t)
-               for (k = btree_bkey_first(b, t);
-                    k != btree_bkey_last(b, t);
-                    k = bkey_next(k))
+               bset_tree_for_each_key(b, t, k)
                        if (!bkey_whiteout(k))
                                btree_keys_account_key_add(&nr, t - b->set, k);
 
@@ -612,7 +610,7 @@ start:
                               rw_aux_tree(b, t)[j - 1].offset);
                }
 
-               k = bkey_next(k);
+               k = bkey_next_skip_noops(k, btree_bkey_last(b, t));
                BUG_ON(k >= btree_bkey_last(b, t));
        }
 }
@@ -803,9 +801,7 @@ static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t)
        rw_aux_tree(b, t)[0].offset =
                __btree_node_key_to_offset(b, btree_bkey_first(b, t));
 
-       for (k = btree_bkey_first(b, t);
-            k != btree_bkey_last(b, t);
-            k = bkey_next(k)) {
+       bset_tree_for_each_key(b, t, k) {
                if (t->size == bset_rw_tree_capacity(b, t))
                        break;
 
@@ -838,7 +834,7 @@ retry:
        /* First we figure out where the first key in each cacheline is */
        eytzinger1_for_each(j, t->size) {
                while (bkey_to_cacheline(b, t, k) < cacheline)
-                       prev = k, k = bkey_next(k);
+                       prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t));
 
                if (k >= btree_bkey_last(b, t)) {
                        /* XXX: this path sucks */
@@ -854,10 +850,10 @@ retry:
                EBUG_ON(tree_to_bkey(b, t, j) != k);
        }
 
-       while (bkey_next(k) != btree_bkey_last(b, t))
-               k = bkey_next(k);
+       while (k != btree_bkey_last(b, t))
+               prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t));
 
-       t->max_key = bkey_unpack_pos(b, k);
+       t->max_key = bkey_unpack_pos(b, prev);
 
        /* Then we build the tree */
        eytzinger1_for_each(j, t->size)
@@ -983,7 +979,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
        struct bkey_packed *p, *i, *ret = NULL, *orig_k = k;
 
        while ((p = __bkey_prev(b, t, k)) && !ret) {
-               for (i = p; i != k; i = bkey_next(i))
+               for (i = p; i != k; i = bkey_next_skip_noops(i, k))
                        if (i->type >= min_key_type)
                                ret = i;
 
@@ -993,9 +989,11 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b,
        if (btree_keys_expensive_checks(b)) {
                BUG_ON(ret >= orig_k);
 
-               for (i = ret ? bkey_next(ret) : btree_bkey_first(b, t);
+               for (i = ret
+                       ? bkey_next_skip_noops(ret, orig_k)
+                       : btree_bkey_first(b, t);
                     i != orig_k;
-                    i = bkey_next(i))
+                    i = bkey_next_skip_noops(i, orig_k))
                        BUG_ON(i->type >= min_key_type);
        }
 
@@ -1030,7 +1028,7 @@ static void ro_aux_tree_fix_invalidated_key(struct btree *b,
        /* signal to make_bfloat() that they're uninitialized: */
        min_key.u64s = max_key.u64s = 0;
 
-       if (bkey_next(k) == btree_bkey_last(b, t)) {
+       if (bkey_next_skip_noops(k, btree_bkey_last(b, t)) == btree_bkey_last(b, t)) {
                t->max_key = bkey_unpack_pos(b, k);
 
                for (j = 1; j < t->size; j = j * 2 + 1)
@@ -1154,7 +1152,7 @@ static void bch2_bset_fix_lookup_table(struct btree *b,
                struct bkey_packed *k = start;
 
                while (1) {
-                       k = bkey_next(k);
+                       k = bkey_next_skip_noops(k, end);
                        if (k == end)
                                break;
 
@@ -1403,12 +1401,12 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b,
                while (m != btree_bkey_last(b, t) &&
                       bkey_iter_cmp_p_or_unp(b, search, lossy_packed_search,
                                              m) > 0)
-                       m = bkey_next(m);
+                       m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
 
        if (!packed_search)
                while (m != btree_bkey_last(b, t) &&
                       bkey_iter_pos_cmp(b, search, m) > 0)
-                       m = bkey_next(m);
+                       m = bkey_next_skip_noops(m, btree_bkey_last(b, t));
 
        if (btree_keys_expensive_checks(b)) {
                struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m);
@@ -1642,6 +1640,10 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter,
 
        EBUG_ON(iter->data->k > iter->data->end);
 
+       while (!__btree_node_iter_set_end(iter, 0) &&
+              !__bch2_btree_node_iter_peek_all(iter, b)->u64s)
+               iter->data->k++;
+
        if (unlikely(__btree_node_iter_set_end(iter, 0))) {
                bch2_btree_node_iter_set_drop(iter, iter->data);
                return;
diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h
index ccc0866d64359ad48029fa785e23987c1e848509..2653a74b3b14ad5741d0ab15af4ef685c2c4455d 100644
@@ -284,9 +284,14 @@ static inline struct bkey_s __bkey_disassemble(struct btree *b,
        return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), };
 }
 
-#define for_each_bset(_b, _t)                                  \
+#define for_each_bset(_b, _t)                                          \
        for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++)
 
+#define bset_tree_for_each_key(_b, _t, _k)                             \
+       for (_k = btree_bkey_first(_b, _t);                             \
+            _k != btree_bkey_last(_b, _t);                             \
+            _k = bkey_next_skip_noops(_k, btree_bkey_last(_b, _t)))
+
 static inline bool bset_has_ro_aux_tree(struct bset_tree *t)
 {
        return bset_aux_tree_type(t) == BSET_RO_AUX_TREE;
@@ -564,6 +569,16 @@ static inline void btree_keys_account_key(struct btree_nr_keys *n,
                n->unpacked_keys += sign;
 }
 
+static inline void btree_keys_account_val_delta(struct btree *b,
+                                               struct bkey_packed *k,
+                                               int delta)
+{
+       struct bset_tree *t = bch2_bkey_to_bset(b, k);
+
+       b->nr.live_u64s                 += delta;
+       b->nr.bset_u64s[t - b->set]     += delta;
+}
+
 #define btree_keys_account_key_add(_nr, _bset_idx, _k)         \
        btree_keys_account_key(_nr, _bset_idx, _k, 1)
 #define btree_keys_account_key_drop(_nr, _bset_idx, _k)        \
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index c4c2e1a3ee0e944e2240e736b00eb0befc7d192e..8bbf60b07736064098c84f8596d926cca9758958 100644
@@ -922,7 +922,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
                     k < vstruct_last(s2) &&
                     vstruct_blocks_plus(n1->data, c->block_bits,
                                         u64s + k->u64s) <= blocks;
-                    k = bkey_next(k)) {
+                    k = bkey_next_skip_noops(k, vstruct_last(s2))) {
                        last = k;
                        u64s += k->u64s;
                }
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 591980d2011fa7a44cb4f0c10d07e3c6848e4735..c345262d804b29d643971045f7010854210b69ca 100644
@@ -26,34 +26,33 @@ static void verify_no_dups(struct btree *b,
                           struct bkey_packed *end)
 {
 #ifdef CONFIG_BCACHEFS_DEBUG
-       struct bkey_packed *k;
+       struct bkey_packed *k, *p;
+
+       if (start == end)
+               return;
 
-       for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) {
-               struct bkey l = bkey_unpack_key(b, k);
-               struct bkey r = bkey_unpack_key(b, bkey_next(k));
+       for (p = start, k = bkey_next_skip_noops(start, end);
+            k != end;
+            p = k, k = bkey_next_skip_noops(k, end)) {
+               struct bkey l = bkey_unpack_key(b, p);
+               struct bkey r = bkey_unpack_key(b, k);
 
                BUG_ON(btree_node_is_extents(b)
                       ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0
                       : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0);
-               //BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0);
+               //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0);
        }
 #endif
 }
 
-static void clear_needs_whiteout(struct bset *i)
-{
-       struct bkey_packed *k;
-
-       for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
-               k->needs_whiteout = false;
-}
-
-static void set_needs_whiteout(struct bset *i)
+static void set_needs_whiteout(struct bset *i, int v)
 {
        struct bkey_packed *k;
 
-       for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
-               k->needs_whiteout = true;
+       for (k = i->start;
+            k != vstruct_last(i);
+            k = bkey_next_skip_noops(k, vstruct_last(i)))
+               k->needs_whiteout = v;
 }
 
 static void btree_bounce_free(struct bch_fs *c, unsigned order,
@@ -168,7 +167,7 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b,
                out = i->start;
 
                for (k = start; k != end; k = n) {
-                       n = bkey_next(k);
+                       n = bkey_next_skip_noops(k, end);
 
                        if (bkey_deleted(k) && btree_node_is_extents(b))
                                continue;
@@ -261,7 +260,7 @@ static bool bch2_drop_whiteouts(struct btree *b)
                out = i->start;
 
                for (k = start; k != end; k = n) {
-                       n = bkey_next(k);
+                       n = bkey_next_skip_noops(k, end);
 
                        if (!bkey_whiteout(k)) {
                                bkey_copy(out, k);
@@ -680,14 +679,6 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
                struct bkey tmp;
                const char *invalid;
 
-               if (btree_err_on(!k->u64s,
-                                BTREE_ERR_FIXABLE, c, b, i,
-                                "KEY_U64s 0: %zu bytes of metadata lost",
-                                vstruct_end(i) - (void *) k)) {
-                       i->u64s = cpu_to_le16((u64 *) k - i->_data);
-                       break;
-               }
-
                if (btree_err_on(bkey_next(k) > vstruct_last(i),
                                 BTREE_ERR_FIXABLE, c, b, i,
                                 "key extends past end of bset")) {
@@ -756,7 +747,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
 
                prev_pos = u.k->p;
                prev = k;
-               k = bkey_next(k);
+               k = bkey_next_skip_noops(k, vstruct_last(i));
        }
 
        SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN);
@@ -915,12 +906,12 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
                        continue;
                }
 
-               k = bkey_next(k);
+               k = bkey_next_skip_noops(k, vstruct_last(i));
        }
 
        bch2_bset_build_aux_tree(b, b->set, false);
 
-       set_needs_whiteout(btree_bset_first(b));
+       set_needs_whiteout(btree_bset_first(b), true);
 
        btree_node_reset_sib_u64s(b);
 out:
@@ -1425,7 +1416,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                : bch2_sort_keys(i->start, &sort_iter, false);
        le16_add_cpu(&i->u64s, u64s);
 
-       clear_needs_whiteout(i);
+       set_needs_whiteout(i, false);
 
        /* do we have data to write? */
        if (b->written && !i->u64s)
@@ -1579,7 +1570,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
        }
 
        for_each_bset(b, t)
-               set_needs_whiteout(bset(b, t));
+               set_needs_whiteout(bset(b, t), true);
 
        bch2_btree_verify(c, b);
 
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 9d5687ec0ead83ef0bb24ed76ac74989cbe592f1..f8a30cb34750b27b7b4ddc767b15814eb8beb62d 100644
@@ -79,9 +79,7 @@ void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
        bch2_bkey_format_add_pos(s, b->data->min_key);
 
        for_each_bset(b, t)
-               for (k = btree_bkey_first(b, t);
-                    k != btree_bkey_last(b, t);
-                    k = bkey_next(k))
+               bset_tree_for_each_key(b, t, k)
                        if (!bkey_whiteout(k)) {
                                uk = bkey_unpack_key(b, k);
                                bch2_bkey_format_add_key(s, &uk);
@@ -1240,7 +1238,9 @@ static struct btree *__btree_split_node(struct btree_update *as,
         */
        k = set1->start;
        while (1) {
-               if (bkey_next(k) == vstruct_last(set1))
+               struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1));
+
+               if (n == vstruct_last(set1))
                        break;
                if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
                        break;
@@ -1251,7 +1251,7 @@ static struct btree *__btree_split_node(struct btree_update *as,
                        nr_unpacked++;
 
                prev = k;
-               k = bkey_next(k);
+               k = n;
        }
 
        BUG_ON(!prev);
@@ -1315,7 +1315,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
 {
        struct btree_node_iter node_iter;
        struct bkey_i *k = bch2_keylist_front(keys);
-       struct bkey_packed *p;
+       struct bkey_packed *src, *dst, *n;
        struct bset *i;
 
        BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE);
@@ -1340,16 +1340,18 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
         * for the pivot:
         */
        i = btree_bset_first(b);
-       p = i->start;
-       while (p != vstruct_last(i))
-               if (bkey_deleted(p)) {
-                       le16_add_cpu(&i->u64s, -p->u64s);
-                       set_btree_bset_end(b, b->set);
-                       memmove_u64s_down(p, bkey_next(p),
-                                         (u64 *) vstruct_last(i) -
-                                         (u64 *) p);
-               } else
-                       p = bkey_next(p);
+       src = dst = i->start;
+       while (src != vstruct_last(i)) {
+               n = bkey_next_skip_noops(src, vstruct_last(i));
+               if (!bkey_deleted(src)) {
+                       memmove_u64s_down(dst, src, src->u64s);
+                       dst = bkey_next(dst);
+               }
+               src = n;
+       }
+
+       i->u64s = cpu_to_le16((u64 *) dst - i->_data);
+       set_btree_bset_end(b, b->set);
 
        BUG_ON(b->nsets != 1 ||
               b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));
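Note: btree_split_insert_keys() now drops deleted keys with a single pass using separate read/write cursors instead of a memmove of the whole tail per deletion; the bset size and end are fixed up once afterwards. The same pattern on fixed-size entries, for illustration only:

/*
 * Illustrative compaction with src/dst cursors: keep live entries in one
 * pass, preserving order; in the diff above the entries are variable-length
 * packed keys, hence memmove_u64s_down() and bkey_next().
 */
struct entry {
	int live;
	int payload;
};

static unsigned compact(struct entry *v, unsigned nr)
{
	unsigned src, dst = 0;

	for (src = 0; src < nr; src++)
		if (v[src].live)
			v[dst++] = v[src];

	return dst;	/* new entry count */
}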
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 051368cd4a77e36ab316b624b702dcfa8801f9cb..54893b7b151e24ba0fb86546f2654ec8e40d5efb 100644
@@ -10,7 +10,7 @@
 #include "buckets.h"
 #include "debug.h"
 #include "error.h"
-#include "extents.h"
+#include "extent_update.h"
 #include "journal.h"
 #include "journal_reclaim.h"
 #include "keylist.h"
@@ -886,7 +886,7 @@ retry:
 
                        /* create the biggest key we can */
                        bch2_key_resize(&delete.k, max_sectors);
-                       bch2_cut_back(end, &delete.k);
+                       bch2_cut_back(end, &delete);
 
                        ret = bch2_extent_trim_atomic(&delete, iter);
                        if (ret)
diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c
index ad92d3b452c068e3dfb1ba1894a87b7c456a0bf1..e0ca0c5dcb6fbdf2d8b2ecdd58cabb1fcfa9d0a8 100644
@@ -4,6 +4,7 @@
 
 #include "bcachefs.h"
 #include "alloc_foreground.h"
+#include "bkey_on_stack.h"
 #include "bset.h"
 #include "btree_gc.h"
 #include "btree_update.h"
@@ -776,10 +777,10 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
        struct btree_iter *iter;
        struct bkey_s_c k;
        struct bkey_s_extent e;
-       struct bch_extent_ptr *ptr;
-       BKEY_PADDED(k) tmp;
+       struct bkey_on_stack sk;
        int ret = 0, dev, idx;
 
+       bkey_on_stack_init(&sk);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
        iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
@@ -789,6 +790,8 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
        while ((k = bch2_btree_iter_peek(iter)).k &&
               !(ret = bkey_err(k)) &&
               bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) {
+               struct bch_extent_ptr *ptr, *ec_ptr = NULL;
+
                if (extent_has_stripe_ptr(k, s->key.k.p.offset)) {
                        bch2_btree_iter_next(iter);
                        continue;
@@ -804,19 +807,20 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
 
                dev = s->key.v.ptrs[idx].dev;
 
-               bkey_reassemble(&tmp.k, k);
-               e = bkey_i_to_s_extent(&tmp.k);
+               bkey_on_stack_realloc(&sk, c, k.k->u64s);
+               bkey_reassemble(sk.k, k);
+               e = bkey_i_to_s_extent(sk.k);
 
-               extent_for_each_ptr(e, ptr)
-                       if (ptr->dev != dev)
+               extent_for_each_ptr(e, ptr) {
+                       if (ptr->dev == dev)
+                               ec_ptr = ptr;
+                       else
                                ptr->cached = true;
+               }
 
-               ptr = (void *) bch2_extent_has_device(e.c, dev);
-               BUG_ON(!ptr);
-
-               extent_stripe_ptr_add(e, s, ptr, idx);
+               extent_stripe_ptr_add(e, s, ec_ptr, idx);
 
-               bch2_trans_update(&trans, iter, &tmp.k);
+               bch2_trans_update(&trans, iter, sk.k);
 
                ret = bch2_trans_commit(&trans, NULL, NULL,
                                        BTREE_INSERT_ATOMIC|
@@ -829,6 +833,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
        }
 
        bch2_trans_exit(&trans);
+       bkey_on_stack_exit(&sk, c);
 
        return ret;
 }
diff --git a/libbcachefs/extent_update.c b/libbcachefs/extent_update.c
new file mode 100644
index 0000000..91ceb5d
--- /dev/null
@@ -0,0 +1,532 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "bkey_on_stack.h"
+#include "btree_update.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "debug.h"
+#include "extents.h"
+#include "extent_update.h"
+
+/*
+ * This counts the number of iterators to the alloc & ec btrees we'll need
+ * inserting/removing this extent:
+ */
+static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const union bch_extent_entry *entry;
+       unsigned ret = 0;
+
+       bkey_extent_entry_for_each(ptrs, entry) {
+               switch (__extent_entry_type(entry)) {
+               case BCH_EXTENT_ENTRY_ptr:
+               case BCH_EXTENT_ENTRY_stripe_ptr:
+                       ret++;
+               }
+       }
+
+       return ret;
+}
+
+static int count_iters_for_insert(struct btree_trans *trans,
+                                 struct bkey_s_c k,
+                                 unsigned offset,
+                                 struct bpos *end,
+                                 unsigned *nr_iters,
+                                 unsigned max_iters,
+                                 bool overwrite)
+{
+       int ret = 0;
+
+       switch (k.k->type) {
+       case KEY_TYPE_extent:
+       case KEY_TYPE_reflink_v:
+               *nr_iters += bch2_bkey_nr_alloc_ptrs(k);
+
+               if (*nr_iters >= max_iters) {
+                       *end = bpos_min(*end, k.k->p);
+                       ret = 1;
+               }
+
+               break;
+       case KEY_TYPE_reflink_p: {
+               struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+               u64 idx = le64_to_cpu(p.v->idx);
+               unsigned sectors = bpos_min(*end, p.k->p).offset -
+                       bkey_start_offset(p.k);
+               struct btree_iter *iter;
+               struct bkey_s_c r_k;
+
+               for_each_btree_key(trans, iter,
+                                  BTREE_ID_REFLINK, POS(0, idx + offset),
+                                  BTREE_ITER_SLOTS, r_k, ret) {
+                       if (bkey_cmp(bkey_start_pos(r_k.k),
+                                    POS(0, idx + sectors)) >= 0)
+                               break;
+
+                       *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k);
+
+                       if (*nr_iters >= max_iters) {
+                               struct bpos pos = bkey_start_pos(k.k);
+                               pos.offset += r_k.k->p.offset - idx;
+
+                               *end = bpos_min(*end, pos);
+                               ret = 1;
+                               break;
+                       }
+               }
+
+               bch2_trans_iter_put(trans, iter);
+               break;
+       }
+       }
+
+       return ret;
+}
+
+#define EXTENT_ITERS_MAX       (BTREE_ITER_MAX / 3)
+
+int bch2_extent_atomic_end(struct btree_iter *iter,
+                          struct bkey_i *insert,
+                          struct bpos *end)
+{
+       struct btree_trans *trans = iter->trans;
+       struct btree *b;
+       struct btree_node_iter  node_iter;
+       struct bkey_packed      *_k;
+       unsigned                nr_iters = 0;
+       int ret;
+
+       ret = bch2_btree_iter_traverse(iter);
+       if (ret)
+               return ret;
+
+       b = iter->l[0].b;
+       node_iter = iter->l[0].iter;
+
+       BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0);
+
+       *end = bpos_min(insert->k.p, b->key.k.p);
+
+       ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end,
+                                    &nr_iters, EXTENT_ITERS_MAX / 2, false);
+       if (ret < 0)
+               return ret;
+
+       while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
+                                                     KEY_TYPE_discard))) {
+               struct bkey     unpacked;
+               struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
+               unsigned offset = 0;
+
+               if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0)
+                       break;
+
+               if (bkey_cmp(bkey_start_pos(&insert->k),
+                            bkey_start_pos(k.k)) > 0)
+                       offset = bkey_start_offset(&insert->k) -
+                               bkey_start_offset(k.k);
+
+               ret = count_iters_for_insert(trans, k, offset, end,
+                                       &nr_iters, EXTENT_ITERS_MAX, true);
+               if (ret)
+                       break;
+
+               bch2_btree_node_iter_advance(&node_iter, b);
+       }
+
+       return ret < 0 ? ret : 0;
+}
+
+int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter)
+{
+       struct bpos end;
+       int ret;
+
+       ret = bch2_extent_atomic_end(iter, k, &end);
+       if (ret)
+               return ret;
+
+       bch2_cut_back(end, k);
+       return 0;
+}
+
+int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter)
+{
+       struct bpos end;
+       int ret;
+
+       ret = bch2_extent_atomic_end(iter, k, &end);
+       if (ret)
+               return ret;
+
+       return !bkey_cmp(end, k->k.p);
+}
+
+enum btree_insert_ret
+bch2_extent_can_insert(struct btree_trans *trans,
+                      struct btree_insert_entry *insert,
+                      unsigned *u64s)
+{
+       struct btree_iter_level *l = &insert->iter->l[0];
+       struct btree_node_iter node_iter = l->iter;
+       enum bch_extent_overlap overlap;
+       struct bkey_packed *_k;
+       struct bkey unpacked;
+       struct bkey_s_c k;
+       int sectors;
+
+       /*
+        * We avoid creating whiteouts whenever possible when deleting, but
+        * those optimizations mean we may potentially insert two whiteouts
+        * instead of one (when we overlap with the front of one extent and the
+        * back of another):
+        */
+       if (bkey_whiteout(&insert->k->k))
+               *u64s += BKEY_U64s;
+
+       _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b,
+                                             KEY_TYPE_discard);
+       if (!_k)
+               return BTREE_INSERT_OK;
+
+       k = bkey_disassemble(l->b, _k, &unpacked);
+
+       overlap = bch2_extent_overlap(&insert->k->k, k.k);
+
+       /* account for having to split existing extent: */
+       if (overlap == BCH_EXTENT_OVERLAP_MIDDLE)
+               *u64s += _k->u64s;
+
+       if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
+           (sectors = bch2_bkey_sectors_compressed(k))) {
+               int flags = trans->flags & BTREE_INSERT_NOFAIL
+                       ? BCH_DISK_RESERVATION_NOFAIL : 0;
+
+               switch (bch2_disk_reservation_add(trans->c,
+                               trans->disk_res,
+                               sectors, flags)) {
+               case 0:
+                       break;
+               case -ENOSPC:
+                       return BTREE_INSERT_ENOSPC;
+               default:
+                       BUG();
+               }
+       }
+
+       return BTREE_INSERT_OK;
+}
+
+static void verify_extent_nonoverlapping(struct bch_fs *c,
+                                        struct btree *b,
+                                        struct btree_node_iter *_iter,
+                                        struct bkey_i *insert)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+       struct btree_node_iter iter;
+       struct bkey_packed *k;
+       struct bkey uk;
+
+       if (!expensive_debug_checks(c))
+               return;
+
+       iter = *_iter;
+       k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard);
+       BUG_ON(k &&
+              (uk = bkey_unpack_key(b, k),
+               bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0));
+
+       iter = *_iter;
+       k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard);
+#if 0
+       BUG_ON(k &&
+              (uk = bkey_unpack_key(b, k),
+               bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0);
+#else
+       if (k &&
+           (uk = bkey_unpack_key(b, k),
+            bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) {
+               char buf1[100];
+               char buf2[100];
+
+               bch2_bkey_to_text(&PBUF(buf1), &insert->k);
+               bch2_bkey_to_text(&PBUF(buf2), &uk);
+
+               bch2_dump_btree_node(b);
+               panic("insert > next :\n"
+                     "insert %s\n"
+                     "next   %s\n",
+                     buf1, buf2);
+       }
+#endif
+
+#endif
+}
+
+static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
+                              struct bkey_i *insert)
+{
+       struct btree_iter_level *l = &iter->l[0];
+       struct bkey_packed *k =
+               bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b));
+
+       BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b));
+
+       EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
+       verify_extent_nonoverlapping(c, l->b, &l->iter, insert);
+
+       if (debug_check_bkeys(c))
+               bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert));
+
+       bch2_bset_insert(l->b, &l->iter, k, insert, 0);
+       bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s);
+}
+
+static void
+extent_squash(struct bch_fs *c, struct btree_iter *iter,
+             struct bkey_i *insert,
+             struct bkey_packed *_k, struct bkey_s k,
+             enum bch_extent_overlap overlap)
+{
+       struct btree_iter_level *l = &iter->l[0];
+       int u64s_delta;
+
+       switch (overlap) {
+       case BCH_EXTENT_OVERLAP_FRONT:
+               /* insert overlaps with start of k: */
+               u64s_delta = bch2_cut_front_s(insert->k.p, k);
+               btree_keys_account_val_delta(l->b, _k, u64s_delta);
+
+               EBUG_ON(bkey_deleted(k.k));
+               extent_save(l->b, _k, k.k);
+               bch2_btree_iter_fix_key_modified(iter, l->b, _k);
+               break;
+
+       case BCH_EXTENT_OVERLAP_BACK:
+               /* insert overlaps with end of k: */
+               u64s_delta = bch2_cut_back_s(bkey_start_pos(&insert->k), k);
+               btree_keys_account_val_delta(l->b, _k, u64s_delta);
+
+               EBUG_ON(bkey_deleted(k.k));
+               extent_save(l->b, _k, k.k);
+
+               /*
+                * As the auxiliary tree is indexed by the end of the
+                * key and we've just changed the end, update the
+                * auxiliary tree.
+                */
+               bch2_bset_fix_invalidated_key(l->b, _k);
+               bch2_btree_node_iter_fix(iter, l->b, &l->iter,
+                                        _k, _k->u64s, _k->u64s);
+               break;
+
+       case BCH_EXTENT_OVERLAP_ALL: {
+               /* The insert key completely covers k, invalidate k */
+               if (!bkey_whiteout(k.k))
+                       btree_account_key_drop(l->b, _k);
+
+               k.k->size = 0;
+               k.k->type = KEY_TYPE_deleted;
+
+               if (_k >= btree_bset_last(l->b)->start) {
+                       unsigned u64s = _k->u64s;
+
+                       bch2_bset_delete(l->b, _k, _k->u64s);
+                       bch2_btree_node_iter_fix(iter, l->b, &l->iter,
+                                                _k, u64s, 0);
+               } else {
+                       extent_save(l->b, _k, k.k);
+                       bch2_btree_iter_fix_key_modified(iter, l->b, _k);
+               }
+
+               break;
+       }
+       case BCH_EXTENT_OVERLAP_MIDDLE: {
+               struct bkey_on_stack split;
+
+               bkey_on_stack_init(&split);
+               bkey_on_stack_realloc(&split, c, k.k->u64s);
+
+               /*
+                * The insert key falls 'in the middle' of k
+                * The insert key splits k in 3:
+                * - start only in k, preserve
+                * - middle common section, invalidate in k
+                * - end only in k, preserve
+                *
+                * We update the old key to preserve the start,
+                * insert will be the new common section,
+                * we manually insert the end that we are preserving.
+                *
+                * modify k _before_ doing the insert (which will move
+                * what k points to)
+                */
+               bkey_reassemble(split.k, k.s_c);
+               split.k->k.needs_whiteout |= bkey_written(l->b, _k);
+
+               bch2_cut_back(bkey_start_pos(&insert->k), split.k);
+               BUG_ON(bkey_deleted(&split.k->k));
+
+               u64s_delta = bch2_cut_front_s(insert->k.p, k);
+               btree_keys_account_val_delta(l->b, _k, u64s_delta);
+
+               BUG_ON(bkey_deleted(k.k));
+               extent_save(l->b, _k, k.k);
+               bch2_btree_iter_fix_key_modified(iter, l->b, _k);
+
+               extent_bset_insert(c, iter, split.k);
+               bkey_on_stack_exit(&split, c);
+               break;
+       }
+       }
+}
+
+/**
+ * bch_extent_insert_fixup - insert a new extent and deal with overlaps
+ *
+ * this may result in not actually doing the insert, or inserting some subset
+ * of the insert key. For cmpxchg operations this is where that logic lives.
+ *
+ * All subsets of @insert that need to be inserted are inserted using
+ * bch2_btree_insert_and_journal(). If @b or @res fills up, this function
+ * returns false, setting @iter->pos for the prefix of @insert that actually got
+ * inserted.
+ *
+ * BSET INVARIANTS: this function is responsible for maintaining all the
+ * invariants for bsets of extents in memory. things get really hairy with 0
+ * size extents
+ *
+ * within one bset:
+ *
+ * bkey_start_pos(bkey_next(k)) >= k
+ * or bkey_start_offset(bkey_next(k)) >= k->offset
+ *
+ * i.e. strict ordering, no overlapping extents.
+ *
+ * multiple bsets (i.e. full btree node):
+ *
+ * ∀ k, j
+ *   k.size != 0 ∧ j.size != 0 →
+ *     ¬ (k > bkey_start_pos(j) ∧ k < j)
+ *
+ * i.e. no two overlapping keys _of nonzero size_
+ *
+ * We can't realistically maintain this invariant for zero size keys because of
+ * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j
+ * there may be another 0 size key between them in another bset, and it will
+ * thus overlap with the merged key.
+ *
+ * In addition, the end of iter->pos indicates how much has been processed.
+ * If the end of iter->pos is not the same as the end of insert, then
+ * key insertion needs to continue/be retried.
+ */
+void bch2_insert_fixup_extent(struct btree_trans *trans,
+                             struct btree_insert_entry *insert_entry)
+{
+       struct bch_fs *c = trans->c;
+       struct btree_iter *iter = insert_entry->iter;
+       struct bkey_i *insert   = insert_entry->k;
+       struct btree_iter_level *l = &iter->l[0];
+       struct btree_node_iter node_iter = l->iter;
+       bool deleting           = bkey_whiteout(&insert->k);
+       bool update_journal     = !deleting;
+       bool update_btree       = !deleting;
+       struct bkey_i whiteout  = *insert;
+       struct bkey_packed *_k;
+       struct bkey unpacked;
+
+       EBUG_ON(iter->level);
+       EBUG_ON(!insert->k.size);
+       EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
+
+       while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b,
+                                                     KEY_TYPE_discard))) {
+               struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked);
+               struct bpos cur_end = bpos_min(insert->k.p, k.k->p);
+               enum bch_extent_overlap overlap =
+                       bch2_extent_overlap(&insert->k, k.k);
+
+               if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
+                       break;
+
+               if (!bkey_whiteout(k.k))
+                       update_journal = true;
+
+               if (!update_journal) {
+                       bch2_cut_front(cur_end, insert);
+                       bch2_cut_front(cur_end, &whiteout);
+                       bch2_btree_iter_set_pos_same_leaf(iter, cur_end);
+                       goto next;
+               }
+
+               /*
+                * When deleting, if possible just do it by switching the type
+                * of the key we're deleting, instead of creating and inserting
+                * a new whiteout:
+                */
+               if (deleting &&
+                   !update_btree &&
+                   !bkey_cmp(insert->k.p, k.k->p) &&
+                   !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) {
+                       if (!bkey_whiteout(k.k)) {
+                               btree_account_key_drop(l->b, _k);
+                               _k->type = KEY_TYPE_discard;
+                               reserve_whiteout(l->b, _k);
+                               bch2_btree_iter_fix_key_modified(iter,
+                                                                l->b, _k);
+                       }
+                       break;
+               }
+
+               if (k.k->needs_whiteout || bkey_written(l->b, _k)) {
+                       insert->k.needs_whiteout = true;
+                       update_btree = true;
+               }
+
+               if (update_btree &&
+                   overlap == BCH_EXTENT_OVERLAP_ALL &&
+                   bkey_whiteout(k.k) &&
+                   k.k->needs_whiteout) {
+                       unreserve_whiteout(l->b, _k);
+                       _k->needs_whiteout = false;
+               }
+
+               extent_squash(c, iter, insert, _k, k, overlap);
+
+               if (!update_btree)
+                       bch2_cut_front(cur_end, insert);
+next:
+               node_iter = l->iter;
+
+               if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
+                   overlap == BCH_EXTENT_OVERLAP_MIDDLE)
+                       break;
+       }
+
+       l->iter = node_iter;
+       bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p);
+
+       if (update_btree) {
+               if (deleting)
+                       insert->k.type = KEY_TYPE_discard;
+
+               EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
+
+               extent_bset_insert(c, iter, insert);
+       }
+
+       if (update_journal) {
+               struct bkey_i *k = !deleting ? insert : &whiteout;
+
+               if (deleting)
+                       k->k.type = KEY_TYPE_discard;
+
+               EBUG_ON(bkey_deleted(&k->k) || !k->k.size);
+
+               bch2_btree_journal_key(trans, iter, k);
+       }
+
+       bch2_cut_front(insert->k.p, insert);
+}
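Note: both bch2_extent_can_insert() and extent_squash() key off bch2_extent_overlap(), which classifies how the incoming extent overlaps an existing one into the four cases handled above (FRONT, BACK, ALL, MIDDLE). A hedged standalone sketch of that classification in terms of plain offsets, assuming the two ranges are already known to overlap; the real helper compares struct bpos values:

enum overlap_class { OVERLAP_ALL, OVERLAP_BACK, OVERLAP_FRONT, OVERLAP_MIDDLE };

/* Illustrative only: insert [ins_start, ins_end) vs existing key [k_start, k_end). */
static enum overlap_class classify_overlap(unsigned long long ins_start, unsigned long long ins_end,
					   unsigned long long k_start, unsigned long long k_end)
{
	int covers_front = ins_start <= k_start;	/* insert reaches back to k's start */
	int covers_back  = ins_end >= k_end;		/* insert reaches forward to k's end */

	if (covers_front && covers_back)
		return OVERLAP_ALL;	/* k is fully covered and gets dropped/whited out */
	if (covers_front)
		return OVERLAP_FRONT;	/* the front of k is cut off */
	if (covers_back)
		return OVERLAP_BACK;	/* the back of k is cut off */
	return OVERLAP_MIDDLE;		/* k is split in three; the middle is replaced */
}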
diff --git a/libbcachefs/extent_update.h b/libbcachefs/extent_update.h
new file mode 100644
index 0000000..89d18e4
--- /dev/null
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EXTENT_UPDATE_H
+#define _BCACHEFS_EXTENT_UPDATE_H
+
+#include "bcachefs.h"
+
+int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *,
+                          struct bpos *);
+int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *);
+int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *);
+
+enum btree_insert_ret
+bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *,
+                      unsigned *);
+void bch2_insert_fixup_extent(struct btree_trans *,
+                             struct btree_insert_entry *);
+
+#endif /* _BCACHEFS_EXTENT_UPDATE_H */
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index b9c69792f81fb914e8563a95bd05f2d3f4b1424c..6bcc178604b0a81a3470b18b2c3742cd890d10b0 100644
@@ -9,12 +9,10 @@
 #include "bcachefs.h"
 #include "bkey_methods.h"
 #include "btree_gc.h"
-#include "btree_update.h"
-#include "btree_update_interior.h"
+#include "btree_iter.h"
 #include "buckets.h"
 #include "checksum.h"
 #include "debug.h"
-#include "dirent.h"
 #include "disk_groups.h"
 #include "error.h"
 #include "extents.h"
 #include "super.h"
 #include "super-io.h"
 #include "util.h"
-#include "xattr.h"
 
 #include <trace/events/bcachefs.h>
 
-unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
-{
-       struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
-       const struct bch_extent_ptr *ptr;
-       unsigned nr_ptrs = 0;
-
-       bkey_for_each_ptr(p, ptr)
-               nr_ptrs++;
-
-       return nr_ptrs;
-}
-
-unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c k)
-{
-       unsigned nr_ptrs = 0;
-
-       switch (k.k->type) {
-       case KEY_TYPE_btree_ptr:
-       case KEY_TYPE_extent:
-       case KEY_TYPE_reflink_v: {
-               struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
-               const struct bch_extent_ptr *ptr;
-
-               bkey_for_each_ptr(p, ptr)
-                       nr_ptrs += !ptr->cached;
-               BUG_ON(!nr_ptrs);
-               break;
-       }
-       case KEY_TYPE_reservation:
-               nr_ptrs = bkey_s_c_to_reservation(k).v->nr_replicas;
-               break;
-       }
-
-       return nr_ptrs;
-}
-
-static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
-                                          struct extent_ptr_decoded p)
-{
-       unsigned durability = 0;
-       struct bch_dev *ca;
-
-       if (p.ptr.cached)
-               return 0;
-
-       ca = bch_dev_bkey_exists(c, p.ptr.dev);
-
-       if (ca->mi.state != BCH_MEMBER_STATE_FAILED)
-               durability = max_t(unsigned, durability, ca->mi.durability);
-
-       if (p.has_ec) {
-               struct stripe *s =
-                       genradix_ptr(&c->stripes[0], p.ec.idx);
-
-               if (WARN_ON(!s))
-                       goto out;
-
-               durability = max_t(unsigned, durability, s->nr_redundant);
-       }
-out:
-       return durability;
-}
-
-unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
-       unsigned durability = 0;
-
-       bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-               durability += bch2_extent_ptr_durability(c, p);
+static unsigned bch2_crc_field_size_max[] = {
+       [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
+       [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
+       [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX,
+};
 
-       return durability;
-}
+static void bch2_extent_crc_pack(union bch_extent_crc *,
+                                struct bch_extent_crc_unpacked,
+                                enum bch_extent_entry_type);
 
 static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f,
                                                   unsigned dev)
@@ -222,172 +153,299 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k,
        return ret;
 }
 
-void bch2_bkey_append_ptr(struct bkey_i *k,
-                         struct bch_extent_ptr ptr)
-{
-       EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev));
-
-       switch (k->k.type) {
-       case KEY_TYPE_btree_ptr:
-       case KEY_TYPE_extent:
-               EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
-
-               ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
-
-               memcpy((void *) &k->v + bkey_val_bytes(&k->k),
-                      &ptr,
-                      sizeof(ptr));
-               k->u64s++;
-               break;
-       default:
-               BUG();
-       }
-}
+/* KEY_TYPE_btree_ptr: */
 
-void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
+const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
 {
-       struct bch_extent_ptr *ptr;
+       if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
+               return "value too big";
 
-       bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
+       return bch2_bkey_ptrs_invalid(c, k);
 }
 
-const struct bch_extent_ptr *
-bch2_bkey_has_device(struct bkey_s_c k, unsigned dev)
+void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const struct bch_extent_ptr *ptr;
+       const char *err;
+       char buf[160];
+       struct bucket_mark mark;
+       struct bch_dev *ca;
 
-       bkey_for_each_ptr(ptrs, ptr)
-               if (ptr->dev == dev)
-                       return ptr;
-
-       return NULL;
-}
-
-bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const struct bch_extent_ptr *ptr;
+       bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
+                      !bch2_bkey_replicas_marked(c, k, false), c,
+                      "btree key bad (replicas not marked in superblock):\n%s",
+                      (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
 
-       bkey_for_each_ptr(ptrs, ptr)
-               if (bch2_dev_in_target(c, ptr->dev, target) &&
-                   (!ptr->cached ||
-                    !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
-                       return true;
+       if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
+               return;
 
-       return false;
-}
+       bkey_for_each_ptr(ptrs, ptr) {
+               ca = bch_dev_bkey_exists(c, ptr->dev);
 
-/* extent specific utility code */
+               mark = ptr_bucket_mark(ca, ptr);
 
-const struct bch_extent_ptr *
-bch2_extent_has_device(struct bkey_s_c_extent e, unsigned dev)
-{
-       const struct bch_extent_ptr *ptr;
+               err = "stale";
+               if (gen_after(mark.gen, ptr->gen))
+                       goto err;
 
-       extent_for_each_ptr(e, ptr)
-               if (ptr->dev == dev)
-                       return ptr;
+               err = "inconsistent";
+               if (mark.data_type != BCH_DATA_BTREE ||
+                   mark.dirty_sectors < c->opts.btree_node_size)
+                       goto err;
+       }
 
-       return NULL;
+       return;
+err:
+       bch2_bkey_val_to_text(&PBUF(buf), c, k);
+       bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x",
+                   err, buf, PTR_BUCKET_NR(ca, ptr),
+                   mark.gen, (unsigned) mark.v.counter);
 }
 
-const struct bch_extent_ptr *
-bch2_extent_has_group(struct bch_fs *c, struct bkey_s_c_extent e, unsigned group)
+void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
+                           struct bkey_s_c k)
 {
-       const struct bch_extent_ptr *ptr;
-
-       extent_for_each_ptr(e, ptr) {
-               struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+       bch2_bkey_ptrs_to_text(out, c, k);
+}
 
-               if (ca->mi.group &&
-                   ca->mi.group - 1 == group)
-                       return ptr;
-       }
+/* KEY_TYPE_extent: */
 
-       return NULL;
+const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+       return bch2_bkey_ptrs_invalid(c, k);
 }
 
-unsigned bch2_extent_is_compressed(struct bkey_s_c k)
+void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k)
 {
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
-       unsigned ret = 0;
+       char buf[160];
 
-       bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-               if (!p.ptr.cached &&
-                   p.crc.compression_type != BCH_COMPRESSION_NONE)
-                       ret += p.crc.compressed_size;
+       /*
+        * XXX: we should be doing most/all of these checks at startup time,
+        * where we check bch2_bkey_invalid() in btree_node_read_done()
+        *
+        * But note that we can't check for stale pointers or incorrect gc marks
+        * until after journal replay is done (it might be an extent that's
+        * going to get overwritten during replay)
+        */
 
-       return ret;
-}
+       if (percpu_down_read_trylock(&c->mark_lock)) {
+               bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
+                              !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c,
+                              "extent key bad (replicas not marked in superblock):\n%s",
+                              (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf));
+               percpu_up_read(&c->mark_lock);
+       }
+       /*
+        * If journal replay hasn't finished, we might be seeing keys
+        * that will be overwritten by the time journal replay is done:
+        */
+       if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+               return;
 
-bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
-                          struct bch_extent_ptr m, u64 offset)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
+       extent_for_each_ptr_decode(e, p, entry) {
+               struct bch_dev *ca      = bch_dev_bkey_exists(c, p.ptr.dev);
+               struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr);
+               unsigned stale          = gen_after(mark.gen, p.ptr.gen);
+               unsigned disk_sectors   = ptr_disk_sectors(p);
+               unsigned mark_sectors   = p.ptr.cached
+                       ? mark.cached_sectors
+                       : mark.dirty_sectors;
 
-       bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
-               if (p.ptr.dev   == m.dev &&
-                   p.ptr.gen   == m.gen &&
-                   (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) ==
-                   (s64) m.offset  - offset)
-                       return true;
+               bch2_fs_bug_on(stale && !p.ptr.cached, c,
+                              "stale dirty pointer (ptr gen %u bucket %u)",
+                              p.ptr.gen, mark.gen);
 
-       return false;
+               bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale);
+
+               bch2_fs_bug_on(!stale &&
+                              (mark.data_type != BCH_DATA_USER ||
+                               mark_sectors < disk_sectors), c,
+                              "extent pointer not marked: %s:\n"
+                              "type %u sectors %u < %u",
+                              (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf),
+                              mark.data_type,
+                              mark_sectors, disk_sectors);
+       }
 }
 
-static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
-                                         union bch_extent_entry *entry)
+void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c,
+                        struct bkey_s_c k)
 {
-       union bch_extent_entry *i = ptrs.start;
-
-       if (i == entry)
-               return NULL;
-
-       while (extent_entry_next(i) != entry)
-               i = extent_entry_next(i);
-       return i;
+       bch2_bkey_ptrs_to_text(out, c, k);
 }
 
-union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
-                                          struct bch_extent_ptr *ptr)
+enum merge_result bch2_extent_merge(struct bch_fs *c,
+                                   struct bkey_s _l, struct bkey_s _r)
 {
-       struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
-       union bch_extent_entry *dst, *src, *prev;
-       bool drop_crc = true;
+       struct bkey_s_extent l = bkey_s_to_extent(_l);
+       struct bkey_s_extent r = bkey_s_to_extent(_r);
+       union bch_extent_entry *en_l = l.v->start;
+       union bch_extent_entry *en_r = r.v->start;
+       struct bch_extent_crc_unpacked crc_l, crc_r;
 
-       EBUG_ON(ptr < &ptrs.start->ptr ||
-               ptr >= &ptrs.end->ptr);
-       EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
+       if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k))
+               return BCH_MERGE_NOMERGE;
 
-       src = extent_entry_next(to_entry(ptr));
-       if (src != ptrs.end &&
-           !extent_entry_is_crc(src))
-               drop_crc = false;
+       crc_l = bch2_extent_crc_unpack(l.k, NULL);
 
-       dst = to_entry(ptr);
-       while ((prev = extent_entry_prev(ptrs, dst))) {
-               if (extent_entry_is_ptr(prev))
+       extent_for_each_entry(l, en_l) {
+               en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data);
+
+               if (extent_entry_type(en_l) != extent_entry_type(en_r))
+                       return BCH_MERGE_NOMERGE;
+
+               switch (extent_entry_type(en_l)) {
+               case BCH_EXTENT_ENTRY_ptr: {
+                       const struct bch_extent_ptr *lp = &en_l->ptr;
+                       const struct bch_extent_ptr *rp = &en_r->ptr;
+                       struct bch_dev *ca;
+
+                       if (lp->offset + crc_l.compressed_size != rp->offset ||
+                           lp->dev                     != rp->dev ||
+                           lp->gen                     != rp->gen)
+                               return BCH_MERGE_NOMERGE;
+
+                       /* We don't allow extents to straddle buckets: */
+                       ca = bch_dev_bkey_exists(c, lp->dev);
+
+                       if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
+                               return BCH_MERGE_NOMERGE;
+
+                       break;
+               }
+               case BCH_EXTENT_ENTRY_stripe_ptr:
+                       if (en_l->stripe_ptr.block      != en_r->stripe_ptr.block ||
+                           en_l->stripe_ptr.idx        != en_r->stripe_ptr.idx)
+                               return BCH_MERGE_NOMERGE;
                        break;
+               case BCH_EXTENT_ENTRY_crc32:
+               case BCH_EXTENT_ENTRY_crc64:
+               case BCH_EXTENT_ENTRY_crc128:
+                       crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
+                       crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
+
+                       if (crc_l.csum_type             != crc_r.csum_type ||
+                           crc_l.compression_type      != crc_r.compression_type ||
+                           crc_l.nonce                 != crc_r.nonce)
+                               return BCH_MERGE_NOMERGE;
+
+                       if (crc_l.offset + crc_l.live_size != crc_l.compressed_size ||
+                           crc_r.offset)
+                               return BCH_MERGE_NOMERGE;
+
+                       if (!bch2_checksum_mergeable(crc_l.csum_type))
+                               return BCH_MERGE_NOMERGE;
+
+                       if (crc_l.compression_type)
+                               return BCH_MERGE_NOMERGE;
+
+                       if (crc_l.csum_type &&
+                           crc_l.uncompressed_size +
+                           crc_r.uncompressed_size > c->sb.encoded_extent_max)
+                               return BCH_MERGE_NOMERGE;
+
+                       if (crc_l.uncompressed_size + crc_r.uncompressed_size - 1 >
+                           bch2_crc_field_size_max[extent_entry_type(en_l)])
+                               return BCH_MERGE_NOMERGE;
 
-               if (extent_entry_is_crc(prev)) {
-                       if (drop_crc)
-                               dst = prev;
                        break;
+               default:
+                       return BCH_MERGE_NOMERGE;
                }
+       }
 
-               dst = prev;
+       extent_for_each_entry(l, en_l) {
+               struct bch_extent_crc_unpacked crc_l, crc_r;
+
+               en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data);
+
+               if (!extent_entry_is_crc(en_l))
+                       continue;
+
+               crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
+               crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
+
+               crc_l.csum = bch2_checksum_merge(crc_l.csum_type,
+                                                crc_l.csum,
+                                                crc_r.csum,
+                                                crc_r.uncompressed_size << 9);
+
+               crc_l.uncompressed_size += crc_r.uncompressed_size;
+               crc_l.compressed_size   += crc_r.compressed_size;
+
+               bch2_extent_crc_pack(entry_to_crc(en_l), crc_l,
+                                    extent_entry_type(en_l));
        }
 
-       memmove_u64s_down(dst, src,
-                         (u64 *) ptrs.end - (u64 *) src);
-       k.k->u64s -= (u64 *) src - (u64 *) dst;
+       bch2_key_resize(l.k, l.k->size + r.k->size);
 
-       return dst;
+       return BCH_MERGE_MERGE;
+}
+
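The straddle check in bch2_extent_merge() above comes down to whether the two pointers' start offsets land in the same bucket. A minimal standalone sketch of that comparison, assuming buckets are fixed-size runs of sectors; bucket_nr() and same_bucket() are hypothetical stand-ins for PTR_BUCKET_NR(), not bcachefs code:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for PTR_BUCKET_NR(): bucket index of a sector offset,
 * assuming a fixed bucket size in sectors. */
static unsigned long bucket_nr(unsigned long offset, unsigned bucket_size)
{
        return offset / bucket_size;
}

static bool same_bucket(unsigned long l_offset, unsigned long r_offset,
                        unsigned bucket_size)
{
        return bucket_nr(l_offset, bucket_size) ==
               bucket_nr(r_offset, bucket_size);
}

int main(void)
{
        unsigned bucket_size = 1024;    /* sectors per bucket, illustrative */

        /* left pointer at 1016 runs up to 1024; the right pointer starts in
         * the next bucket, so the merge would be refused */
        printf("%d\n", same_bucket(1016, 1024, bucket_size));  /* 0 */

        /* both pointers inside bucket 0: this check allows the merge */
        printf("%d\n", same_bucket(512, 520, bucket_size));    /* 1 */
        return 0;
}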
+/* KEY_TYPE_reservation: */
+
+const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+
+       if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation))
+               return "incorrect value size";
+
+       if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX)
+               return "invalid nr_replicas";
+
+       return NULL;
+}
+
+void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
+                             struct bkey_s_c k)
+{
+       struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+
+       pr_buf(out, "generation %u replicas %u",
+              le32_to_cpu(r.v->generation),
+              r.v->nr_replicas);
+}
+
+enum merge_result bch2_reservation_merge(struct bch_fs *c,
+                                        struct bkey_s _l, struct bkey_s _r)
+{
+       struct bkey_s_reservation l = bkey_s_to_reservation(_l);
+       struct bkey_s_reservation r = bkey_s_to_reservation(_r);
+
+       if (l.v->generation != r.v->generation ||
+           l.v->nr_replicas != r.v->nr_replicas)
+               return BCH_MERGE_NOMERGE;
+
+       if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) {
+               bch2_key_resize(l.k, KEY_SIZE_MAX);
+               bch2_cut_front_s(l.k->p, r.s);
+               return BCH_MERGE_PARTIAL;
+       }
+
+       bch2_key_resize(l.k, l.k->size + r.k->size);
+
+       return BCH_MERGE_MERGE;
+}
+
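bch2_reservation_merge() above can only grow the left key up to KEY_SIZE_MAX; whatever does not fit stays in the right key, which is why it can return BCH_MERGE_PARTIAL. A minimal sketch of just that size arithmetic, with a hypothetical merge_sizes() helper and an illustrative SIZE_CAP standing in for the real KEY_SIZE_MAX:

#include <stdint.h>
#include <stdio.h>

#define SIZE_CAP ((1U << 16) - 1)       /* illustrative cap only */

/* Returns the merged left size and writes the leftover right size
 * (roughly what bch2_cut_front_s() would leave behind). */
static unsigned merge_sizes(unsigned l_size, unsigned r_size, unsigned *r_left)
{
        uint64_t total = (uint64_t) l_size + r_size;

        if (total > SIZE_CAP) {
                *r_left = total - SIZE_CAP;     /* right key keeps the tail */
                return SIZE_CAP;                /* left key grows to the cap */
        }

        *r_left = 0;                            /* full merge */
        return total;
}

int main(void)
{
        unsigned r_left;
        unsigned l = merge_sizes(60000, 10000, &r_left);

        printf("left %u right %u\n", l, r_left);        /* left 65535 right 4465 */
        return 0;
}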
+/* Extent checksum entries: */
+
+/* returns true if not equal */
+static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
+                                        struct bch_extent_crc_unpacked r)
+{
+       return (l.csum_type             != r.csum_type ||
+               l.compression_type      != r.compression_type ||
+               l.compressed_size       != r.compressed_size ||
+               l.uncompressed_size     != r.uncompressed_size ||
+               l.offset                != r.offset ||
+               l.live_size             != r.live_size ||
+               l.nonce                 != r.nonce ||
+               bch2_crc_cmp(l.csum, r.csum));
 }
 
 static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u,
@@ -466,987 +524,237 @@ restart_narrow_pointers:
        return ret;
 }
 
-/* returns true if not equal */
-static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l,
-                                        struct bch_extent_crc_unpacked r)
+static void bch2_extent_crc_pack(union bch_extent_crc *dst,
+                                struct bch_extent_crc_unpacked src,
+                                enum bch_extent_entry_type type)
 {
-       return (l.csum_type             != r.csum_type ||
-               l.compression_type      != r.compression_type ||
-               l.compressed_size       != r.compressed_size ||
-               l.uncompressed_size     != r.uncompressed_size ||
-               l.offset                != r.offset ||
-               l.live_size             != r.live_size ||
-               l.nonce                 != r.nonce ||
-               bch2_crc_cmp(l.csum, r.csum));
+#define set_common_fields(_dst, _src)                                  \
+               _dst.type               = 1 << type;                    \
+               _dst.csum_type          = _src.csum_type,               \
+               _dst.compression_type   = _src.compression_type,        \
+               _dst._compressed_size   = _src.compressed_size - 1,     \
+               _dst._uncompressed_size = _src.uncompressed_size - 1,   \
+               _dst.offset             = _src.offset
+
+       switch (type) {
+       case BCH_EXTENT_ENTRY_crc32:
+               set_common_fields(dst->crc32, src);
+               dst->crc32.csum  = *((__le32 *) &src.csum.lo);
+               break;
+       case BCH_EXTENT_ENTRY_crc64:
+               set_common_fields(dst->crc64, src);
+               dst->crc64.nonce        = src.nonce;
+               dst->crc64.csum_lo      = src.csum.lo;
+               dst->crc64.csum_hi      = *((__le16 *) &src.csum.hi);
+               break;
+       case BCH_EXTENT_ENTRY_crc128:
+               set_common_fields(dst->crc128, src);
+               dst->crc128.nonce       = src.nonce;
+               dst->crc128.csum        = src.csum;
+               break;
+       default:
+               BUG();
+       }
+#undef set_common_fields
 }
 
-void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
+void bch2_extent_crc_append(struct bkey_i *k,
+                           struct bch_extent_crc_unpacked new)
 {
-       union bch_extent_entry *entry;
-       u64 *d = (u64 *) bkeyp_val(f, k);
-       unsigned i;
+       struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
+       union bch_extent_crc *crc = (void *) ptrs.end;
+       enum bch_extent_entry_type type;
 
-       for (i = 0; i < bkeyp_val_u64s(f, k); i++)
-               d[i] = swab64(d[i]);
+       if (bch_crc_bytes[new.csum_type]        <= 4 &&
+           new.uncompressed_size - 1           <= CRC32_SIZE_MAX &&
+           new.nonce                           <= CRC32_NONCE_MAX)
+               type = BCH_EXTENT_ENTRY_crc32;
+       else if (bch_crc_bytes[new.csum_type]   <= 10 &&
+                  new.uncompressed_size - 1    <= CRC64_SIZE_MAX &&
+                  new.nonce                    <= CRC64_NONCE_MAX)
+               type = BCH_EXTENT_ENTRY_crc64;
+       else if (bch_crc_bytes[new.csum_type]   <= 16 &&
+                  new.uncompressed_size - 1    <= CRC128_SIZE_MAX &&
+                  new.nonce                    <= CRC128_NONCE_MAX)
+               type = BCH_EXTENT_ENTRY_crc128;
+       else
+               BUG();
 
-       for (entry = (union bch_extent_entry *) d;
-            entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k));
-            entry = extent_entry_next(entry)) {
-               switch (extent_entry_type(entry)) {
-               case BCH_EXTENT_ENTRY_ptr:
-                       break;
-               case BCH_EXTENT_ENTRY_crc32:
-                       entry->crc32.csum = swab32(entry->crc32.csum);
-                       break;
-               case BCH_EXTENT_ENTRY_crc64:
-                       entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
-                       entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
-                       break;
-               case BCH_EXTENT_ENTRY_crc128:
-                       entry->crc128.csum.hi = (__force __le64)
-                               swab64((__force u64) entry->crc128.csum.hi);
-                       entry->crc128.csum.lo = (__force __le64)
-                               swab64((__force u64) entry->crc128.csum.lo);
-                       break;
-               case BCH_EXTENT_ENTRY_stripe_ptr:
-                       break;
-               }
-       }
+       bch2_extent_crc_pack(crc, new, type);
+
+       k->k.u64s += extent_entry_u64s(ptrs.end);
+
+       EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
 }
 
-void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
-                           struct bkey_s_c k)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-       struct bch_extent_crc_unpacked crc;
-       const struct bch_extent_ptr *ptr;
-       const struct bch_extent_stripe_ptr *ec;
-       struct bch_dev *ca;
-       bool first = true;
-
-       bkey_extent_entry_for_each(ptrs, entry) {
-               if (!first)
-                       pr_buf(out, " ");
-
-               switch (__extent_entry_type(entry)) {
-               case BCH_EXTENT_ENTRY_ptr:
-                       ptr = entry_to_ptr(entry);
-                       ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
-                               ? bch_dev_bkey_exists(c, ptr->dev)
-                               : NULL;
-
-                       pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev,
-                              (u64) ptr->offset, ptr->gen,
-                              ptr->cached ? " cached" : "",
-                              ca && ptr_stale(ca, ptr)
-                              ? " stale" : "");
-                       break;
-               case BCH_EXTENT_ENTRY_crc32:
-               case BCH_EXTENT_ENTRY_crc64:
-               case BCH_EXTENT_ENTRY_crc128:
-                       crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
-
-                       pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u",
-                              crc.compressed_size,
-                              crc.uncompressed_size,
-                              crc.offset, crc.nonce,
-                              crc.csum_type,
-                              crc.compression_type);
-                       break;
-               case BCH_EXTENT_ENTRY_stripe_ptr:
-                       ec = &entry->stripe_ptr;
-
-                       pr_buf(out, "ec: idx %llu block %u",
-                              (u64) ec->idx, ec->block);
-                       break;
-               default:
-                       pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
-                       return;
-               }
-
-               first = false;
-       }
-}
-
-static const char *extent_ptr_invalid(const struct bch_fs *c,
-                                     struct bkey_s_c k,
-                                     const struct bch_extent_ptr *ptr,
-                                     unsigned size_ondisk,
-                                     bool metadata)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const struct bch_extent_ptr *ptr2;
-       struct bch_dev *ca;
-
-       if (!bch2_dev_exists2(c, ptr->dev))
-               return "pointer to invalid device";
-
-       ca = bch_dev_bkey_exists(c, ptr->dev);
-       if (!ca)
-               return "pointer to invalid device";
-
-       bkey_for_each_ptr(ptrs, ptr2)
-               if (ptr != ptr2 && ptr->dev == ptr2->dev)
-                       return "multiple pointers to same device";
-
-       if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets))
-               return "offset past end of device";
-
-       if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket))
-               return "offset before first bucket";
-
-       if (bucket_remainder(ca, ptr->offset) +
-           size_ondisk > ca->mi.bucket_size)
-               return "spans multiple buckets";
-
-       return NULL;
-}
-
-const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-       struct bch_extent_crc_unpacked crc;
-       unsigned size_ondisk = k.k->size;
-       const char *reason;
-       unsigned nonce = UINT_MAX;
-
-       if (k.k->type == KEY_TYPE_btree_ptr)
-               size_ondisk = c->opts.btree_node_size;
-
-       bkey_extent_entry_for_each(ptrs, entry) {
-               if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
-                       return "invalid extent entry type";
-
-               if (k.k->type == KEY_TYPE_btree_ptr &&
-                   !extent_entry_is_ptr(entry))
-                       return "has non ptr field";
-
-               switch (extent_entry_type(entry)) {
-               case BCH_EXTENT_ENTRY_ptr:
-                       reason = extent_ptr_invalid(c, k, &entry->ptr,
-                                                   size_ondisk, false);
-                       if (reason)
-                               return reason;
-                       break;
-               case BCH_EXTENT_ENTRY_crc32:
-               case BCH_EXTENT_ENTRY_crc64:
-               case BCH_EXTENT_ENTRY_crc128:
-                       crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
-
-                       if (crc.offset + crc.live_size >
-                           crc.uncompressed_size)
-                               return "checksum offset + key size > uncompressed size";
-
-                       size_ondisk = crc.compressed_size;
-
-                       if (!bch2_checksum_type_valid(c, crc.csum_type))
-                               return "invalid checksum type";
-
-                       if (crc.compression_type >= BCH_COMPRESSION_NR)
-                               return "invalid compression type";
-
-                       if (bch2_csum_type_is_encryption(crc.csum_type)) {
-                               if (nonce == UINT_MAX)
-                                       nonce = crc.offset + crc.nonce;
-                               else if (nonce != crc.offset + crc.nonce)
-                                       return "incorrect nonce";
-                       }
-                       break;
-               case BCH_EXTENT_ENTRY_stripe_ptr:
-                       break;
-               }
-       }
-
-       return NULL;
-}
-
-/* Btree ptrs */
-
-const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k)
-{
-       if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX)
-               return "value too big";
-
-       return bch2_bkey_ptrs_invalid(c, k);
-}
-
-void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const struct bch_extent_ptr *ptr;
-       const char *err;
-       char buf[160];
-       struct bucket_mark mark;
-       struct bch_dev *ca;
-
-       bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
-                      !bch2_bkey_replicas_marked(c, k, false), c,
-                      "btree key bad (replicas not marked in superblock):\n%s",
-                      (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf));
-
-       if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
-               return;
-
-       bkey_for_each_ptr(ptrs, ptr) {
-               ca = bch_dev_bkey_exists(c, ptr->dev);
-
-               mark = ptr_bucket_mark(ca, ptr);
-
-               err = "stale";
-               if (gen_after(mark.gen, ptr->gen))
-                       goto err;
-
-               err = "inconsistent";
-               if (mark.data_type != BCH_DATA_BTREE ||
-                   mark.dirty_sectors < c->opts.btree_node_size)
-                       goto err;
-       }
-
-       return;
-err:
-       bch2_bkey_val_to_text(&PBUF(buf), c, k);
-       bch2_fs_bug(c, "%s btree pointer %s: bucket %zi gen %i mark %08x",
-                   err, buf, PTR_BUCKET_NR(ca, ptr),
-                   mark.gen, (unsigned) mark.v.counter);
-}
-
-void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c,
-                           struct bkey_s_c k)
-{
-       bch2_bkey_ptrs_to_text(out, c, k);
-}
-
-/* Extents */
-
-void __bch2_cut_front(struct bpos where, struct bkey_s k)
-{
-       u64 sub;
-
-       if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0)
-               return;
-
-       EBUG_ON(bkey_cmp(where, k.k->p) > 0);
-
-       sub = where.offset - bkey_start_offset(k.k);
-
-       k.k->size -= sub;
-
-       if (!k.k->size)
-               k.k->type = KEY_TYPE_deleted;
-
-       switch (k.k->type) {
-       case KEY_TYPE_deleted:
-       case KEY_TYPE_discard:
-       case KEY_TYPE_error:
-       case KEY_TYPE_cookie:
-               break;
-       case KEY_TYPE_extent:
-       case KEY_TYPE_reflink_v: {
-               struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
-               union bch_extent_entry *entry;
-               bool seen_crc = false;
-
-               bkey_extent_entry_for_each(ptrs, entry) {
-                       switch (extent_entry_type(entry)) {
-                       case BCH_EXTENT_ENTRY_ptr:
-                               if (!seen_crc)
-                                       entry->ptr.offset += sub;
-                               break;
-                       case BCH_EXTENT_ENTRY_crc32:
-                               entry->crc32.offset += sub;
-                               break;
-                       case BCH_EXTENT_ENTRY_crc64:
-                               entry->crc64.offset += sub;
-                               break;
-                       case BCH_EXTENT_ENTRY_crc128:
-                               entry->crc128.offset += sub;
-                               break;
-                       case BCH_EXTENT_ENTRY_stripe_ptr:
-                               break;
-                       }
-
-                       if (extent_entry_is_crc(entry))
-                               seen_crc = true;
-               }
-
-               break;
-       }
-       case KEY_TYPE_reflink_p: {
-               struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);
-
-               le64_add_cpu(&p.v->idx, sub);
-               break;
-       }
-       case KEY_TYPE_reservation:
-               break;
-       default:
-               BUG();
-       }
-}
-
-bool bch2_cut_back(struct bpos where, struct bkey *k)
-{
-       u64 len = 0;
-
-       if (bkey_cmp(where, k->p) >= 0)
-               return false;
-
-       EBUG_ON(bkey_cmp(where, bkey_start_pos(k)) < 0);
-
-       len = where.offset - bkey_start_offset(k);
-
-       k->p = where;
-       k->size = len;
-
-       if (!len)
-               k->type = KEY_TYPE_deleted;
-
-       return true;
-}
-
-static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k)
-{
-       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-       const union bch_extent_entry *entry;
-       unsigned ret = 0;
-
-       bkey_extent_entry_for_each(ptrs, entry) {
-               switch (__extent_entry_type(entry)) {
-               case BCH_EXTENT_ENTRY_ptr:
-               case BCH_EXTENT_ENTRY_stripe_ptr:
-                       ret++;
-               }
-       }
-
-       return ret;
-}
-
-static int count_iters_for_insert(struct btree_trans *trans,
-                                 struct bkey_s_c k,
-                                 unsigned offset,
-                                 struct bpos *end,
-                                 unsigned *nr_iters,
-                                 unsigned max_iters,
-                                 bool overwrite)
-{
-       int ret = 0;
-
-       switch (k.k->type) {
-       case KEY_TYPE_extent:
-       case KEY_TYPE_reflink_v:
-               *nr_iters += bch2_bkey_nr_alloc_ptrs(k);
-
-               if (*nr_iters >= max_iters) {
-                       *end = bpos_min(*end, k.k->p);
-                       ret = 1;
-               }
-
-               break;
-       case KEY_TYPE_reflink_p: {
-               struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
-               u64 idx = le64_to_cpu(p.v->idx);
-               unsigned sectors = bpos_min(*end, p.k->p).offset -
-                       bkey_start_offset(p.k);
-               struct btree_iter *iter;
-               struct bkey_s_c r_k;
-
-               for_each_btree_key(trans, iter,
-                                  BTREE_ID_REFLINK, POS(0, idx + offset),
-                                  BTREE_ITER_SLOTS, r_k, ret) {
-                       if (bkey_cmp(bkey_start_pos(r_k.k),
-                                    POS(0, idx + sectors)) >= 0)
-                               break;
-
-                       *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k);
-
-                       if (*nr_iters >= max_iters) {
-                               struct bpos pos = bkey_start_pos(k.k);
-                               pos.offset += r_k.k->p.offset - idx;
-
-                               *end = bpos_min(*end, pos);
-                               ret = 1;
-                               break;
-                       }
-               }
-
-               bch2_trans_iter_put(trans, iter);
-               break;
-       }
-       }
-
-       return ret;
-}
-
-#define EXTENT_ITERS_MAX       (BTREE_ITER_MAX / 3)
-
-int bch2_extent_atomic_end(struct btree_iter *iter,
-                          struct bkey_i *insert,
-                          struct bpos *end)
-{
-       struct btree_trans *trans = iter->trans;
-       struct btree *b;
-       struct btree_node_iter  node_iter;
-       struct bkey_packed      *_k;
-       unsigned                nr_iters = 0;
-       int ret;
-
-       ret = bch2_btree_iter_traverse(iter);
-       if (ret)
-               return ret;
+/* Generic code for keys with pointers: */
 
-       b = iter->l[0].b;
-       node_iter = iter->l[0].iter;
-
-       BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0);
-
-       *end = bpos_min(insert->k.p, b->key.k.p);
-
-       ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end,
-                                    &nr_iters, EXTENT_ITERS_MAX / 2, false);
-       if (ret < 0)
-               return ret;
-
-       while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
-                                                     KEY_TYPE_discard))) {
-               struct bkey     unpacked;
-               struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked);
-               unsigned offset = 0;
-
-               if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0)
-                       break;
-
-               if (bkey_cmp(bkey_start_pos(&insert->k),
-                            bkey_start_pos(k.k)) > 0)
-                       offset = bkey_start_offset(&insert->k) -
-                               bkey_start_offset(k.k);
-
-               ret = count_iters_for_insert(trans, k, offset, end,
-                                       &nr_iters, EXTENT_ITERS_MAX, true);
-               if (ret)
-                       break;
-
-               bch2_btree_node_iter_advance(&node_iter, b);
-       }
-
-       return ret < 0 ? ret : 0;
-}
-
-int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter)
-{
-       struct bpos end;
-       int ret;
-
-       ret = bch2_extent_atomic_end(iter, k, &end);
-       if (ret)
-               return ret;
-
-       bch2_cut_back(end, &k->k);
-       return 0;
-}
-
-int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter)
+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k)
 {
-       struct bpos end;
-       int ret;
-
-       ret = bch2_extent_atomic_end(iter, k, &end);
-       if (ret)
-               return ret;
-
-       return !bkey_cmp(end, k->k.p);
+       return bch2_bkey_devs(k).nr;
 }
 
-enum btree_insert_ret
-bch2_extent_can_insert(struct btree_trans *trans,
-                      struct btree_insert_entry *insert,
-                      unsigned *u64s)
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
 {
-       struct btree_iter_level *l = &insert->iter->l[0];
-       struct btree_node_iter node_iter = l->iter;
-       enum bch_extent_overlap overlap;
-       struct bkey_packed *_k;
-       struct bkey unpacked;
-       struct bkey_s_c k;
-       int sectors;
-
-       /*
-        * We avoid creating whiteouts whenever possible when deleting, but
-        * those optimizations mean we may potentially insert two whiteouts
-        * instead of one (when we overlap with the front of one extent and the
-        * back of another):
-        */
-       if (bkey_whiteout(&insert->k->k))
-               *u64s += BKEY_U64s;
-
-       _k = bch2_btree_node_iter_peek_filter(&node_iter, l->b,
-                                             KEY_TYPE_discard);
-       if (!_k)
-               return BTREE_INSERT_OK;
-
-       k = bkey_disassemble(l->b, _k, &unpacked);
-
-       overlap = bch2_extent_overlap(&insert->k->k, k.k);
-
-       /* account for having to split existing extent: */
-       if (overlap == BCH_EXTENT_OVERLAP_MIDDLE)
-               *u64s += _k->u64s;
-
-       if (overlap == BCH_EXTENT_OVERLAP_MIDDLE &&
-           (sectors = bch2_extent_is_compressed(k))) {
-               int flags = trans->flags & BTREE_INSERT_NOFAIL
-                       ? BCH_DISK_RESERVATION_NOFAIL : 0;
-
-               switch (bch2_disk_reservation_add(trans->c,
-                               trans->disk_res,
-                               sectors, flags)) {
-               case 0:
-                       break;
-               case -ENOSPC:
-                       return BTREE_INSERT_ENOSPC;
-               default:
-                       BUG();
-               }
-       }
-
-       return BTREE_INSERT_OK;
+       return k.k->type == KEY_TYPE_reservation
+               ? bkey_s_c_to_reservation(k).v->nr_replicas
+               : bch2_bkey_dirty_devs(k).nr;
 }
 
-static void verify_extent_nonoverlapping(struct bch_fs *c,
-                                        struct btree *b,
-                                        struct btree_node_iter *_iter,
-                                        struct bkey_i *insert)
+unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k)
 {
-#ifdef CONFIG_BCACHEFS_DEBUG
-       struct btree_node_iter iter;
-       struct bkey_packed *k;
-       struct bkey uk;
+       unsigned ret = 0;
 
-       if (!expensive_debug_checks(c))
-               return;
+       if (k.k->type == KEY_TYPE_reservation) {
+               ret = bkey_s_c_to_reservation(k).v->nr_replicas;
+       } else {
+               struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+               const union bch_extent_entry *entry;
+               struct extent_ptr_decoded p;
 
-       iter = *_iter;
-       k = bch2_btree_node_iter_prev_filter(&iter, b, KEY_TYPE_discard);
-       BUG_ON(k &&
-              (uk = bkey_unpack_key(b, k),
-               bkey_cmp(uk.p, bkey_start_pos(&insert->k)) > 0));
-
-       iter = *_iter;
-       k = bch2_btree_node_iter_peek_filter(&iter, b, KEY_TYPE_discard);
-#if 0
-       BUG_ON(k &&
-              (uk = bkey_unpack_key(b, k),
-               bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0);
-#else
-       if (k &&
-           (uk = bkey_unpack_key(b, k),
-            bkey_cmp(insert->k.p, bkey_start_pos(&uk))) > 0) {
-               char buf1[100];
-               char buf2[100];
-
-               bch2_bkey_to_text(&PBUF(buf1), &insert->k);
-               bch2_bkey_to_text(&PBUF(buf2), &uk);
-
-               bch2_dump_btree_node(b);
-               panic("insert > next :\n"
-                     "insert %s\n"
-                     "next   %s\n",
-                     buf1, buf2);
+               bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+                       ret += !p.ptr.cached &&
+                               p.crc.compression_type == BCH_COMPRESSION_NONE;
        }
-#endif
 
-#endif
+       return ret;
 }
 
-static void extent_bset_insert(struct bch_fs *c, struct btree_iter *iter,
-                              struct bkey_i *insert)
+unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k)
 {
-       struct btree_iter_level *l = &iter->l[0];
-       struct bkey_packed *k =
-               bch2_btree_node_iter_bset_pos(&l->iter, l->b, bset_tree_last(l->b));
-
-       BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, l->b));
-
-       EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
-       verify_extent_nonoverlapping(c, l->b, &l->iter, insert);
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const union bch_extent_entry *entry;
+       struct extent_ptr_decoded p;
+       unsigned ret = 0;
 
-       if (debug_check_bkeys(c))
-               bch2_bkey_debugcheck(c, l->b, bkey_i_to_s_c(insert));
+       bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+               if (!p.ptr.cached &&
+                   p.crc.compression_type != BCH_COMPRESSION_NONE)
+                       ret += p.crc.compressed_size;
 
-       bch2_bset_insert(l->b, &l->iter, k, insert, 0);
-       bch2_btree_node_iter_fix(iter, l->b, &l->iter, k, 0, k->u64s);
+       return ret;
 }
 
-static void
-extent_squash(struct bch_fs *c, struct btree_iter *iter,
-             struct bkey_i *insert,
-             struct bkey_packed *_k, struct bkey_s k,
-             enum bch_extent_overlap overlap)
+bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
+                               unsigned nr_replicas)
 {
-       struct btree_iter_level *l = &iter->l[0];
-
-       switch (overlap) {
-       case BCH_EXTENT_OVERLAP_FRONT:
-               /* insert overlaps with start of k: */
-               __bch2_cut_front(insert->k.p, k);
-               EBUG_ON(bkey_deleted(k.k));
-               extent_save(l->b, _k, k.k);
-               bch2_btree_iter_fix_key_modified(iter, l->b, _k);
-               break;
-
-       case BCH_EXTENT_OVERLAP_BACK:
-               /* insert overlaps with end of k: */
-               bch2_cut_back(bkey_start_pos(&insert->k), k.k);
-               EBUG_ON(bkey_deleted(k.k));
-               extent_save(l->b, _k, k.k);
-
-               /*
-                * As the auxiliary tree is indexed by the end of the
-                * key and we've just changed the end, update the
-                * auxiliary tree.
-                */
-               bch2_bset_fix_invalidated_key(l->b, _k);
-               bch2_btree_node_iter_fix(iter, l->b, &l->iter,
-                                        _k, _k->u64s, _k->u64s);
-               break;
-
-       case BCH_EXTENT_OVERLAP_ALL: {
-               /* The insert key completely covers k, invalidate k */
-               if (!bkey_whiteout(k.k))
-                       btree_account_key_drop(l->b, _k);
-
-               k.k->size = 0;
-               k.k->type = KEY_TYPE_deleted;
-
-               if (_k >= btree_bset_last(l->b)->start) {
-                       unsigned u64s = _k->u64s;
-
-                       bch2_bset_delete(l->b, _k, _k->u64s);
-                       bch2_btree_node_iter_fix(iter, l->b, &l->iter,
-                                                _k, u64s, 0);
-               } else {
-                       extent_save(l->b, _k, k.k);
-                       bch2_btree_iter_fix_key_modified(iter, l->b, _k);
-               }
-
-               break;
-       }
-       case BCH_EXTENT_OVERLAP_MIDDLE: {
-               BKEY_PADDED(k) split;
-               /*
-                * The insert key falls 'in the middle' of k
-                * The insert key splits k in 3:
-                * - start only in k, preserve
-                * - middle common section, invalidate in k
-                * - end only in k, preserve
-                *
-                * We update the old key to preserve the start,
-                * insert will be the new common section,
-                * we manually insert the end that we are preserving.
-                *
-                * modify k _before_ doing the insert (which will move
-                * what k points to)
-                */
-               bkey_reassemble(&split.k, k.s_c);
-               split.k.k.needs_whiteout |= bkey_written(l->b, _k);
-
-               bch2_cut_back(bkey_start_pos(&insert->k), &split.k.k);
-               BUG_ON(bkey_deleted(&split.k.k));
+       struct btree_trans trans;
+       struct btree_iter *iter;
+       struct bpos end = pos;
+       struct bkey_s_c k;
+       bool ret = true;
+       int err;
 
-               __bch2_cut_front(insert->k.p, k);
-               BUG_ON(bkey_deleted(k.k));
-               extent_save(l->b, _k, k.k);
-               bch2_btree_iter_fix_key_modified(iter, l->b, _k);
+       end.offset += size;
 
-               extent_bset_insert(c, iter, &split.k);
-               break;
-       }
-       }
-}
+       bch2_trans_init(&trans, c, 0, 0);
 
-/**
- * bch_extent_insert_fixup - insert a new extent and deal with overlaps
- *
- * this may result in not actually doing the insert, or inserting some subset
- * of the insert key. For cmpxchg operations this is where that logic lives.
- *
- * All subsets of @insert that need to be inserted are inserted using
- * bch2_btree_insert_and_journal(). If @b or @res fills up, this function
- * returns false, setting @iter->pos for the prefix of @insert that actually got
- * inserted.
- *
- * BSET INVARIANTS: this function is responsible for maintaining all the
- * invariants for bsets of extents in memory. things get really hairy with 0
- * size extents
- *
- * within one bset:
- *
- * bkey_start_pos(bkey_next(k)) >= k
- * or bkey_start_offset(bkey_next(k)) >= k->offset
- *
- * i.e. strict ordering, no overlapping extents.
- *
- * multiple bsets (i.e. full btree node):
- *
- * ∀ k, j
- *   k.size != 0 ∧ j.size != 0 →
- *     ¬ (k > bkey_start_pos(j) ∧ k < j)
- *
- * i.e. no two overlapping keys _of nonzero size_
- *
- * We can't realistically maintain this invariant for zero size keys because of
- * the key merging done in bch2_btree_insert_key() - for two mergeable keys k, j
- * there may be another 0 size key between them in another bset, and it will
- * thus overlap with the merged key.
- *
- * In addition, the end of iter->pos indicates how much has been processed.
- * If the end of iter->pos is not the same as the end of insert, then
- * key insertion needs to continue/be retried.
- */
-void bch2_insert_fixup_extent(struct btree_trans *trans,
-                             struct btree_insert_entry *insert_entry)
-{
-       struct bch_fs *c = trans->c;
-       struct btree_iter *iter = insert_entry->iter;
-       struct bkey_i *insert   = insert_entry->k;
-       struct btree_iter_level *l = &iter->l[0];
-       struct btree_node_iter node_iter = l->iter;
-       bool deleting           = bkey_whiteout(&insert->k);
-       bool update_journal     = !deleting;
-       bool update_btree       = !deleting;
-       struct bkey_i whiteout  = *insert;
-       struct bkey_packed *_k;
-       struct bkey unpacked;
-
-       EBUG_ON(iter->level);
-       EBUG_ON(!insert->k.size);
-       EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&insert->k)));
-
-       while ((_k = bch2_btree_node_iter_peek_filter(&l->iter, l->b,
-                                                     KEY_TYPE_discard))) {
-               struct bkey_s k = __bkey_disassemble(l->b, _k, &unpacked);
-               struct bpos cur_end = bpos_min(insert->k.p, k.k->p);
-               enum bch_extent_overlap overlap =
-                       bch2_extent_overlap(&insert->k, k.k);
-
-               if (bkey_cmp(bkey_start_pos(k.k), insert->k.p) >= 0)
+       for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos,
+                          BTREE_ITER_SLOTS, k, err) {
+               if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
                        break;
 
-               if (!bkey_whiteout(k.k))
-                       update_journal = true;
-
-               if (!update_journal) {
-                       bch2_cut_front(cur_end, insert);
-                       bch2_cut_front(cur_end, &whiteout);
-                       bch2_btree_iter_set_pos_same_leaf(iter, cur_end);
-                       goto next;
-               }
-
-               /*
-                * When deleting, if possible just do it by switching the type
-                * of the key we're deleting, instead of creating and inserting
-                * a new whiteout:
-                */
-               if (deleting &&
-                   !update_btree &&
-                   !bkey_cmp(insert->k.p, k.k->p) &&
-                   !bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) {
-                       if (!bkey_whiteout(k.k)) {
-                               btree_account_key_drop(l->b, _k);
-                               _k->type = KEY_TYPE_discard;
-                               reserve_whiteout(l->b, _k);
-                               bch2_btree_iter_fix_key_modified(iter,
-                                                                l->b, _k);
-                       }
+               if (nr_replicas > bch2_bkey_nr_ptrs_fully_allocated(k)) {
+                       ret = false;
                        break;
                }
-
-               if (k.k->needs_whiteout || bkey_written(l->b, _k)) {
-                       insert->k.needs_whiteout = true;
-                       update_btree = true;
-               }
-
-               if (update_btree &&
-                   overlap == BCH_EXTENT_OVERLAP_ALL &&
-                   bkey_whiteout(k.k) &&
-                   k.k->needs_whiteout) {
-                       unreserve_whiteout(l->b, _k);
-                       _k->needs_whiteout = false;
-               }
-
-               extent_squash(c, iter, insert, _k, k, overlap);
-
-               if (!update_btree)
-                       bch2_cut_front(cur_end, insert);
-next:
-               node_iter = l->iter;
-
-               if (overlap == BCH_EXTENT_OVERLAP_FRONT ||
-                   overlap == BCH_EXTENT_OVERLAP_MIDDLE)
-                       break;
        }
+       bch2_trans_exit(&trans);
 
-       l->iter = node_iter;
-       bch2_btree_iter_set_pos_same_leaf(iter, insert->k.p);
+       return ret;
+}
 
-       if (update_btree) {
-               if (deleting)
-                       insert->k.type = KEY_TYPE_discard;
+static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
+                                          struct extent_ptr_decoded p)
+{
+       unsigned durability = 0;
+       struct bch_dev *ca;
 
-               EBUG_ON(bkey_deleted(&insert->k) || !insert->k.size);
+       if (p.ptr.cached)
+               return 0;
 
-               extent_bset_insert(c, iter, insert);
-       }
+       ca = bch_dev_bkey_exists(c, p.ptr.dev);
 
-       if (update_journal) {
-               struct bkey_i *k = !deleting ? insert : &whiteout;
+       if (ca->mi.state != BCH_MEMBER_STATE_FAILED)
+               durability = max_t(unsigned, durability, ca->mi.durability);
 
-               if (deleting)
-                       k->k.type = KEY_TYPE_discard;
+       if (p.has_ec) {
+               struct stripe *s =
+                       genradix_ptr(&c->stripes[0], p.ec.idx);
 
-               EBUG_ON(bkey_deleted(&k->k) || !k->k.size);
+               if (WARN_ON(!s))
+                       goto out;
 
-               bch2_btree_journal_key(trans, iter, k);
+               durability = max_t(unsigned, durability, s->nr_redundant);
        }
-
-       bch2_cut_front(insert->k.p, insert);
-}
-
-const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k)
-{
-       return bch2_bkey_ptrs_invalid(c, k);
+out:
+       return durability;
 }
 
-void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k)
+unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k)
 {
-       struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
        const union bch_extent_entry *entry;
        struct extent_ptr_decoded p;
-       char buf[160];
-
-       /*
-        * XXX: we should be doing most/all of these checks at startup time,
-        * where we check bch2_bkey_invalid() in btree_node_read_done()
-        *
-        * But note that we can't check for stale pointers or incorrect gc marks
-        * until after journal replay is done (it might be an extent that's
-        * going to get overwritten during replay)
-        */
-
-       if (percpu_down_read_trylock(&c->mark_lock)) {
-               bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
-                              !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c,
-                              "extent key bad (replicas not marked in superblock):\n%s",
-                              (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf));
-               percpu_up_read(&c->mark_lock);
-       }
-       /*
-        * If journal replay hasn't finished, we might be seeing keys
-        * that will be overwritten by the time journal replay is done:
-        */
-       if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
-               return;
-
-       extent_for_each_ptr_decode(e, p, entry) {
-               struct bch_dev *ca      = bch_dev_bkey_exists(c, p.ptr.dev);
-               struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr);
-               unsigned stale          = gen_after(mark.gen, p.ptr.gen);
-               unsigned disk_sectors   = ptr_disk_sectors(p);
-               unsigned mark_sectors   = p.ptr.cached
-                       ? mark.cached_sectors
-                       : mark.dirty_sectors;
-
-               bch2_fs_bug_on(stale && !p.ptr.cached, c,
-                              "stale dirty pointer (ptr gen %u bucket %u",
-                              p.ptr.gen, mark.gen);
+       unsigned durability = 0;
 
-               bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale);
+       bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+               durability += bch2_extent_ptr_durability(c, p);
 
-               bch2_fs_bug_on(!stale &&
-                              (mark.data_type != BCH_DATA_USER ||
-                               mark_sectors < disk_sectors), c,
-                              "extent pointer not marked: %s:\n"
-                              "type %u sectors %u < %u",
-                              (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf),
-                              mark.data_type,
-                              mark_sectors, disk_sectors);
-       }
+       return durability;
 }
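Cached pointers contribute nothing above; otherwise a pointer's durability is the device's durability (zero if the device is failed), raised to the stripe's redundancy when the pointer is erasure coded, and the key's durability is the sum over its pointers. A small self-contained sketch of that accumulation, using a hypothetical ptr_view struct in place of the decoded pointer and the device/stripe state:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical flattened view of one decoded pointer: these fields stand in
 * for p.ptr.cached, ca->mi.state/durability, p.has_ec and s->nr_redundant. */
struct ptr_view {
        bool            cached;
        bool            failed_dev;
        unsigned        dev_durability;
        bool            has_ec;
        unsigned        ec_redundant;
};

static unsigned ptr_durability(struct ptr_view p)
{
        unsigned d = 0;

        if (p.cached)
                return 0;               /* cached copies add no durability */

        if (!p.failed_dev)
                d = p.dev_durability;

        if (p.has_ec && p.ec_redundant > d)
                d = p.ec_redundant;     /* erasure coding can only raise it */

        return d;
}

int main(void)
{
        struct ptr_view ptrs[] = {
                { .dev_durability = 1 },                        /* plain replica */
                { .dev_durability = 1, .has_ec = true,
                  .ec_redundant = 2 },                          /* ec-backed replica */
                { .cached = true, .dev_durability = 1 },        /* cached, ignored */
        };
        unsigned i, total = 0;

        for (i = 0; i < sizeof(ptrs) / sizeof(ptrs[0]); i++)
                total += ptr_durability(ptrs[i]);

        printf("durability %u\n", total);       /* 1 + 2 + 0 = 3 */
        return 0;
}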
 
-void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c,
-                        struct bkey_s_c k)
+void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k,
+                                   unsigned target,
+                                   unsigned nr_desired_replicas)
 {
-       bch2_bkey_ptrs_to_text(out, c, k);
-}
-
-static unsigned bch2_crc_field_size_max[] = {
-       [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX,
-       [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX,
-       [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX,
-};
+       struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+       union bch_extent_entry *entry;
+       struct extent_ptr_decoded p;
+       int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas;
 
-static void bch2_extent_crc_pack(union bch_extent_crc *dst,
-                                struct bch_extent_crc_unpacked src,
-                                enum bch_extent_entry_type type)
-{
-#define set_common_fields(_dst, _src)                                  \
-               _dst.type               = 1 << type;                    \
-               _dst.csum_type          = _src.csum_type,               \
-               _dst.compression_type   = _src.compression_type,        \
-               _dst._compressed_size   = _src.compressed_size - 1,     \
-               _dst._uncompressed_size = _src.uncompressed_size - 1,   \
-               _dst.offset             = _src.offset
+       if (target && extra > 0)
+               bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+                       int n = bch2_extent_ptr_durability(c, p);
 
-       switch (type) {
-       case BCH_EXTENT_ENTRY_crc32:
-               set_common_fields(dst->crc32, src);
-               dst->crc32.csum  = *((__le32 *) &src.csum.lo);
-               break;
-       case BCH_EXTENT_ENTRY_crc64:
-               set_common_fields(dst->crc64, src);
-               dst->crc64.nonce        = src.nonce;
-               dst->crc64.csum_lo      = src.csum.lo;
-               dst->crc64.csum_hi      = *((__le16 *) &src.csum.hi);
-               break;
-       case BCH_EXTENT_ENTRY_crc128:
-               set_common_fields(dst->crc128, src);
-               dst->crc128.nonce       = src.nonce;
-               dst->crc128.csum        = src.csum;
-               break;
-       default:
-               BUG();
-       }
-#undef set_common_fields
+                       if (n && n <= extra &&
+                           !bch2_dev_in_target(c, p.ptr.dev, target)) {
+                               entry->ptr.cached = true;
+                               extra -= n;
+                       }
+               }
+
+       if (extra > 0)
+               bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+                       int n = bch2_extent_ptr_durability(c, p);
+
+                       if (n && n <= extra) {
+                               entry->ptr.cached = true;
+                               extra -= n;
+                       }
+               }
 }
 
-void bch2_extent_crc_append(struct bkey_i *k,
-                           struct bch_extent_crc_unpacked new)
+void bch2_bkey_append_ptr(struct bkey_i *k,
+                         struct bch_extent_ptr ptr)
 {
-       struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k));
-       union bch_extent_crc *crc = (void *) ptrs.end;
-       enum bch_extent_entry_type type;
-
-       if (bch_crc_bytes[new.csum_type]        <= 4 &&
-           new.uncompressed_size - 1           <= CRC32_SIZE_MAX &&
-           new.nonce                           <= CRC32_NONCE_MAX)
-               type = BCH_EXTENT_ENTRY_crc32;
-       else if (bch_crc_bytes[new.csum_type]   <= 10 &&
-                  new.uncompressed_size - 1    <= CRC64_SIZE_MAX &&
-                  new.nonce                    <= CRC64_NONCE_MAX)
-               type = BCH_EXTENT_ENTRY_crc64;
-       else if (bch_crc_bytes[new.csum_type]   <= 16 &&
-                  new.uncompressed_size - 1    <= CRC128_SIZE_MAX &&
-                  new.nonce                    <= CRC128_NONCE_MAX)
-               type = BCH_EXTENT_ENTRY_crc128;
-       else
-               BUG();
+       EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev));
 
-       bch2_extent_crc_pack(crc, new, type);
+       switch (k->k.type) {
+       case KEY_TYPE_btree_ptr:
+       case KEY_TYPE_extent:
+               EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX);
 
-       k->k.u64s += extent_entry_u64s(ptrs.end);
+               ptr.type = 1 << BCH_EXTENT_ENTRY_ptr;
 
-       EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX);
+               memcpy((void *) &k->v + bkey_val_bytes(&k->k),
+                      &ptr,
+                      sizeof(ptr));
+               k->u64s++;
+               break;
+       default:
+               BUG();
+       }
 }
 
 static inline void __extent_entry_insert(struct bkey_i *k,
@@ -1492,6 +800,107 @@ found:
        }
 }
 
+static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
+                                         union bch_extent_entry *entry)
+{
+       union bch_extent_entry *i = ptrs.start;
+
+       if (i == entry)
+               return NULL;
+
+       while (extent_entry_next(i) != entry)
+               i = extent_entry_next(i);
+       return i;
+}
+
+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
+                                          struct bch_extent_ptr *ptr)
+{
+       struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+       union bch_extent_entry *dst, *src, *prev;
+       bool drop_crc = true;
+
+       EBUG_ON(ptr < &ptrs.start->ptr ||
+               ptr >= &ptrs.end->ptr);
+       EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
+
+       src = extent_entry_next(to_entry(ptr));
+       if (src != ptrs.end &&
+           !extent_entry_is_crc(src))
+               drop_crc = false;
+
+       dst = to_entry(ptr);
+       while ((prev = extent_entry_prev(ptrs, dst))) {
+               if (extent_entry_is_ptr(prev))
+                       break;
+
+               if (extent_entry_is_crc(prev)) {
+                       if (drop_crc)
+                               dst = prev;
+                       break;
+               }
+
+               dst = prev;
+       }
+
+       memmove_u64s_down(dst, src,
+                         (u64 *) ptrs.end - (u64 *) src);
+       k.k->u64s -= (u64 *) src - (u64 *) dst;
+
+       return dst;
+}
+
+void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
+{
+       struct bch_extent_ptr *ptr;
+
+       bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev);
+}
+
+const struct bch_extent_ptr *
+bch2_bkey_has_device(struct bkey_s_c k, unsigned dev)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const struct bch_extent_ptr *ptr;
+
+       bkey_for_each_ptr(ptrs, ptr)
+               if (ptr->dev == dev)
+                       return ptr;
+
+       return NULL;
+}
+
+bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const struct bch_extent_ptr *ptr;
+
+       bkey_for_each_ptr(ptrs, ptr)
+               if (bch2_dev_in_target(c, ptr->dev, target) &&
+                   (!ptr->cached ||
+                    !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)))
+                       return true;
+
+       return false;
+}
+
+bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k,
+                          struct bch_extent_ptr m, u64 offset)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const union bch_extent_entry *entry;
+       struct extent_ptr_decoded p;
+
+       bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
+               if (p.ptr.dev   == m.dev &&
+                   p.ptr.gen   == m.gen &&
+                   (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) ==
+                   (s64) m.offset  - offset)
+                       return true;
+
+       return false;
+}
+
 /*
  * bch_extent_normalize - clean up an extent, dropping stale pointers etc.
  *
@@ -1509,245 +918,307 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
                ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr));
 
        /* will only happen if all pointers were cached: */
-       if (!bkey_val_u64s(k.k))
+       if (!bch2_bkey_nr_ptrs(k.s_c))
                k.k->type = KEY_TYPE_discard;
 
        return bkey_whiteout(k.k);
 }
 
-void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k,
-                                   unsigned target,
-                                   unsigned nr_desired_replicas)
+void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+                           struct bkey_s_c k)
 {
-       struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
-       union bch_extent_entry *entry;
-       struct extent_ptr_decoded p;
-       int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas;
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const union bch_extent_entry *entry;
+       struct bch_extent_crc_unpacked crc;
+       const struct bch_extent_ptr *ptr;
+       const struct bch_extent_stripe_ptr *ec;
+       struct bch_dev *ca;
+       bool first = true;
 
-       if (target && extra > 0)
-               bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-                       int n = bch2_extent_ptr_durability(c, p);
+       bkey_extent_entry_for_each(ptrs, entry) {
+               if (!first)
+                       pr_buf(out, " ");
 
-                       if (n && n <= extra &&
-                           !bch2_dev_in_target(c, p.ptr.dev, target)) {
-                               entry->ptr.cached = true;
-                               extra -= n;
-                       }
-               }
+               switch (__extent_entry_type(entry)) {
+               case BCH_EXTENT_ENTRY_ptr:
+                       ptr = entry_to_ptr(entry);
+                       ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+                               ? bch_dev_bkey_exists(c, ptr->dev)
+                               : NULL;
 
-       if (extra > 0)
-               bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
-                       int n = bch2_extent_ptr_durability(c, p);
+                       pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev,
+                              (u64) ptr->offset, ptr->gen,
+                              ptr->cached ? " cached" : "",
+                              ca && ptr_stale(ca, ptr)
+                              ? " stale" : "");
+                       break;
+               case BCH_EXTENT_ENTRY_crc32:
+               case BCH_EXTENT_ENTRY_crc64:
+               case BCH_EXTENT_ENTRY_crc128:
+                       crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
 
-                       if (n && n <= extra) {
-                               entry->ptr.cached = true;
-                               extra -= n;
-                       }
+                       pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u",
+                              crc.compressed_size,
+                              crc.uncompressed_size,
+                              crc.offset, crc.nonce,
+                              crc.csum_type,
+                              crc.compression_type);
+                       break;
+               case BCH_EXTENT_ENTRY_stripe_ptr:
+                       ec = &entry->stripe_ptr;
+
+                       pr_buf(out, "ec: idx %llu block %u",
+                              (u64) ec->idx, ec->block);
+                       break;
+               default:
+                       pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry));
+                       return;
                }
+
+               first = false;
+       }
 }
 
-enum merge_result bch2_extent_merge(struct bch_fs *c,
-                                   struct bkey_s _l, struct bkey_s _r)
+static const char *extent_ptr_invalid(const struct bch_fs *c,
+                                     struct bkey_s_c k,
+                                     const struct bch_extent_ptr *ptr,
+                                     unsigned size_ondisk,
+                                     bool metadata)
 {
-       struct bkey_s_extent l = bkey_s_to_extent(_l);
-       struct bkey_s_extent r = bkey_s_to_extent(_r);
-       union bch_extent_entry *en_l = l.v->start;
-       union bch_extent_entry *en_r = r.v->start;
-       struct bch_extent_crc_unpacked crc_l, crc_r;
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const struct bch_extent_ptr *ptr2;
+       struct bch_dev *ca;
 
-       if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k))
-               return BCH_MERGE_NOMERGE;
+       if (!bch2_dev_exists2(c, ptr->dev))
+               return "pointer to invalid device";
 
-       crc_l = bch2_extent_crc_unpack(l.k, NULL);
+       ca = bch_dev_bkey_exists(c, ptr->dev);
+       if (!ca)
+               return "pointer to invalid device";
+
+       bkey_for_each_ptr(ptrs, ptr2)
+               if (ptr != ptr2 && ptr->dev == ptr2->dev)
+                       return "multiple pointers to same device";
+
+       if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets))
+               return "offset past end of device";
+
+       if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket))
+               return "offset before first bucket";
 
-       extent_for_each_entry(l, en_l) {
-               en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data);
+       if (bucket_remainder(ca, ptr->offset) +
+           size_ondisk > ca->mi.bucket_size)
+               return "spans multiple buckets";
 
-               if (extent_entry_type(en_l) != extent_entry_type(en_r))
-                       return BCH_MERGE_NOMERGE;
+       return NULL;
+}
 
-               switch (extent_entry_type(en_l)) {
-               case BCH_EXTENT_ENTRY_ptr: {
-                       const struct bch_extent_ptr *lp = &en_l->ptr;
-                       const struct bch_extent_ptr *rp = &en_r->ptr;
-                       struct bch_dev *ca;
+const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const union bch_extent_entry *entry;
+       struct bch_extent_crc_unpacked crc;
+       unsigned size_ondisk = k.k->size;
+       const char *reason;
+       unsigned nonce = UINT_MAX;
 
-                       if (lp->offset + crc_l.compressed_size != rp->offset ||
-                           lp->dev                     != rp->dev ||
-                           lp->gen                     != rp->gen)
-                               return BCH_MERGE_NOMERGE;
+       if (k.k->type == KEY_TYPE_btree_ptr)
+               size_ondisk = c->opts.btree_node_size;
 
-                       /* We don't allow extents to straddle buckets: */
-                       ca = bch_dev_bkey_exists(c, lp->dev);
+       bkey_extent_entry_for_each(ptrs, entry) {
+               if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
+                       return "invalid extent entry type";
 
-                       if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp))
-                               return BCH_MERGE_NOMERGE;
+               if (k.k->type == KEY_TYPE_btree_ptr &&
+                   !extent_entry_is_ptr(entry))
+                       return "has non ptr field";
 
-                       break;
-               }
-               case BCH_EXTENT_ENTRY_stripe_ptr:
-                       if (en_l->stripe_ptr.block      != en_r->stripe_ptr.block ||
-                           en_l->stripe_ptr.idx        != en_r->stripe_ptr.idx)
-                               return BCH_MERGE_NOMERGE;
+               switch (extent_entry_type(entry)) {
+               case BCH_EXTENT_ENTRY_ptr:
+                       reason = extent_ptr_invalid(c, k, &entry->ptr,
+                                                   size_ondisk, false);
+                       if (reason)
+                               return reason;
                        break;
                case BCH_EXTENT_ENTRY_crc32:
                case BCH_EXTENT_ENTRY_crc64:
                case BCH_EXTENT_ENTRY_crc128:
-                       crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
-                       crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
-
-                       if (crc_l.csum_type             != crc_r.csum_type ||
-                           crc_l.compression_type      != crc_r.compression_type ||
-                           crc_l.nonce                 != crc_r.nonce)
-                               return BCH_MERGE_NOMERGE;
-
-                       if (crc_l.offset + crc_l.live_size != crc_l.compressed_size ||
-                           crc_r.offset)
-                               return BCH_MERGE_NOMERGE;
+                       crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
 
-                       if (!bch2_checksum_mergeable(crc_l.csum_type))
-                               return BCH_MERGE_NOMERGE;
+                       if (crc.offset + crc.live_size >
+                           crc.uncompressed_size)
+                               return "checksum offset + key size > uncompressed size";
 
-                       if (crc_l.compression_type)
-                               return BCH_MERGE_NOMERGE;
+                       size_ondisk = crc.compressed_size;
 
-                       if (crc_l.csum_type &&
-                           crc_l.uncompressed_size +
-                           crc_r.uncompressed_size > c->sb.encoded_extent_max)
-                               return BCH_MERGE_NOMERGE;
+                       if (!bch2_checksum_type_valid(c, crc.csum_type))
+                               return "invalid checksum type";
 
-                       if (crc_l.uncompressed_size + crc_r.uncompressed_size - 1 >
-                           bch2_crc_field_size_max[extent_entry_type(en_l)])
-                               return BCH_MERGE_NOMERGE;
+                       if (crc.compression_type >= BCH_COMPRESSION_NR)
+                               return "invalid compression type";
 
+                       if (bch2_csum_type_is_encryption(crc.csum_type)) {
+                               if (nonce == UINT_MAX)
+                                       nonce = crc.offset + crc.nonce;
+                               else if (nonce != crc.offset + crc.nonce)
+                                       return "incorrect nonce";
+                       }
+                       break;
+               case BCH_EXTENT_ENTRY_stripe_ptr:
                        break;
-               default:
-                       return BCH_MERGE_NOMERGE;
                }
        }
 
-       extent_for_each_entry(l, en_l) {
-               struct bch_extent_crc_unpacked crc_l, crc_r;
-
-               en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data);
-
-               if (!extent_entry_is_crc(en_l))
-                       continue;
-
-               crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l));
-               crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r));
+       return NULL;
+}
 
-               crc_l.csum = bch2_checksum_merge(crc_l.csum_type,
-                                                crc_l.csum,
-                                                crc_r.csum,
-                                                crc_r.uncompressed_size << 9);
+void bch2_ptr_swab(const struct bkey_format *f, struct bkey_packed *k)
+{
+       union bch_extent_entry *entry;
+       u64 *d = (u64 *) bkeyp_val(f, k);
+       unsigned i;
 
-               crc_l.uncompressed_size += crc_r.uncompressed_size;
-               crc_l.compressed_size   += crc_r.compressed_size;
+       for (i = 0; i < bkeyp_val_u64s(f, k); i++)
+               d[i] = swab64(d[i]);
 
-               bch2_extent_crc_pack(entry_to_crc(en_l), crc_l,
-                                    extent_entry_type(en_l));
+       for (entry = (union bch_extent_entry *) d;
+            entry < (union bch_extent_entry *) (d + bkeyp_val_u64s(f, k));
+            entry = extent_entry_next(entry)) {
+               switch (extent_entry_type(entry)) {
+               case BCH_EXTENT_ENTRY_ptr:
+                       break;
+               case BCH_EXTENT_ENTRY_crc32:
+                       entry->crc32.csum = swab32(entry->crc32.csum);
+                       break;
+               case BCH_EXTENT_ENTRY_crc64:
+                       entry->crc64.csum_hi = swab16(entry->crc64.csum_hi);
+                       entry->crc64.csum_lo = swab64(entry->crc64.csum_lo);
+                       break;
+               case BCH_EXTENT_ENTRY_crc128:
+                       entry->crc128.csum.hi = (__force __le64)
+                               swab64((__force u64) entry->crc128.csum.hi);
+                       entry->crc128.csum.lo = (__force __le64)
+                               swab64((__force u64) entry->crc128.csum.lo);
+                       break;
+               case BCH_EXTENT_ENTRY_stripe_ptr:
+                       break;
+               }
        }
-
-       bch2_key_resize(l.k, l.k->size + r.k->size);
-
-       return BCH_MERGE_MERGE;
 }
 
-bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
-                              unsigned nr_replicas)
+/* Generic extent code: */
+
+int bch2_cut_front_s(struct bpos where, struct bkey_s k)
 {
-       struct btree_trans trans;
-       struct btree_iter *iter;
-       struct bpos end = pos;
-       struct bkey_s_c k;
-       bool ret = true;
-       int err;
+       unsigned new_val_u64s = bkey_val_u64s(k.k);
+       int val_u64s_delta;
+       u64 sub;
 
-       end.offset += size;
+       if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0)
+               return 0;
 
-       bch2_trans_init(&trans, c, 0, 0);
+       EBUG_ON(bkey_cmp(where, k.k->p) > 0);
 
-       for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos,
-                          BTREE_ITER_SLOTS, k, err) {
-               if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
-                       break;
+       sub = where.offset - bkey_start_offset(k.k);
 
-               if (nr_replicas > bch2_bkey_nr_ptrs_allocated(k)) {
-                       ret = false;
-                       break;
-               }
+       k.k->size -= sub;
+
+       if (!k.k->size) {
+               k.k->type = KEY_TYPE_deleted;
+               new_val_u64s = 0;
        }
-       bch2_trans_exit(&trans);
 
-       return ret;
-}
+       switch (k.k->type) {
+       case KEY_TYPE_extent:
+       case KEY_TYPE_reflink_v: {
+               struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
+               union bch_extent_entry *entry;
+               bool seen_crc = false;
 
-unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
-{
-       unsigned ret = 0;
+               bkey_extent_entry_for_each(ptrs, entry) {
+                       switch (extent_entry_type(entry)) {
+                       case BCH_EXTENT_ENTRY_ptr:
+                               if (!seen_crc)
+                                       entry->ptr.offset += sub;
+                               break;
+                       case BCH_EXTENT_ENTRY_crc32:
+                               entry->crc32.offset += sub;
+                               break;
+                       case BCH_EXTENT_ENTRY_crc64:
+                               entry->crc64.offset += sub;
+                               break;
+                       case BCH_EXTENT_ENTRY_crc128:
+                               entry->crc128.offset += sub;
+                               break;
+                       case BCH_EXTENT_ENTRY_stripe_ptr:
+                               break;
+                       }
 
-       switch (k.k->type) {
-       case KEY_TYPE_extent: {
-               struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-               const union bch_extent_entry *entry;
-               struct extent_ptr_decoded p;
+                       if (extent_entry_is_crc(entry))
+                               seen_crc = true;
+               }
 
-               extent_for_each_ptr_decode(e, p, entry)
-                       ret += !p.ptr.cached &&
-                               p.crc.compression_type == BCH_COMPRESSION_NONE;
                break;
        }
-       case KEY_TYPE_reservation:
-               ret = bkey_s_c_to_reservation(k).v->nr_replicas;
+       case KEY_TYPE_reflink_p: {
+               struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k);
+
+               le64_add_cpu(&p.v->idx, sub);
                break;
        }
+       case KEY_TYPE_inline_data: {
+               struct bkey_s_inline_data d = bkey_s_to_inline_data(k);
 
-       return ret;
-}
-
-/* KEY_TYPE_reservation: */
+               sub = min_t(u64, sub << 9, bkey_val_bytes(d.k));
 
-const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k)
-{
-       struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+               memmove(d.v->data,
+                       d.v->data + sub,
+                       bkey_val_bytes(d.k) - sub);
 
-       if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation))
-               return "incorrect value size";
+               new_val_u64s -= sub >> 3;
+               break;
+       }
+       }
 
-       if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX)
-               return "invalid nr_replicas";
+       val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
+       BUG_ON(val_u64s_delta < 0);
 
-       return NULL;
+       set_bkey_val_u64s(k.k, new_val_u64s);
+       memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
+       return -val_u64s_delta;
 }
 
-void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c,
-                             struct bkey_s_c k)
+int bch2_cut_back_s(struct bpos where, struct bkey_s k)
 {
-       struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k);
+       unsigned new_val_u64s = bkey_val_u64s(k.k);
+       int val_u64s_delta;
+       u64 len = 0;
 
-       pr_buf(out, "generation %u replicas %u",
-              le32_to_cpu(r.v->generation),
-              r.v->nr_replicas);
-}
+       if (bkey_cmp(where, k.k->p) >= 0)
+               return 0;
 
-enum merge_result bch2_reservation_merge(struct bch_fs *c,
-                                        struct bkey_s _l, struct bkey_s _r)
-{
-       struct bkey_s_reservation l = bkey_s_to_reservation(_l);
-       struct bkey_s_reservation r = bkey_s_to_reservation(_r);
+       EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0);
 
-       if (l.v->generation != r.v->generation ||
-           l.v->nr_replicas != r.v->nr_replicas)
-               return BCH_MERGE_NOMERGE;
+       len = where.offset - bkey_start_offset(k.k);
 
-       if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) {
-               bch2_key_resize(l.k, KEY_SIZE_MAX);
-               __bch2_cut_front(l.k->p, r.s);
-               return BCH_MERGE_PARTIAL;
+       k.k->p = where;
+       k.k->size = len;
+
+       if (!len) {
+               k.k->type = KEY_TYPE_deleted;
+               new_val_u64s = 0;
        }
 
-       bch2_key_resize(l.k, l.k->size + r.k->size);
+       switch (k.k->type) {
+       case KEY_TYPE_inline_data:
+               new_val_u64s = min(new_val_u64s, k.k->size << 6);
+               break;
+       }
 
-       return BCH_MERGE_MERGE;
+       val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s;
+       BUG_ON(val_u64s_delta < 0);
+
+       set_bkey_val_u64s(k.k, new_val_u64s);
+       memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64));
+       return -val_u64s_delta;
 }
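
Illustrative sketch, not part of the commit: the new generic bch2_cut_front_s() above trims the front of a key and, for KEY_TYPE_inline_data, slides the remaining payload down, shrinks the value by whole u64s, and zeroes the freed tail. The toy_inline_val type and helpers below are stand-ins (the real function works on struct bkey_s and takes the cut in sectors, converted with "<< 9"); they only model that payload handling.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_inline_val {
	uint64_t val_u64s;	/* value size, in 64-bit words */
	uint8_t  data[64];	/* inline payload */
};

/* Cut 'sub' bytes off the front; returns the change in val_u64s (<= 0). */
static int toy_cut_front(struct toy_inline_val *v, uint64_t sub)
{
	uint64_t old_u64s = v->val_u64s;

	if (sub > v->val_u64s * 8)
		sub = v->val_u64s * 8;			/* clamp to the payload we have */

	memmove(v->data, v->data + sub, v->val_u64s * 8 - sub);
	v->val_u64s -= sub >> 3;			/* drop whole u64s only */
	memset(v->data + v->val_u64s * 8, 0,
	       (old_u64s - v->val_u64s) * 8);		/* keep the freed tail zeroed */

	return -(int)(old_u64s - v->val_u64s);
}

int main(void)
{
	struct toy_inline_val v = { .val_u64s = 4 };
	int delta;

	memcpy(v.data, "0123456789abcdefghijklmnopqrstuv", 32);
	delta = toy_cut_front(&v, 16);
	printf("delta %d, remaining \"%.*s\"\n",
	       delta, (int)(v.val_u64s * 8), v.data);
	/* prints: delta -2, remaining "ghijklmnopqrstuv" */
	return 0;
}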
index cc7ee9067b50a7ee69bc65f7d053d1f9b60a614d..1140d01a42ab34c675e90471f78697f08434062a 100644 (file)
@@ -40,6 +40,9 @@ struct btree_insert_entry;
                (union bch_extent_entry *) (_entry));                   \
 })
 
+#define extent_entry_next(_entry)                                      \
+       ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
+
 static inline unsigned
 __extent_entry_type(const union bch_extent_entry *e)
 {
@@ -185,10 +188,52 @@ struct bkey_ptrs {
        union bch_extent_entry  *end;
 };
 
-/* iterate over bkey ptrs */
+static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
+{
+       switch (k.k->type) {
+       case KEY_TYPE_btree_ptr: {
+               struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k);
+               return (struct bkey_ptrs_c) {
+                       to_entry(&e.v->start[0]),
+                       to_entry(extent_entry_last(e))
+               };
+       }
+       case KEY_TYPE_extent: {
+               struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+               return (struct bkey_ptrs_c) {
+                       e.v->start,
+                       extent_entry_last(e)
+               };
+       }
+       case KEY_TYPE_stripe: {
+               struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+               return (struct bkey_ptrs_c) {
+                       to_entry(&s.v->ptrs[0]),
+                       to_entry(&s.v->ptrs[s.v->nr_blocks]),
+               };
+       }
+       case KEY_TYPE_reflink_v: {
+               struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
 
-#define extent_entry_next(_entry)                                      \
-       ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
+               return (struct bkey_ptrs_c) {
+                       r.v->start,
+                       bkey_val_end(r),
+               };
+       }
+       default:
+               return (struct bkey_ptrs_c) { NULL, NULL };
+       }
+}
+
+static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
+{
+       struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c);
+
+       return (struct bkey_ptrs) {
+               (void *) p.start,
+               (void *) p.end
+       };
+}
 
 #define __bkey_extent_entry_for_each_from(_start, _end, _entry)                \
        for ((_entry) = (_start);                                       \
@@ -281,96 +326,26 @@ out:                                                                      \
 #define bkey_for_each_crc(_k, _p, _crc, _iter)                         \
        __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter)
 
-/* utility code common to all keys with pointers: */
+/* Iterate over pointers in KEY_TYPE_extent: */
 
-static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k)
-{
-       switch (k.k->type) {
-       case KEY_TYPE_btree_ptr: {
-               struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k);
-               return (struct bkey_ptrs_c) {
-                       to_entry(&e.v->start[0]),
-                       to_entry(extent_entry_last(e))
-               };
-       }
-       case KEY_TYPE_extent: {
-               struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
-               return (struct bkey_ptrs_c) {
-                       e.v->start,
-                       extent_entry_last(e)
-               };
-       }
-       case KEY_TYPE_stripe: {
-               struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-               return (struct bkey_ptrs_c) {
-                       to_entry(&s.v->ptrs[0]),
-                       to_entry(&s.v->ptrs[s.v->nr_blocks]),
-               };
-       }
-       case KEY_TYPE_reflink_v: {
-               struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k);
-
-               return (struct bkey_ptrs_c) {
-                       r.v->start,
-                       bkey_val_end(r),
-               };
-       }
-       default:
-               return (struct bkey_ptrs_c) { NULL, NULL };
-       }
-}
-
-static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
-{
-       struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c);
-
-       return (struct bkey_ptrs) {
-               (void *) p.start,
-               (void *) p.end
-       };
-}
-
-static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
-{
-       struct bch_devs_list ret = (struct bch_devs_list) { 0 };
-       struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
-       const struct bch_extent_ptr *ptr;
-
-       bkey_for_each_ptr(p, ptr)
-               ret.devs[ret.nr++] = ptr->dev;
-
-       return ret;
-}
-
-static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
-{
-       struct bch_devs_list ret = (struct bch_devs_list) { 0 };
-       struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
-       const struct bch_extent_ptr *ptr;
-
-       bkey_for_each_ptr(p, ptr)
-               if (!ptr->cached)
-                       ret.devs[ret.nr++] = ptr->dev;
+#define extent_for_each_entry_from(_e, _entry, _start)                 \
+       __bkey_extent_entry_for_each_from(_start,                       \
+                               extent_entry_last(_e),_entry)
 
-       return ret;
-}
+#define extent_for_each_entry(_e, _entry)                              \
+       extent_for_each_entry_from(_e, _entry, (_e).v->start)
 
-static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
-{
-       struct bch_devs_list ret = (struct bch_devs_list) { 0 };
-       struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
-       const struct bch_extent_ptr *ptr;
+#define extent_ptr_next(_e, _ptr)                                      \
+       __bkey_ptr_next(_ptr, extent_entry_last(_e))
 
-       bkey_for_each_ptr(p, ptr)
-               if (ptr->cached)
-                       ret.devs[ret.nr++] = ptr->dev;
+#define extent_for_each_ptr(_e, _ptr)                                  \
+       __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr)
 
-       return ret;
-}
+#define extent_for_each_ptr_decode(_e, _ptr, _entry)                   \
+       __bkey_for_each_ptr_decode((_e).k, (_e).v->start,               \
+                                  extent_entry_last(_e), _ptr, _entry)
 
-unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
-unsigned bch2_bkey_nr_dirty_ptrs(struct bkey_s_c);
-unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
+/* utility code common to all keys with pointers: */
 
 void bch2_mark_io_failure(struct bch_io_failures *,
                          struct extent_ptr_decoded *);
@@ -378,22 +353,12 @@ int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c,
                               struct bch_io_failures *,
                               struct extent_ptr_decoded *);
 
-void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
-void bch2_bkey_drop_device(struct bkey_s, unsigned);
-const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned);
-bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
-
-void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
-                           struct bkey_s_c);
-const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c);
-
-/* bch_btree_ptr: */
+/* KEY_TYPE_btree_ptr: */
 
 const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c);
 void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c);
 void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *,
                            struct bkey_s_c);
-void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *);
 
 #define bch2_bkey_ops_btree_ptr (struct bkey_ops) {            \
        .key_invalid    = bch2_btree_ptr_invalid,               \
@@ -402,12 +367,11 @@ void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *);
        .swab           = bch2_ptr_swab,                        \
 }
 
-/* bch_extent: */
+/* KEY_TYPE_extent: */
 
 const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c);
 void bch2_extent_debugcheck(struct bch_fs *, struct bkey_s_c);
 void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
 enum merge_result bch2_extent_merge(struct bch_fs *,
                                    struct bkey_s, struct bkey_s);
 
@@ -420,7 +384,7 @@ enum merge_result bch2_extent_merge(struct bch_fs *,
        .key_merge      = bch2_extent_merge,                    \
 }
 
-/* bch_reservation: */
+/* KEY_TYPE_reservation: */
 
 const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c);
 void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
@@ -433,27 +397,15 @@ enum merge_result bch2_reservation_merge(struct bch_fs *,
        .key_merge      = bch2_reservation_merge,               \
 }
 
-int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *,
-                          struct bpos *);
-int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *);
-int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *);
-
-enum btree_insert_ret
-bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *,
-                      unsigned *);
-void bch2_insert_fixup_extent(struct btree_trans *,
-                             struct btree_insert_entry *);
-
-void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s,
-                                   unsigned, unsigned);
-
-const struct bch_extent_ptr *
-bch2_extent_has_device(struct bkey_s_c_extent, unsigned);
+/* Extent checksum entries: */
 
-unsigned bch2_extent_is_compressed(struct bkey_s_c);
+bool bch2_can_narrow_extent_crcs(struct bkey_s_c,
+                                struct bch_extent_crc_unpacked);
+bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked);
+void bch2_extent_crc_append(struct bkey_i *,
+                           struct bch_extent_crc_unpacked);
 
-bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
-                          struct bch_extent_ptr, u64);
+/* Generic code for keys with pointers: */
 
 static inline bool bkey_extent_is_direct_data(const struct bkey *k)
 {
@@ -470,6 +422,7 @@ static inline bool bkey_extent_is_direct_data(const struct bkey *k)
 static inline bool bkey_extent_is_data(const struct bkey *k)
 {
        return bkey_extent_is_direct_data(k) ||
+               k->type == KEY_TYPE_inline_data ||
                k->type == KEY_TYPE_reflink_p;
 }
 
@@ -483,40 +436,64 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k)
        case KEY_TYPE_reservation:
        case KEY_TYPE_reflink_p:
        case KEY_TYPE_reflink_v:
+       case KEY_TYPE_inline_data:
                return true;
        default:
                return false;
        }
 }
 
-/* Extent entry iteration: */
+static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
+{
+       struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+       struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
+       const struct bch_extent_ptr *ptr;
 
-#define extent_for_each_entry_from(_e, _entry, _start)                 \
-       __bkey_extent_entry_for_each_from(_start,                       \
-                               extent_entry_last(_e),_entry)
+       bkey_for_each_ptr(p, ptr)
+               ret.devs[ret.nr++] = ptr->dev;
 
-#define extent_for_each_entry(_e, _entry)                              \
-       extent_for_each_entry_from(_e, _entry, (_e).v->start)
+       return ret;
+}
 
-#define extent_ptr_next(_e, _ptr)                                      \
-       __bkey_ptr_next(_ptr, extent_entry_last(_e))
+static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
+{
+       struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+       struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
+       const struct bch_extent_ptr *ptr;
 
-#define extent_for_each_ptr(_e, _ptr)                                  \
-       __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr)
+       bkey_for_each_ptr(p, ptr)
+               if (!ptr->cached)
+                       ret.devs[ret.nr++] = ptr->dev;
 
-#define extent_for_each_ptr_decode(_e, _ptr, _entry)                   \
-       __bkey_for_each_ptr_decode((_e).k, (_e).v->start,               \
-                                  extent_entry_last(_e), _ptr, _entry)
+       return ret;
+}
 
-void bch2_extent_crc_append(struct bkey_i *,
-                           struct bch_extent_crc_unpacked);
-void bch2_extent_ptr_decoded_append(struct bkey_i *,
-                                   struct extent_ptr_decoded *);
+static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
+{
+       struct bch_devs_list ret = (struct bch_devs_list) { 0 };
+       struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
+       const struct bch_extent_ptr *ptr;
 
-bool bch2_can_narrow_extent_crcs(struct bkey_s_c,
-                                struct bch_extent_crc_unpacked);
-bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked);
+       bkey_for_each_ptr(p, ptr)
+               if (ptr->cached)
+                       ret.devs[ret.nr++] = ptr->dev;
+
+       return ret;
+}
 
+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
+unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
+unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
+bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned);
+unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
+
+void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s,
+                                   unsigned, unsigned);
+
+void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
+void bch2_extent_ptr_decoded_append(struct bkey_i *,
+                                   struct extent_ptr_decoded *);
 union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
                                           struct bch_extent_ptr *);
 
@@ -537,14 +514,34 @@ do {                                                                      \
        }                                                               \
 } while (0)
 
-void __bch2_cut_front(struct bpos, struct bkey_s);
+void bch2_bkey_drop_device(struct bkey_s, unsigned);
+const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned);
+bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned);
+
+bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c,
+                          struct bch_extent_ptr, u64);
+
+bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
+void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
+                           struct bkey_s_c);
+const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c);
+
+void bch2_ptr_swab(const struct bkey_format *, struct bkey_packed *);
+
+/* Generic extent code: */
+
+int bch2_cut_front_s(struct bpos, struct bkey_s);
+int bch2_cut_back_s(struct bpos, struct bkey_s);
 
 static inline void bch2_cut_front(struct bpos where, struct bkey_i *k)
 {
-       __bch2_cut_front(where, bkey_i_to_s(k));
+       bch2_cut_front_s(where, bkey_i_to_s(k));
 }
 
-bool bch2_cut_back(struct bpos, struct bkey *);
+static inline void bch2_cut_back(struct bpos where, struct bkey_i *k)
+{
+       bch2_cut_back_s(where, bkey_i_to_s(k));
+}
 
 /**
  * bch_key_resize - adjust size of @k
@@ -576,7 +573,4 @@ static inline void extent_save(struct btree *b, struct bkey_packed *dst,
                BUG_ON(!bch2_bkey_pack_key(dst, src, f));
 }
 
-bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned);
-unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
-
 #endif /* _BCACHEFS_EXTENTS_H */
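
Illustrative sketch, not from the commit: with bch2_bkey_ptrs_c()/bch2_bkey_ptrs() now exposed in the header, generic code can ask any key type for its [start, end) range of extent entries and walk it with bkey_for_each_ptr() and friends. The toy types below are stand-ins; they only show the shape of that pattern: resolve the range once, then iterate without caring which key type produced it.

#include <stdbool.h>
#include <stdio.h>

struct toy_ptr { unsigned dev; bool cached; };

struct toy_ptrs {			/* analogous to struct bkey_ptrs_c */
	const struct toy_ptr *start;
	const struct toy_ptr *end;
};

/* analogous to bch2_bkey_dirty_devs(): collect devices of non-cached ptrs */
static unsigned toy_dirty_devs(struct toy_ptrs p, unsigned *devs)
{
	const struct toy_ptr *ptr;
	unsigned nr = 0;

	for (ptr = p.start; ptr != p.end; ptr++)	/* bkey_for_each_ptr() stand-in */
		if (!ptr->cached)
			devs[nr++] = ptr->dev;
	return nr;
}

int main(void)
{
	const struct toy_ptr ptrs[] = {
		{ .dev = 0 }, { .dev = 2, .cached = true }, { .dev = 3 },
	};
	struct toy_ptrs p = { ptrs, ptrs + 3 };
	unsigned devs[3], nr = toy_dirty_devs(p, devs), i;

	for (i = 0; i < nr; i++)
		printf("dirty ptr on dev %u\n", devs[i]);
	return 0;
}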
index fd6eb00e144c2bcc07cad1c00d8f5b197453b877..bce25dde1172cc93cfb7524d0359d77cf10ae275 100644 (file)
@@ -3,11 +3,13 @@
 
 #include "bcachefs.h"
 #include "alloc_foreground.h"
+#include "bkey_on_stack.h"
 #include "btree_update.h"
 #include "buckets.h"
 #include "clock.h"
 #include "error.h"
 #include "extents.h"
+#include "extent_update.h"
 #include "fs.h"
 #include "fs-io.h"
 #include "fsck.h"
@@ -730,7 +732,7 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
        struct bvec_iter iter;
        struct bio_vec bv;
        unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v
-               ? 0 : bch2_bkey_nr_ptrs_allocated(k);
+               ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k);
        unsigned state = k.k->type == KEY_TYPE_reservation
                ? SECTOR_RESERVED
                : SECTOR_ALLOCATED;
@@ -748,6 +750,18 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
        }
 }
 
+static bool extent_partial_reads_expensive(struct bkey_s_c k)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       struct bch_extent_crc_unpacked crc;
+       const union bch_extent_entry *i;
+
+       bkey_for_each_crc(k.k, ptrs, crc, i)
+               if (crc.csum_type || crc.compression_type)
+                       return true;
+       return false;
+}
+
 static void readpage_bio_extend(struct readpages_iter *iter,
                                struct bio *bio,
                                unsigned sectors_this_extent,
@@ -801,15 +815,17 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter,
                       struct readpages_iter *readpages_iter)
 {
        struct bch_fs *c = trans->c;
+       struct bkey_on_stack sk;
        int flags = BCH_READ_RETRY_IF_STALE|
                BCH_READ_MAY_PROMOTE;
        int ret = 0;
 
        rbio->c = c;
        rbio->start_time = local_clock();
+
+       bkey_on_stack_init(&sk);
 retry:
        while (1) {
-               BKEY_PADDED(k) tmp;
                struct bkey_s_c k;
                unsigned bytes, sectors, offset_into_extent;
 
@@ -821,15 +837,16 @@ retry:
                if (ret)
                        break;
 
-               bkey_reassemble(&tmp.k, k);
-               k = bkey_i_to_s_c(&tmp.k);
+               bkey_on_stack_realloc(&sk, c, k.k->u64s);
+               bkey_reassemble(sk.k, k);
+               k = bkey_i_to_s_c(sk.k);
 
                offset_into_extent = iter->pos.offset -
                        bkey_start_offset(k.k);
                sectors = k.k->size - offset_into_extent;
 
                ret = bch2_read_indirect_extent(trans,
-                                       &offset_into_extent, &tmp.k);
+                                       &offset_into_extent, sk.k);
                if (ret)
                        break;
 
@@ -837,22 +854,9 @@ retry:
 
                bch2_trans_unlock(trans);
 
-               if (readpages_iter) {
-                       bool want_full_extent = false;
-
-                       if (bkey_extent_is_data(k.k)) {
-                               struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
-                               const union bch_extent_entry *i;
-                               struct extent_ptr_decoded p;
-
-                               bkey_for_each_ptr_decode(k.k, ptrs, p, i)
-                                       want_full_extent |= ((p.crc.csum_type != 0) |
-                                                            (p.crc.compression_type != 0));
-                       }
-
-                       readpage_bio_extend(readpages_iter, &rbio->bio,
-                                           sectors, want_full_extent);
-               }
+               if (readpages_iter)
+                       readpage_bio_extend(readpages_iter, &rbio->bio, sectors,
+                                           extent_partial_reads_expensive(k));
 
                bytes = min(sectors, bio_sectors(&rbio->bio)) << 9;
                swap(rbio->bio.bi_iter.bi_size, bytes);
@@ -866,7 +870,7 @@ retry:
                bch2_read_extent(c, rbio, k, offset_into_extent, flags);
 
                if (flags & BCH_READ_LAST_FRAGMENT)
-                       return;
+                       break;
 
                swap(rbio->bio.bi_iter.bi_size, bytes);
                bio_advance(&rbio->bio, bytes);
@@ -875,8 +879,12 @@ retry:
        if (ret == -EINTR)
                goto retry;
 
-       bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
-       bio_endio(&rbio->bio);
+       if (ret) {
+               bcache_io_error(c, &rbio->bio, "btree IO error %i", ret);
+               bio_endio(&rbio->bio);
+       }
+
+       bkey_on_stack_exit(&sk, c);
 }
 
 int bch2_readpages(struct file *file, struct address_space *mapping,
@@ -1046,6 +1054,18 @@ static void bch2_writepage_io_done(struct closure *cl)
                }
        }
 
+       if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) {
+               bio_for_each_segment_all(bvec, bio, iter) {
+                       struct bch_page_state *s;
+
+                       s = __bch2_page_state(bvec->bv_page);
+                       spin_lock(&s->lock);
+                       for (i = 0; i < PAGE_SECTORS; i++)
+                               s->s[i].nr_replicas = 0;
+                       spin_unlock(&s->lock);
+               }
+       }
+
        /*
         * racing with fallocate can cause us to add fewer sectors than
         * expected - but we shouldn't add more sectors than expected:
@@ -1089,6 +1109,7 @@ static void bch2_writepage_do_io(struct bch_writepage_state *w)
  * possible, else allocating a new one:
  */
 static void bch2_writepage_io_alloc(struct bch_fs *c,
+                                   struct writeback_control *wbc,
                                    struct bch_writepage_state *w,
                                    struct bch_inode_info *inode,
                                    u64 sector,
@@ -1113,6 +1134,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c,
        op->write_point         = writepoint_hashed(inode->ei_last_dirtied);
        op->pos                 = POS(inode->v.i_ino, sector);
        op->wbio.bio.bi_iter.bi_sector = sector;
+       op->wbio.bio.bi_opf     = wbc_to_write_flags(wbc);
 }
 
 static int __bch2_writepage(struct page *page,
@@ -1223,7 +1245,7 @@ do_io:
                        bch2_writepage_do_io(w);
 
                if (!w->io)
-                       bch2_writepage_io_alloc(c, w, inode, sector,
+                       bch2_writepage_io_alloc(c, wbc, w, inode, sector,
                                                nr_replicas_this_write);
 
                atomic_inc(&s->write_count);
@@ -1240,9 +1262,6 @@ do_io:
                w->io->op.i_sectors_delta -= dirty_sectors;
                w->io->op.new_i_size = i_size;
 
-               if (wbc->sync_mode == WB_SYNC_ALL)
-                       w->io->op.wbio.bio.bi_opf |= REQ_SYNC;
-
                offset += sectors;
        }
 
@@ -2382,6 +2401,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct address_space *mapping = inode->v.i_mapping;
+       struct bkey_on_stack copy;
        struct btree_trans trans;
        struct btree_iter *src, *dst, *del = NULL;
        loff_t shift, new_size;
@@ -2391,6 +2411,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
        if ((offset | len) & (block_bytes(c) - 1))
                return -EINVAL;
 
+       bkey_on_stack_init(&copy);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256);
 
        /*
@@ -2459,7 +2480,6 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
        while (1) {
                struct disk_reservation disk_res =
                        bch2_disk_reservation_init(c, 0);
-               BKEY_PADDED(k) copy;
                struct bkey_i delete;
                struct bkey_s_c k;
                struct bpos next_pos;
@@ -2484,34 +2504,35 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
                    bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0)
                        break;
 reassemble:
-               bkey_reassemble(&copy.k, k);
+               bkey_on_stack_realloc(&copy, c, k.k->u64s);
+               bkey_reassemble(copy.k, k);
 
                if (insert &&
                    bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) {
-                       bch2_cut_front(move_pos, &copy.k);
-                       bch2_btree_iter_set_pos(src, bkey_start_pos(&copy.k.k));
+                       bch2_cut_front(move_pos, copy.k);
+                       bch2_btree_iter_set_pos(src, bkey_start_pos(&copy.k->k));
                }
 
-               copy.k.k.p.offset += shift >> 9;
-               bch2_btree_iter_set_pos(dst, bkey_start_pos(&copy.k.k));
+               copy.k->k.p.offset += shift >> 9;
+               bch2_btree_iter_set_pos(dst, bkey_start_pos(&copy.k->k));
 
-               ret = bch2_extent_atomic_end(dst, &copy.k, &atomic_end);
+               ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end);
                if (ret)
                        goto bkey_err;
 
-               if (bkey_cmp(atomic_end, copy.k.k.p)) {
+               if (bkey_cmp(atomic_end, copy.k->k.p)) {
                        if (insert) {
                                move_pos = atomic_end;
                                move_pos.offset -= shift >> 9;
                                goto reassemble;
                        } else {
-                               bch2_cut_back(atomic_end, &copy.k.k);
+                               bch2_cut_back(atomic_end, copy.k);
                        }
                }
 
                bkey_init(&delete.k);
                delete.k.p = src->pos;
-               bch2_key_resize(&delete.k, copy.k.k.size);
+               bch2_key_resize(&delete.k, copy.k->k.size);
 
                next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
 
@@ -2524,12 +2545,12 @@ reassemble:
                 * by the triggers machinery:
                 */
                if (insert &&
-                   bkey_cmp(bkey_start_pos(&copy.k.k), delete.k.p) < 0) {
-                       bch2_cut_back(bkey_start_pos(&copy.k.k), &delete.k);
+                   bkey_cmp(bkey_start_pos(&copy.k->k), delete.k.p) < 0) {
+                       bch2_cut_back(bkey_start_pos(&copy.k->k), &delete);
                } else if (!insert &&
-                          bkey_cmp(copy.k.k.p,
+                          bkey_cmp(copy.k->k.p,
                                    bkey_start_pos(&delete.k)) > 0) {
-                       bch2_cut_front(copy.k.k.p, &delete);
+                       bch2_cut_front(copy.k->k.p, &delete);
 
                        del = bch2_trans_copy_iter(&trans, src);
                        BUG_ON(IS_ERR_OR_NULL(del));
@@ -2538,10 +2559,10 @@ reassemble:
                                bkey_start_pos(&delete.k));
                }
 
-               bch2_trans_update(&trans, dst, &copy.k);
+               bch2_trans_update(&trans, dst, copy.k);
                bch2_trans_update(&trans, del ?: src, &delete);
 
-               if (copy.k.k.size == k.k->size) {
+               if (copy.k->k.size == k.k->size) {
                        /*
                         * If we're moving the entire extent, we can skip
                         * running triggers:
@@ -2550,10 +2571,10 @@ reassemble:
                } else {
                        /* We might end up splitting compressed extents: */
                        unsigned nr_ptrs =
-                               bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&copy.k));
+                               bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k));
 
                        ret = bch2_disk_reservation_get(c, &disk_res,
-                                       copy.k.k.size, nr_ptrs,
+                                       copy.k->k.size, nr_ptrs,
                                        BCH_DISK_RESERVATION_NOFAIL);
                        BUG_ON(ret);
                }
@@ -2588,6 +2609,7 @@ bkey_err:
        }
 err:
        bch2_trans_exit(&trans);
+       bkey_on_stack_exit(&copy, c);
        bch2_pagecache_block_put(&inode->ei_pagecache_lock);
        inode_unlock(&inode->v);
        return ret;
@@ -2671,11 +2693,11 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
                reservation.k.p         = k.k->p;
                reservation.k.size      = k.k->size;
 
-               bch2_cut_front(iter->pos, &reservation.k_i);
-               bch2_cut_back(end_pos, &reservation.k);
+               bch2_cut_front(iter->pos,       &reservation.k_i);
+               bch2_cut_back(end_pos,          &reservation.k_i);
 
                sectors = reservation.k.size;
-               reservation.v.nr_replicas = bch2_bkey_nr_dirty_ptrs(k);
+               reservation.v.nr_replicas = bch2_bkey_nr_ptrs_allocated(k);
 
                if (!bkey_extent_is_allocation(k.k)) {
                        ret = bch2_quota_reservation_add(c, inode,
@@ -2686,7 +2708,7 @@ static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
                }
 
                if (reservation.v.nr_replicas < replicas ||
-                   bch2_extent_is_compressed(k)) {
+                   bch2_bkey_sectors_compressed(k)) {
                        ret = bch2_disk_reservation_get(c, &disk_res, sectors,
                                                        replicas, 0);
                        if (unlikely(ret))
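
Illustrative sketch, not part of the commit: the call sites above replace a fixed-size BKEY_PADDED(k) on the stack with the new bkey_on_stack helpers, which keep small keys in an inline buffer and fall back to an allocation when asked for more room. The toy_buf type below is a stand-in that only demonstrates that init / realloc-before-copy / exit pattern; error handling and resizing an already-heap buffer are omitted.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_buf {
	uint64_t inline_buf[4];
	uint64_t *k;		/* points at inline_buf or a heap buffer */
	size_t    u64s;		/* current capacity, in u64s */
};

static void toy_init(struct toy_buf *b)
{
	b->k = b->inline_buf;
	b->u64s = 4;
}

static void toy_realloc(struct toy_buf *b, size_t u64s)
{
	if (u64s <= b->u64s)
		return;
	b->k = malloc(u64s * sizeof(uint64_t));	/* allocation failure not handled here */
	b->u64s = u64s;
}

static void toy_exit(struct toy_buf *b)
{
	if (b->k != b->inline_buf)
		free(b->k);
}

int main(void)
{
	struct toy_buf sk;
	uint64_t big_key[16] = { 42 };	/* pretend this came from the btree */

	toy_init(&sk);
	toy_realloc(&sk, 16);		/* grow before the bkey_reassemble()-style copy */
	memcpy(sk.k, big_key, sizeof(big_key));
	printf("copied key, first word %llu\n", (unsigned long long)sk.k[0]);
	toy_exit(&sk);
	return 0;
}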
index cd3540d092f9b4b218ac65f4f09377408b381267..1a0e3942635fbaf7a2454ac76f7e861c382a12ee 100644 (file)
@@ -3,6 +3,7 @@
 
 #include "bcachefs.h"
 #include "acl.h"
+#include "bkey_on_stack.h"
 #include "btree_update.h"
 #include "buckets.h"
 #include "chardev.h"
@@ -850,7 +851,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
-       BKEY_PADDED(k) cur, prev;
+       struct bkey_on_stack cur, prev;
        struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
        unsigned offset_into_extent, sectors;
        bool have_extent = false;
@@ -859,6 +860,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
        if (start + len < start)
                return -EINVAL;
 
+       bkey_on_stack_init(&cur);
+       bkey_on_stack_init(&prev);
        bch2_trans_init(&trans, c, 0, 0);
 
        iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
@@ -873,15 +876,17 @@ retry:
                        continue;
                }
 
-               bkey_reassemble(&cur.k, k);
-               k = bkey_i_to_s_c(&cur.k);
+               bkey_on_stack_realloc(&cur, c, k.k->u64s);
+               bkey_on_stack_realloc(&prev, c, k.k->u64s);
+               bkey_reassemble(cur.k, k);
+               k = bkey_i_to_s_c(cur.k);
 
                offset_into_extent      = iter->pos.offset -
                        bkey_start_offset(k.k);
                sectors                 = k.k->size - offset_into_extent;
 
                ret = bch2_read_indirect_extent(&trans,
-                                       &offset_into_extent, &cur.k);
+                                       &offset_into_extent, cur.k);
                if (ret)
                        break;
 
@@ -891,19 +896,19 @@ retry:
                        bch2_cut_front(POS(k.k->p.inode,
                                           bkey_start_offset(k.k) +
                                           offset_into_extent),
-                                      &cur.k);
-               bch2_key_resize(&cur.k.k, sectors);
-               cur.k.k.p = iter->pos;
-               cur.k.k.p.offset += cur.k.k.size;
+                                      cur.k);
+               bch2_key_resize(&cur.k->k, sectors);
+               cur.k->k.p = iter->pos;
+               cur.k->k.p.offset += cur.k->k.size;
 
                if (have_extent) {
                        ret = bch2_fill_extent(c, info,
-                                       bkey_i_to_s_c(&prev.k), 0);
+                                       bkey_i_to_s_c(prev.k), 0);
                        if (ret)
                                break;
                }
 
-               bkey_copy(&prev.k, &cur.k);
+               bkey_copy(prev.k, cur.k);
                have_extent = true;
 
                if (k.k->type == KEY_TYPE_reflink_v)
@@ -916,10 +921,12 @@ retry:
                goto retry;
 
        if (!ret && have_extent)
-               ret = bch2_fill_extent(c, info, bkey_i_to_s_c(&prev.k),
+               ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k),
                                       FIEMAP_EXTENT_LAST);
 
        ret = bch2_trans_exit(&trans) ?: ret;
+       bkey_on_stack_exit(&cur, c);
+       bkey_on_stack_exit(&prev, c);
        return ret < 0 ? ret : 0;
 }
 
index e3ef662e2a12e2a8c9bdc5cbbd11cdc6c6faa96b..ca891b52706f0e6c7576cba2e62244f7ee40313d 100644 (file)
@@ -8,6 +8,7 @@
 
 #include "bcachefs.h"
 #include "alloc_foreground.h"
+#include "bkey_on_stack.h"
 #include "bset.h"
 #include "btree_update.h"
 #include "buckets.h"
@@ -18,7 +19,7 @@
 #include "disk_groups.h"
 #include "ec.h"
 #include "error.h"
-#include "extents.h"
+#include "extent_update.h"
 #include "inode.h"
 #include "io.h"
 #include "journal.h"
@@ -191,8 +192,8 @@ static int sum_sector_overwrites(struct btree_trans *trans,
 
        for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) {
                if (!may_allocate &&
-                   bch2_bkey_nr_ptrs_allocated(old) <
-                   bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new))) {
+                   bch2_bkey_nr_ptrs_fully_allocated(old) <
+                   bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) {
                        ret = -ENOSPC;
                        break;
                }
@@ -334,7 +335,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter,
 
                /* create the biggest key we can */
                bch2_key_resize(&delete.k, max_sectors);
-               bch2_cut_back(end, &delete.k);
+               bch2_cut_back(end, &delete);
 
                bch2_trans_begin_updates(trans);
 
@@ -384,12 +385,14 @@ int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end,
 int bch2_write_index_default(struct bch_write_op *op)
 {
        struct bch_fs *c = op->c;
+       struct bkey_on_stack sk;
        struct keylist *keys = &op->insert_keys;
        struct bkey_i *k = bch2_keylist_front(keys);
        struct btree_trans trans;
        struct btree_iter *iter;
        int ret;
 
+       bkey_on_stack_init(&sk);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
 
        iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
@@ -397,13 +400,15 @@ int bch2_write_index_default(struct bch_write_op *op)
                                   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
 
        do {
-               BKEY_PADDED(k) tmp;
+               k = bch2_keylist_front(keys);
 
-               bkey_copy(&tmp.k, bch2_keylist_front(keys));
+               bkey_on_stack_realloc(&sk, c, k->k.u64s);
+               bkey_copy(sk.k, k);
+               bch2_cut_front(iter->pos, sk.k);
 
                bch2_trans_begin_updates(&trans);
 
-               ret = bch2_extent_update(&trans, iter, &tmp.k,
+               ret = bch2_extent_update(&trans, iter, sk.k,
                                         &op->res, op_journal_seq(op),
                                         op->new_i_size, &op->i_sectors_delta);
                if (ret == -EINTR)
@@ -411,13 +416,12 @@ int bch2_write_index_default(struct bch_write_op *op)
                if (ret)
                        break;
 
-               if (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) < 0)
-                       bch2_cut_front(iter->pos, bch2_keylist_front(keys));
-               else
+               if (bkey_cmp(iter->pos, k->k.p) >= 0)
                        bch2_keylist_pop_front(keys);
        } while (!bch2_keylist_empty(keys));
 
        bch2_trans_exit(&trans);
+       bkey_on_stack_exit(&sk, c);
 
        return ret;
 }
@@ -519,16 +523,19 @@ static void __bch2_write_index(struct bch_write_op *op)
 
        for (src = keys->keys; src != keys->top; src = n) {
                n = bkey_next(src);
-               bkey_copy(dst, src);
 
-               bch2_bkey_drop_ptrs(bkey_i_to_s(dst), ptr,
-                       test_bit(ptr->dev, op->failed.d));
+               if (bkey_extent_is_direct_data(&src->k)) {
+                       bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr,
+                                           test_bit(ptr->dev, op->failed.d));
 
-               if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(dst))) {
-                       ret = -EIO;
-                       goto err;
+                       if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) {
+                               ret = -EIO;
+                               goto err;
+                       }
                }
 
+               if (dst != src)
+                       memmove_u64s_down(dst, src, src->u64s);
                dst = bkey_next(dst);
        }
 
@@ -1086,7 +1093,7 @@ again:
 
                bio->bi_end_io  = bch2_write_endio;
                bio->bi_private = &op->cl;
-               bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+               bio->bi_opf |= REQ_OP_WRITE;
 
                if (!skip_put)
                        closure_get(bio->bi_private);
@@ -1123,6 +1130,47 @@ flush_io:
        goto again;
 }
 
+static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len)
+{
+       struct closure *cl = &op->cl;
+       struct bio *bio = &op->wbio.bio;
+       struct bvec_iter iter;
+       struct bkey_i_inline_data *id;
+       unsigned sectors;
+       int ret;
+
+       ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys,
+                                  ARRAY_SIZE(op->inline_keys),
+                                  BKEY_U64s + DIV_ROUND_UP(data_len, 8));
+       if (ret) {
+               op->error = ret;
+               goto err;
+       }
+
+       sectors = bio_sectors(bio);
+       op->pos.offset += sectors;
+
+       id = bkey_inline_data_init(op->insert_keys.top);
+       id->k.p         = op->pos;
+       id->k.version   = op->version;
+       id->k.size      = sectors;
+
+       iter = bio->bi_iter;
+       iter.bi_size = data_len;
+       memcpy_from_bio(id->v.data, bio, iter);
+
+       while (data_len & 7)
+               id->v.data[data_len++] = '\0';
+       set_bkey_val_bytes(&id->k, data_len);
+       bch2_keylist_push(&op->insert_keys);
+
+       op->flags |= BCH_WRITE_WROTE_DATA_INLINE;
+       continue_at_nobarrier(cl, bch2_write_index, NULL);
+       return;
+err:
+       bch2_write_done(&op->cl);
+}
+
 /**
  * bch_write - handle a write to a cache device or flash only volume
  *
@@ -1144,22 +1192,22 @@ void bch2_write(struct closure *cl)
        struct bch_write_op *op = container_of(cl, struct bch_write_op, cl);
        struct bio *bio = &op->wbio.bio;
        struct bch_fs *c = op->c;
+       unsigned data_len;
 
        BUG_ON(!op->nr_replicas);
        BUG_ON(!op->write_point.v);
        BUG_ON(!bkey_cmp(op->pos, POS_MAX));
 
+       op->start_time = local_clock();
+       bch2_keylist_init(&op->insert_keys, op->inline_keys);
+       wbio_init(bio)->put_bio = false;
+
        if (bio_sectors(bio) & (c->opts.block_size - 1)) {
                __bcache_io_error(c, "misaligned write");
                op->error = -EIO;
                goto err;
        }
 
-       op->start_time = local_clock();
-
-       bch2_keylist_init(&op->insert_keys, op->inline_keys);
-       wbio_init(bio)->put_bio = false;
-
        if (c->opts.nochanges ||
            !percpu_ref_tryget(&c->writes)) {
                __bcache_io_error(c, "read only");
@@ -1169,12 +1217,25 @@ void bch2_write(struct closure *cl)
 
        bch2_increment_clock(c, bio_sectors(bio), WRITE);
 
+       data_len = min_t(u64, bio->bi_iter.bi_size,
+                        op->new_i_size - (op->pos.offset << 9));
+
+       if (data_len <= min(block_bytes(c) / 2, 1024U)) {
+               bch2_write_data_inline(op, data_len);
+               return;
+       }
+
        continue_at_nobarrier(cl, __bch2_write, NULL);
        return;
 err:
        if (!(op->flags & BCH_WRITE_NOPUT_RESERVATION))
                bch2_disk_reservation_put(c, &op->res);
-       closure_return(cl);
+       if (op->end_io)
+               op->end_io(op);
+       if (cl->parent)
+               closure_return(cl);
+       else
+               closure_debug_destroy(cl);
 }
 
 /* Cache promotion on read */
@@ -1456,13 +1517,14 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio
 {
        struct btree_trans trans;
        struct btree_iter *iter;
-       BKEY_PADDED(k) tmp;
+       struct bkey_on_stack sk;
        struct bkey_s_c k;
        int ret;
 
        flags &= ~BCH_READ_LAST_FRAGMENT;
        flags |= BCH_READ_MUST_CLONE;
 
+       bkey_on_stack_init(&sk);
        bch2_trans_init(&trans, c, 0, 0);
 
        iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
@@ -1474,11 +1536,12 @@ retry:
        if (bkey_err(k))
                goto err;
 
-       bkey_reassemble(&tmp.k, k);
-       k = bkey_i_to_s_c(&tmp.k);
+       bkey_on_stack_realloc(&sk, c, k.k->u64s);
+       bkey_reassemble(sk.k, k);
+       k = bkey_i_to_s_c(sk.k);
        bch2_trans_unlock(&trans);
 
-       if (!bch2_bkey_matches_ptr(c, bkey_i_to_s_c(&tmp.k),
+       if (!bch2_bkey_matches_ptr(c, k,
                                   rbio->pick.ptr,
                                   rbio->pos.offset -
                                   rbio->pick.crc.offset)) {
@@ -1495,6 +1558,7 @@ retry:
 out:
        bch2_rbio_done(rbio);
        bch2_trans_exit(&trans);
+       bkey_on_stack_exit(&sk, c);
        return;
 err:
        rbio->bio.bi_status = BLK_STS_IOERR;
@@ -1507,12 +1571,14 @@ static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
 {
        struct btree_trans trans;
        struct btree_iter *iter;
+       struct bkey_on_stack sk;
        struct bkey_s_c k;
        int ret;
 
        flags &= ~BCH_READ_LAST_FRAGMENT;
        flags |= BCH_READ_MUST_CLONE;
 
+       bkey_on_stack_init(&sk);
        bch2_trans_init(&trans, c, 0, 0);
 retry:
        bch2_trans_begin(&trans);
@@ -1520,18 +1586,18 @@ retry:
        for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS,
                           POS(inode, bvec_iter.bi_sector),
                           BTREE_ITER_SLOTS, k, ret) {
-               BKEY_PADDED(k) tmp;
                unsigned bytes, sectors, offset_into_extent;
 
-               bkey_reassemble(&tmp.k, k);
-               k = bkey_i_to_s_c(&tmp.k);
+               bkey_on_stack_realloc(&sk, c, k.k->u64s);
+               bkey_reassemble(sk.k, k);
+               k = bkey_i_to_s_c(sk.k);
 
                offset_into_extent = iter->pos.offset -
                        bkey_start_offset(k.k);
                sectors = k.k->size - offset_into_extent;
 
                ret = bch2_read_indirect_extent(&trans,
-                                       &offset_into_extent, &tmp.k);
+                                       &offset_into_extent, sk.k);
                if (ret)
                        break;
 
@@ -1570,6 +1636,7 @@ err:
        rbio->bio.bi_status = BLK_STS_IOERR;
 out:
        bch2_trans_exit(&trans);
+       bkey_on_stack_exit(&sk, c);
        bch2_rbio_done(rbio);
 }
 
@@ -1626,7 +1693,7 @@ static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
-       BKEY_PADDED(k) new;
+       struct bkey_on_stack new;
        struct bch_extent_crc_unpacked new_crc;
        u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset;
        int ret;
@@ -1634,6 +1701,7 @@ static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
        if (rbio->pick.crc.compression_type)
                return;
 
+       bkey_on_stack_init(&new);
        bch2_trans_init(&trans, c, 0, 0);
 retry:
        bch2_trans_begin(&trans);
@@ -1644,8 +1712,9 @@ retry:
        if (IS_ERR_OR_NULL(k.k))
                goto out;
 
-       bkey_reassemble(&new.k, k);
-       k = bkey_i_to_s_c(&new.k);
+       bkey_on_stack_realloc(&new, c, k.k->u64s);
+       bkey_reassemble(new.k, k);
+       k = bkey_i_to_s_c(new.k);
 
        if (bversion_cmp(k.k->version, rbio->version) ||
            !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
@@ -1664,10 +1733,10 @@ retry:
                goto out;
        }
 
-       if (!bch2_bkey_narrow_crcs(&new.k, new_crc))
+       if (!bch2_bkey_narrow_crcs(new.k, new_crc))
                goto out;
 
-       bch2_trans_update(&trans, iter, &new.k);
+       bch2_trans_update(&trans, iter, new.k);
        ret = bch2_trans_commit(&trans, NULL, NULL,
                                BTREE_INSERT_ATOMIC|
                                BTREE_INSERT_NOFAIL|
@@ -1676,6 +1745,7 @@ retry:
                goto retry;
 out:
        bch2_trans_exit(&trans);
+       bkey_on_stack_exit(&new, c);
 }
 
 /* Inner part that may run in process context */
@@ -1872,6 +1942,19 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig,
        struct bpos pos = bkey_start_pos(k.k);
        int pick_ret;
 
+       if (k.k->type == KEY_TYPE_inline_data) {
+               struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k);
+               unsigned bytes = min_t(unsigned, iter.bi_size,
+                                      bkey_val_bytes(d.k));
+
+               swap(iter.bi_size, bytes);
+               memcpy_to_bio(&orig->bio, iter, d.v->data);
+               swap(iter.bi_size, bytes);
+               bio_advance_iter(&orig->bio, &iter, bytes);
+               zero_fill_bio_iter(&orig->bio, iter);
+               goto out_read_done;
+       }
+
        pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick);
 
        /* hole or reservation - just zero fill: */
@@ -2100,6 +2183,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
+       struct bkey_on_stack sk;
        struct bkey_s_c k;
        unsigned flags = BCH_READ_RETRY_IF_STALE|
                BCH_READ_MAY_PROMOTE|
@@ -2113,6 +2197,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
        rbio->c = c;
        rbio->start_time = local_clock();
 
+       bkey_on_stack_init(&sk);
        bch2_trans_init(&trans, c, 0, 0);
 retry:
        bch2_trans_begin(&trans);
@@ -2121,7 +2206,6 @@ retry:
                                   POS(inode, rbio->bio.bi_iter.bi_sector),
                                   BTREE_ITER_SLOTS);
        while (1) {
-               BKEY_PADDED(k) tmp;
                unsigned bytes, sectors, offset_into_extent;
 
                bch2_btree_iter_set_pos(iter,
@@ -2132,15 +2216,16 @@ retry:
                if (ret)
                        goto err;
 
-               bkey_reassemble(&tmp.k, k);
-               k = bkey_i_to_s_c(&tmp.k);
-
                offset_into_extent = iter->pos.offset -
                        bkey_start_offset(k.k);
                sectors = k.k->size - offset_into_extent;
 
+               bkey_on_stack_realloc(&sk, c, k.k->u64s);
+               bkey_reassemble(sk.k, k);
+               k = bkey_i_to_s_c(sk.k);
+
                ret = bch2_read_indirect_extent(&trans,
-                                       &offset_into_extent, &tmp.k);
+                                       &offset_into_extent, sk.k);
                if (ret)
                        goto err;
 
@@ -2172,6 +2257,7 @@ retry:
        }
 out:
        bch2_trans_exit(&trans);
+       bkey_on_stack_exit(&sk, c);
        return;
 err:
        if (ret == -EINTR)
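
Nearly every hunk above replaces a fixed-size BKEY_PADDED(k) temporary with the new struct bkey_on_stack, following the same init / realloc-to-fit / copy / exit pattern. The new header libbcachefs/bkey_on_stack.h is not shown in this portion of the diff; a minimal sketch of the helper these call sites assume -- an inline buffer that spills oversized keys into the c->large_bkey_pool mempool added in super.c below -- might look like the following (illustrative only, not part of the patch):

	struct bkey_on_stack {
		struct bkey_i	*k;
		u64		onstack[12];
	};

	static inline void bkey_on_stack_init(struct bkey_on_stack *s)
	{
		/* start out pointing at the inline buffer */
		s->k = (void *) s->onstack;
	}

	static inline void bkey_on_stack_realloc(struct bkey_on_stack *s,
						 struct bch_fs *c, unsigned u64s)
	{
		/* spill to the mempool only if the key won't fit inline */
		if (s->k == (void *) s->onstack &&
		    u64s > ARRAY_SIZE(s->onstack)) {
			s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS);
			memcpy(s->k, s->onstack, sizeof(s->onstack));
		}
	}

	static inline void bkey_on_stack_exit(struct bkey_on_stack *s,
					      struct bch_fs *c)
	{
		if (s->k != (void *) s->onstack)
			mempool_free(s->k, &c->large_bkey_pool);
		s->k = NULL;
	}

With a helper along these lines, the common case (keys small enough for the inline buffer) stays allocation-free, which is why the converted call sites can call bkey_on_stack_realloc(&sk, c, k.k->u64s) on every loop iteration before bkey_reassemble() without a per-key heap allocation.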
index 91aaa58fce4e54c94681b52abc71f2d4b34c789b..45c950942d784f89c1e2f2f1355f6446e46b4e7c 100644 (file)
@@ -30,10 +30,11 @@ enum bch_write_flags {
        BCH_WRITE_PAGES_OWNED           = (1 << 5),
        BCH_WRITE_ONLY_SPECIFIED_DEVS   = (1 << 6),
        BCH_WRITE_NOPUT_RESERVATION     = (1 << 7),
+       BCH_WRITE_WROTE_DATA_INLINE     = (1 << 8),
 
        /* Internal: */
-       BCH_WRITE_JOURNAL_SEQ_PTR       = (1 << 8),
-       BCH_WRITE_SKIP_CLOSURE_PUT      = (1 << 9),
+       BCH_WRITE_JOURNAL_SEQ_PTR       = (1 << 9),
+       BCH_WRITE_SKIP_CLOSURE_PUT      = (1 << 10),
 };
 
 static inline u64 *op_journal_seq(struct bch_write_op *op)
index 5c3e146e3942ac673735c2fb1bcde80782b6e3ca..9f03a479c9a2f01342607987ec712ed5df2d553b 100644 (file)
@@ -945,7 +945,7 @@ static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx)
        w = j->buf + !state.idx;
 
        ret = state.prev_buf_unwritten &&
-               bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), dev_idx);
+               bch2_bkey_has_device(bkey_i_to_s_c(&w->key), dev_idx);
        spin_unlock(&j->lock);
 
        return ret;
index 387377dadab53c729b75b8df729c9ec5e53443fc..7112a25d0600e1d246517cbaf5263c8a827a9fde 100644 (file)
@@ -1100,7 +1100,7 @@ void bch2_journal_write(struct closure *cl)
 
        for_each_rw_member(ca, c, i)
                if (journal_flushes_device(ca) &&
-                   !bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), i)) {
+                   !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
                        percpu_ref_get(&ca->io_ref);
 
                        bio = ca->journal.bio;
index de8522f754e284acc986d115ac8f5f099b4d9bd5..4dacbd637d021b5121877f23311989d7b9d19b0c 100644 (file)
@@ -4,6 +4,7 @@
  */
 
 #include "bcachefs.h"
+#include "bkey_on_stack.h"
 #include "btree_update.h"
 #include "btree_update_interior.h"
 #include "buckets.h"
@@ -40,9 +41,10 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
-       BKEY_PADDED(key) tmp;
+       struct bkey_on_stack sk;
        int ret = 0;
 
+       bkey_on_stack_init(&sk);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
        iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
@@ -58,9 +60,10 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
                        continue;
                }
 
-               bkey_reassemble(&tmp.key, k);
+               bkey_on_stack_realloc(&sk, c, k.k->u64s);
+               bkey_reassemble(sk.k, k);
 
-               ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.key),
+               ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k),
                                    dev_idx, flags, false);
                if (ret)
                        break;
@@ -70,11 +73,11 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
                 * will do the appropriate thing with it (turning it into a
                 * KEY_TYPE_error key, or just a discard if it was a cached extent)
                 */
-               bch2_extent_normalize(c, bkey_i_to_s(&tmp.key));
+               bch2_extent_normalize(c, bkey_i_to_s(sk.k));
 
-               bch2_btree_iter_set_pos(iter, bkey_start_pos(&tmp.key.k));
+               bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
 
-               bch2_trans_update(&trans, iter, &tmp.key);
+               bch2_trans_update(&trans, iter, sk.k);
 
                ret = bch2_trans_commit(&trans, NULL, NULL,
                                        BTREE_INSERT_ATOMIC|
@@ -92,6 +95,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
        }
 
        ret = bch2_trans_exit(&trans) ?: ret;
+       bkey_on_stack_exit(&sk, c);
 
        BUG_ON(ret == -EINTR);
 
index ab20e981145b578f5f71f0243001d6ca3b0b3297..acdc1730e218d80789ae61858ee119ad749dfc11 100644 (file)
@@ -2,6 +2,7 @@
 
 #include "bcachefs.h"
 #include "alloc_foreground.h"
+#include "bkey_on_stack.h"
 #include "btree_gc.h"
 #include "btree_update.h"
 #include "btree_update_interior.h"
@@ -96,10 +97,11 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
 
                bkey_copy(&_new.k, bch2_keylist_front(keys));
                new = bkey_i_to_extent(&_new.k);
+               bch2_cut_front(iter->pos, &new->k_i);
 
-               bch2_cut_front(iter->pos, insert);
-               bch2_cut_back(new->k.p, &insert->k);
-               bch2_cut_back(insert->k.p, &new->k);
+               bch2_cut_front(iter->pos,       insert);
+               bch2_cut_back(new->k.p,         insert);
+               bch2_cut_back(insert->k.p,      &new->k_i);
 
                if (m->data_cmd == DATA_REWRITE)
                        bch2_bkey_drop_device(bkey_i_to_s(insert),
@@ -133,11 +135,11 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                 * If we're not fully overwriting @k, and it's compressed, we
                 * need a reservation for all the pointers in @insert
                 */
-               nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(insert)) -
+               nr = bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(insert)) -
                         m->nr_ptrs_reserved;
 
                if (insert->k.size < k.k->size &&
-                   bch2_extent_is_compressed(k) &&
+                   bch2_bkey_sectors_compressed(k) &&
                    nr > 0) {
                        ret = bch2_disk_reservation_add(c, &op->res,
                                        keylist_sectors(keys) * nr, 0);
@@ -168,8 +170,6 @@ next:
                        if (bch2_keylist_empty(keys))
                                goto out;
                }
-
-               bch2_cut_front(iter->pos, bch2_keylist_front(keys));
                continue;
 nomatch:
                if (m->ctxt)
@@ -251,7 +251,7 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m,
                 */
 #if 0
                int nr = (int) io_opts.data_replicas -
-                       bch2_bkey_nr_dirty_ptrs(k);
+                       bch2_bkey_nr_ptrs_allocated(k);
 #endif
                int nr = (int) io_opts.data_replicas;
 
@@ -490,7 +490,7 @@ static int __bch2_move_data(struct bch_fs *c,
 {
        bool kthread = (current->flags & PF_KTHREAD) != 0;
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
-       BKEY_PADDED(k) tmp;
+       struct bkey_on_stack sk;
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
@@ -499,6 +499,7 @@ static int __bch2_move_data(struct bch_fs *c,
        u64 delay, cur_inum = U64_MAX;
        int ret = 0, ret2;
 
+       bkey_on_stack_init(&sk);
        bch2_trans_init(&trans, c, 0, 0);
 
        stats->data_type = BCH_DATA_USER;
@@ -578,8 +579,9 @@ peek:
                }
 
                /* unlock before doing IO: */
-               bkey_reassemble(&tmp.k, k);
-               k = bkey_i_to_s_c(&tmp.k);
+               bkey_on_stack_realloc(&sk, c, k.k->u64s);
+               bkey_reassemble(sk.k, k);
+               k = bkey_i_to_s_c(sk.k);
                bch2_trans_unlock(&trans);
 
                ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k,
@@ -598,7 +600,7 @@ peek:
                if (rate)
                        bch2_ratelimit_increment(rate, k.k->size);
 next:
-               atomic64_add(k.k->size * bch2_bkey_nr_dirty_ptrs(k),
+               atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k),
                             &stats->sectors_seen);
 next_nondata:
                bch2_btree_iter_next(iter);
@@ -606,6 +608,7 @@ next_nondata:
        }
 out:
        ret = bch2_trans_exit(&trans) ?: ret;
+       bkey_on_stack_exit(&sk, c);
 
        return ret;
 }
index d1184bf62cae4e120cc61adf56df7351e2ff1921..d4002b7fc917e4dc1645787bc490ec7dee775b56 100644 (file)
@@ -177,7 +177,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
                        if ((cmp_int(i[0].journal_seq, i[1].journal_seq) ?:
                             cmp_int(i[0].journal_offset, i[1].journal_offset)) < 0) {
                                if (bkey_cmp(i[0].k->k.p, i[1].k->k.p) <= 0) {
-                                       bch2_cut_back(bkey_start_pos(&i[1].k->k), &i[0].k->k);
+                                       bch2_cut_back(bkey_start_pos(&i[1].k->k), i[0].k);
                                } else {
                                        struct bkey_i *split =
                                                kmalloc(bkey_bytes(i[0].k), GFP_KERNEL);
@@ -186,7 +186,7 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
                                                goto err;
 
                                        bkey_copy(split, i[0].k);
-                                       bch2_cut_back(bkey_start_pos(&i[1].k->k), &split->k);
+                                       bch2_cut_back(bkey_start_pos(&i[1].k->k), split);
                                        keys_deduped.d[keys_deduped.nr++] = (struct journal_key) {
                                                .btree_id       = i[0].btree_id,
                                                .allocated      = true,
@@ -254,7 +254,7 @@ static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id,
         * Some extents aren't equivalent - w.r.t. what the triggers do
         * - if they're split:
         */
-       bool remark_if_split = bch2_extent_is_compressed(bkey_i_to_s_c(k)) ||
+       bool remark_if_split = bch2_bkey_sectors_compressed(bkey_i_to_s_c(k)) ||
                k->k.type == KEY_TYPE_reflink_p;
        bool remark = false;
        int ret;
@@ -289,7 +289,7 @@ retry:
                    bkey_cmp(atomic_end, k->k.p) < 0) {
                        ret = bch2_disk_reservation_add(c, &disk_res,
                                        k->k.size *
-                                       bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(k)),
+                                       bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(k)),
                                        BCH_DISK_RESERVATION_NOFAIL);
                        BUG_ON(ret);
 
@@ -298,7 +298,7 @@ retry:
 
                bkey_copy(split, k);
                bch2_cut_front(split_iter->pos, split);
-               bch2_cut_back(atomic_end, &split->k);
+               bch2_cut_back(atomic_end, split);
 
                bch2_trans_update(&trans, split_iter, split);
                bch2_btree_iter_set_pos(iter, split->k.p);
@@ -913,6 +913,12 @@ int bch2_fs_recovery(struct bch_fs *c)
                write_sb = true;
        }
 
+       if (!(c->sb.features & (1ULL << BCH_FEATURE_INLINE_DATA))) {
+               c->disk_sb.sb->features[0] |=
+                       cpu_to_le64(1ULL << BCH_FEATURE_INLINE_DATA);
+               write_sb = true;
+       }
+
        if (!test_bit(BCH_FS_ERROR, &c->flags)) {
                c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO;
                write_sb = true;
index 6e71c5e8f9a20d078423ce615b169aca4419e8ef..4de65bf70362e8d0354538bdea61c8bd80475249 100644 (file)
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
+#include "bkey_on_stack.h"
 #include "btree_update.h"
 #include "extents.h"
 #include "inode.h"
@@ -39,7 +40,7 @@ enum merge_result bch2_reflink_p_merge(struct bch_fs *c,
 
        if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) {
                bch2_key_resize(l.k, KEY_SIZE_MAX);
-               __bch2_cut_front(l.k->p, _r);
+               bch2_cut_front_s(l.k->p, _r);
                return BCH_MERGE_PARTIAL;
        }
 
@@ -160,7 +161,8 @@ s64 bch2_remap_range(struct bch_fs *c,
        struct btree_trans trans;
        struct btree_iter *dst_iter, *src_iter;
        struct bkey_s_c src_k;
-       BKEY_PADDED(k) new_dst, new_src;
+       BKEY_PADDED(k) new_dst;
+       struct bkey_on_stack new_src;
        struct bpos dst_end = dst_start, src_end = src_start;
        struct bpos dst_want, src_want;
        u64 src_done, dst_done;
@@ -183,6 +185,7 @@ s64 bch2_remap_range(struct bch_fs *c,
        dst_end.offset += remap_sectors;
        src_end.offset += remap_sectors;
 
+       bkey_on_stack_init(&new_src);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
 
        src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start,
@@ -222,14 +225,15 @@ s64 bch2_remap_range(struct bch_fs *c,
                        break;
 
                if (src_k.k->type == KEY_TYPE_extent) {
-                       bkey_reassemble(&new_src.k, src_k);
-                       src_k = bkey_i_to_s_c(&new_src.k);
+                       bkey_on_stack_realloc(&new_src, c, src_k.k->u64s);
+                       bkey_reassemble(new_src.k, src_k);
+                       src_k = bkey_i_to_s_c(new_src.k);
 
-                       bch2_cut_front(src_iter->pos,   &new_src.k);
-                       bch2_cut_back(src_end,          &new_src.k.k);
+                       bch2_cut_front(src_iter->pos,   new_src.k);
+                       bch2_cut_back(src_end,          new_src.k);
 
                        ret = bch2_make_extent_indirect(&trans, src_iter,
-                                               bkey_i_to_extent(&new_src.k));
+                                               bkey_i_to_extent(new_src.k));
                        if (ret)
                                goto btree_err;
 
@@ -299,6 +303,7 @@ err:
        } while (ret2 == -EINTR);
 
        ret = bch2_trans_exit(&trans) ?: ret;
+       bkey_on_stack_exit(&new_src, c);
 
        percpu_ref_put(&c->writes);
 
index 4145832f48566db522d8b9789c37b9e5ebc4649d..ac2f31e35ef44102d6e3abab3ff8a6014033a007 100644 (file)
@@ -506,6 +506,7 @@ static void bch2_fs_free(struct bch_fs *c)
        free_percpu(c->usage[0]);
        kfree(c->usage_base);
        free_percpu(c->pcpu);
+       mempool_exit(&c->large_bkey_pool);
        mempool_exit(&c->btree_bounce_pool);
        bioset_exit(&c->btree_bio);
        mempool_exit(&c->btree_interior_update_pool);
@@ -758,6 +759,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
            !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
            mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
                                        btree_bytes(c)) ||
+           mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
            bch2_io_clock_init(&c->io_clock[READ]) ||
            bch2_io_clock_init(&c->io_clock[WRITE]) ||
            bch2_fs_journal_init(&c->journal) ||
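
The 2048-byte element size for large_bkey_pool appears to be chosen so that a single element can back any possible key; a rough bound, sketched here for illustration rather than taken from the patch:

	/*
	 * Illustrative arithmetic (not part of the patch): struct bkey's u64s
	 * field is a __u8 counted in 64-bit words, so the largest possible key
	 * is U8_MAX * sizeof(u64) = 255 * 8 = 2040 bytes. A 2048-byte pool
	 * element can therefore hold any bkey_on_stack that outgrows its
	 * inline buffer, and the single preallocated element keeps the
	 * fallback path usable even under memory pressure.
	 */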
index 2cc433ec0e3a5a83aba673570828034a0fe7b3f9..e69d03d1109ff0cf4cd3acb1ecc886b65c5e3a4c 100644 (file)
@@ -550,7 +550,7 @@ size_t bch2_rand_range(size_t max)
        return rand;
 }
 
-void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, void *src)
+void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src)
 {
        struct bio_vec bv;
        struct bvec_iter iter;
index 8e704b4a6ffd4fbfbf1a0ee720edcb73fac59b24..0128daba5970f122e715e4ed7eb2c7c5e177a8dd 100644 (file)
@@ -547,7 +547,7 @@ do {                                                                        \
 
 size_t bch2_rand_range(size_t);
 
-void memcpy_to_bio(struct bio *, struct bvec_iter, void *);
+void memcpy_to_bio(struct bio *, struct bvec_iter, const void *);
 void memcpy_from_bio(void *, struct bio *, struct bvec_iter);
 
 static inline void memcpy_u64s_small(void *dst, const void *src,