]> git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to fcf8a0889c bcachefs: bch2_alloc_write() should be writing...
authorKent Overstreet <kent.overstreet@gmail.com>
Fri, 8 Jan 2021 00:49:15 +0000 (19:49 -0500)
committerKent Overstreet <kent.overstreet@gmail.com>
Sat, 9 Jan 2021 02:33:27 +0000 (21:33 -0500)
40 files changed:
.bcachefs_revision
cmd_migrate.c
libbcachefs/alloc_background.c
libbcachefs/alloc_foreground.c
libbcachefs/alloc_types.h
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/bkey_buf.h [new file with mode: 0644]
libbcachefs/bkey_on_stack.h [deleted file]
libbcachefs/bkey_sort.c
libbcachefs/btree_cache.c
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_iter.c
libbcachefs/btree_key_cache.c
libbcachefs/btree_types.h
libbcachefs/btree_update.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_leaf.c
libbcachefs/buckets.c
libbcachefs/compress.c
libbcachefs/ec.c
libbcachefs/ec.h
libbcachefs/extent_update.c
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/fs-io.c
libbcachefs/fs.c
libbcachefs/fsck.c
libbcachefs/io.c
libbcachefs/io.h
libbcachefs/journal.c
libbcachefs/journal_io.c
libbcachefs/journal_types.h
libbcachefs/migrate.c
libbcachefs/move.c
libbcachefs/movinggc.c
libbcachefs/recovery.c
libbcachefs/reflink.c
libbcachefs/sysfs.c

index 6bdc42aaf14a0698d952ed8c525a85c1fdde1a63..14540446bad48ed2e08b726af60052114192ff29 100644 (file)
@@ -1 +1 @@
-5241335413ef160e309fd41ab909532fec656a3a
+fcf8a0889c125511ae841960c73df62237ab05a7
index 42fbc2bc5feabb60b4907bff80941600c4dbee2b..40d72671d46a686a49575201129f5d2fb67ff37d 100644 (file)
@@ -301,7 +301,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst,
 
        while (length) {
                struct bkey_i_extent *e;
-               BKEY_PADDED(k) k;
+               __BKEY_PADDED(k, BKEY_EXTENT_VAL_U64s_MAX) k;
                u64 b = sector_to_bucket(ca, physical);
                struct disk_reservation res;
                unsigned sectors;
index 62ca9b7aaefa10a2af9d1f2177d797f7f36145f0..60c2c38bb4b03e959a4375838ba8b1ca23f764cf 100644 (file)
@@ -319,9 +319,7 @@ retry:
        bch2_trans_update(trans, iter, &a->k_i,
                          BTREE_TRIGGER_NORUN);
        ret = bch2_trans_commit(trans, NULL, NULL,
-                               BTREE_INSERT_NOFAIL|
-                               BTREE_INSERT_USE_RESERVE|
-                               flags);
+                               BTREE_INSERT_NOFAIL|flags);
 err:
        if (ret == -EINTR)
                goto retry;
@@ -368,7 +366,7 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags)
        unsigned i;
        int ret = 0;
 
-       for_each_rw_member(ca, c, i) {
+       for_each_member_device(ca, c, i) {
                bch2_dev_alloc_write(c, ca, flags);
                if (ret) {
                        percpu_ref_put(&ca->io_ref);
@@ -575,8 +573,7 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
 
                if (available > fifo_free(&ca->free_inc) ||
                    (available &&
-                    (!fifo_full(&ca->free[RESERVE_BTREE]) ||
-                     !fifo_full(&ca->free[RESERVE_MOVINGGC]))))
+                    !fifo_full(&ca->free[RESERVE_MOVINGGC])))
                        break;
 
                up_read(&c->gc_lock);
@@ -977,8 +974,7 @@ retry:
                                BTREE_INSERT_NOUNLOCK|
                                BTREE_INSERT_NOCHECK_RW|
                                BTREE_INSERT_NOFAIL|
-                               BTREE_INSERT_USE_RESERVE|
-                               BTREE_INSERT_USE_ALLOC_RESERVE|
+                               BTREE_INSERT_JOURNAL_RESERVED|
                                flags);
        if (ret == -EINTR)
                goto retry;
index 7a92e3d532548a2219985381e38f2c4081794b1b..dcbe04040a39c913e199e33f97e27057b05ef881 100644 (file)
@@ -204,9 +204,10 @@ success:
 static inline unsigned open_buckets_reserved(enum alloc_reserve reserve)
 {
        switch (reserve) {
-       case RESERVE_ALLOC:
-               return 0;
        case RESERVE_BTREE:
+       case RESERVE_BTREE_MOVINGGC:
+               return 0;
+       case RESERVE_MOVINGGC:
                return OPEN_BUCKETS_COUNT / 4;
        default:
                return OPEN_BUCKETS_COUNT / 2;
@@ -263,16 +264,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
                goto out;
 
        switch (reserve) {
-       case RESERVE_ALLOC:
-               if (fifo_pop(&ca->free[RESERVE_BTREE], bucket))
-                       goto out;
-               break;
-       case RESERVE_BTREE:
-               if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >=
-                   ca->free[RESERVE_BTREE].size &&
-                   fifo_pop(&ca->free[RESERVE_BTREE], bucket))
-                       goto out;
-               break;
+       case RESERVE_BTREE_MOVINGGC:
        case RESERVE_MOVINGGC:
                if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket))
                        goto out;
@@ -458,16 +450,18 @@ bch2_bucket_alloc_set(struct bch_fs *c,
  * it's to a device we don't want:
  */
 
-static void bucket_alloc_from_stripe(struct bch_fs *c,
-                                    struct open_buckets *ptrs,
-                                    struct write_point *wp,
-                                    struct bch_devs_mask *devs_may_alloc,
-                                    u16 target,
-                                    unsigned erasure_code,
-                                    unsigned nr_replicas,
-                                    unsigned *nr_effective,
-                                    bool *have_cache,
-                                    unsigned flags)
+static enum bucket_alloc_ret
+bucket_alloc_from_stripe(struct bch_fs *c,
+                        struct open_buckets *ptrs,
+                        struct write_point *wp,
+                        struct bch_devs_mask *devs_may_alloc,
+                        u16 target,
+                        unsigned erasure_code,
+                        unsigned nr_replicas,
+                        unsigned *nr_effective,
+                        bool *have_cache,
+                        unsigned flags,
+                        struct closure *cl)
 {
        struct dev_alloc_list devs_sorted;
        struct ec_stripe_head *h;
@@ -476,17 +470,21 @@ static void bucket_alloc_from_stripe(struct bch_fs *c,
        unsigned i, ec_idx;
 
        if (!erasure_code)
-               return;
+               return 0;
 
        if (nr_replicas < 2)
-               return;
+               return 0;
 
        if (ec_open_bucket(c, ptrs))
-               return;
+               return 0;
 
-       h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1);
+       h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1,
+                                   wp == &c->copygc_write_point,
+                                   cl);
+       if (IS_ERR(h))
+               return -PTR_ERR(h);
        if (!h)
-               return;
+               return 0;
 
        devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
 
@@ -508,6 +506,7 @@ got_bucket:
        atomic_inc(&h->s->pin);
 out_put_head:
        bch2_ec_stripe_head_put(c, h);
+       return 0;
 }
 
 /* Sector allocator */
@@ -585,10 +584,13 @@ open_bucket_add_buckets(struct bch_fs *c,
                }
 
                if (!ec_open_bucket(c, ptrs)) {
-                       bucket_alloc_from_stripe(c, ptrs, wp, &devs,
+                       ret = bucket_alloc_from_stripe(c, ptrs, wp, &devs,
                                                 target, erasure_code,
                                                 nr_replicas, nr_effective,
-                                                have_cache, flags);
+                                                have_cache, flags, _cl);
+                       if (ret == FREELIST_EMPTY ||
+                           ret == OPEN_BUCKETS_EMPTY)
+                               return ret;
                        if (*nr_effective >= nr_replicas)
                                return 0;
                }
index 20705460bb0aa10ef24dc1910716ea78493e7074..1abfff5290bc52e5ef263558a3dfca7287751a42 100644 (file)
@@ -34,14 +34,12 @@ struct bucket_clock {
        struct mutex            lock;
 };
 
-/* There is one reserve for each type of btree, one for prios and gens
- * and one for moving GC */
 enum alloc_reserve {
-       RESERVE_ALLOC           = -1,
-       RESERVE_BTREE           = 0,
-       RESERVE_MOVINGGC        = 1,
-       RESERVE_NONE            = 2,
-       RESERVE_NR              = 3,
+       RESERVE_BTREE_MOVINGGC  = -2,
+       RESERVE_BTREE           = -1,
+       RESERVE_MOVINGGC        = 0,
+       RESERVE_NONE            = 1,
+       RESERVE_NR              = 2,
 };
 
 typedef FIFO(long)     alloc_fifo;
@@ -89,7 +87,6 @@ struct write_point {
        u64                     last_used;
        unsigned long           write_point;
        enum bch_data_type      type;
-       bool                    is_ec;
 
        /* calculated based on how many pointers we're actually going to use: */
        unsigned                sectors_free;
index eb5b4080477388f7d2b0dde43259b70b488f4b96..505777ba8b54a40e7a724350704473e1d4f77868 100644 (file)
@@ -510,7 +510,7 @@ enum {
 
        /* misc: */
        BCH_FS_FIXED_GENS,
-       BCH_FS_ALLOC_WRITTEN,
+       BCH_FS_NEED_ALLOC_WRITE,
        BCH_FS_REBUILD_REPLICAS,
        BCH_FS_HOLD_BTREE_WRITES,
 };
index 9f59c6b3a25e023440e45dfc620df8c875a1bc87..307d5523a52d63a58ccd68b701857b06d21e0e19 100644 (file)
@@ -634,8 +634,6 @@ struct bch_reservation {
 #define BKEY_EXTENT_VAL_U64s_MAX                               \
        (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
 
-#define BKEY_PADDED(key)       __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX)
-
 /* * Maximum possible size of an entire extent, key + value: */
 #define BKEY_EXTENT_U64s_MAX           (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
 
diff --git a/libbcachefs/bkey_buf.h b/libbcachefs/bkey_buf.h
new file mode 100644 (file)
index 0000000..0d7c67a
--- /dev/null
@@ -0,0 +1,60 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_BUF_H
+#define _BCACHEFS_BKEY_BUF_H
+
+#include "bcachefs.h"
+
+struct bkey_buf {
+       struct bkey_i   *k;
+       u64             onstack[12];
+};
+
+static inline void bch2_bkey_buf_realloc(struct bkey_buf *s,
+                                        struct bch_fs *c, unsigned u64s)
+{
+       if (s->k == (void *) s->onstack &&
+           u64s > ARRAY_SIZE(s->onstack)) {
+               s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS);
+               memcpy(s->k, s->onstack, sizeof(s->onstack));
+       }
+}
+
+static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s,
+                                           struct bch_fs *c,
+                                           struct bkey_s_c k)
+{
+       bch2_bkey_buf_realloc(s, c, k.k->u64s);
+       bkey_reassemble(s->k, k);
+}
+
+static inline void bch2_bkey_buf_copy(struct bkey_buf *s,
+                                     struct bch_fs *c,
+                                     struct bkey_i *src)
+{
+       bch2_bkey_buf_realloc(s, c, src->k.u64s);
+       bkey_copy(s->k, src);
+}
+
+static inline void bch2_bkey_buf_unpack(struct bkey_buf *s,
+                                       struct bch_fs *c,
+                                       struct btree *b,
+                                       struct bkey_packed *src)
+{
+       bch2_bkey_buf_realloc(s, c, BKEY_U64s +
+                             bkeyp_val_u64s(&b->format, src));
+       bch2_bkey_unpack(b, s->k, src);
+}
+
+static inline void bch2_bkey_buf_init(struct bkey_buf *s)
+{
+       s->k = (void *) s->onstack;
+}
+
+static inline void bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c)
+{
+       if (s->k != (void *) s->onstack)
+               mempool_free(s->k, &c->large_bkey_pool);
+       s->k = NULL;
+}
+
+#endif /* _BCACHEFS_BKEY_BUF_H */
diff --git a/libbcachefs/bkey_on_stack.h b/libbcachefs/bkey_on_stack.h
deleted file mode 100644 (file)
index f607a0c..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_BKEY_ON_STACK_H
-#define _BCACHEFS_BKEY_ON_STACK_H
-
-#include "bcachefs.h"
-
-struct bkey_on_stack {
-       struct bkey_i   *k;
-       u64             onstack[12];
-};
-
-static inline void bkey_on_stack_realloc(struct bkey_on_stack *s,
-                                        struct bch_fs *c, unsigned u64s)
-{
-       if (s->k == (void *) s->onstack &&
-           u64s > ARRAY_SIZE(s->onstack)) {
-               s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS);
-               memcpy(s->k, s->onstack, sizeof(s->onstack));
-       }
-}
-
-static inline void bkey_on_stack_reassemble(struct bkey_on_stack *s,
-                                           struct bch_fs *c,
-                                           struct bkey_s_c k)
-{
-       bkey_on_stack_realloc(s, c, k.k->u64s);
-       bkey_reassemble(s->k, k);
-}
-
-static inline void bkey_on_stack_init(struct bkey_on_stack *s)
-{
-       s->k = (void *) s->onstack;
-}
-
-static inline void bkey_on_stack_exit(struct bkey_on_stack *s,
-                                     struct bch_fs *c)
-{
-       if (s->k != (void *) s->onstack)
-               mempool_free(s->k, &c->large_bkey_pool);
-       s->k = NULL;
-}
-
-#endif /* _BCACHEFS_BKEY_ON_STACK_H */
index 99e0a4011faeeefc497d204a00362111676b5366..2e1d9cd65f430939bd109e3334b95fba0a3e5dde 100644 (file)
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
 #include "bkey_sort.h"
 #include "bset.h"
 #include "extents.h"
@@ -187,11 +187,11 @@ bch2_sort_repack_merge(struct bch_fs *c,
                       bool filter_whiteouts)
 {
        struct bkey_packed *out = vstruct_last(dst), *k_packed;
-       struct bkey_on_stack k;
+       struct bkey_buf k;
        struct btree_nr_keys nr;
 
        memset(&nr, 0, sizeof(nr));
-       bkey_on_stack_init(&k);
+       bch2_bkey_buf_init(&k);
 
        while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) {
                if (filter_whiteouts && bkey_whiteout(k_packed))
@@ -204,7 +204,7 @@ bch2_sort_repack_merge(struct bch_fs *c,
                 * node; we have to make a copy of the entire key before calling
                 * normalize
                 */
-               bkey_on_stack_realloc(&k, c, k_packed->u64s + BKEY_U64s);
+               bch2_bkey_buf_realloc(&k, c, k_packed->u64s + BKEY_U64s);
                bch2_bkey_unpack(src, k.k, k_packed);
 
                if (filter_whiteouts &&
@@ -215,7 +215,7 @@ bch2_sort_repack_merge(struct bch_fs *c,
        }
 
        dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
-       bkey_on_stack_exit(&k, c);
+       bch2_bkey_buf_exit(&k, c);
        return nr;
 }
 
@@ -315,11 +315,11 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
        struct bkey l_unpacked, r_unpacked;
        struct bkey_s l, r;
        struct btree_nr_keys nr;
-       struct bkey_on_stack split;
+       struct bkey_buf split;
        unsigned i;
 
        memset(&nr, 0, sizeof(nr));
-       bkey_on_stack_init(&split);
+       bch2_bkey_buf_init(&split);
 
        sort_iter_sort(iter, extent_sort_fix_overlapping_cmp);
        for (i = 0; i < iter->used;) {
@@ -379,7 +379,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
                        /*
                         * r wins, but it overlaps in the middle of l - split l:
                         */
-                       bkey_on_stack_reassemble(&split, c, l.s_c);
+                       bch2_bkey_buf_reassemble(&split, c, l.s_c);
                        bch2_cut_back(bkey_start_pos(r.k), split.k);
 
                        bch2_cut_front_s(r.k->p, l);
@@ -398,7 +398,7 @@ bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst,
 
        dst->u64s = cpu_to_le16((u64 *) out - dst->_data);
 
-       bkey_on_stack_exit(&split, c);
+       bch2_bkey_buf_exit(&split, c);
        return nr;
 }
 
index 09774f56f11c38c9aa36c22518567cc1b909baba..fda6540be0359fb8cd8ff3e69b1a4fb15905a33c 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "bkey_buf.h"
 #include "btree_cache.h"
 #include "btree_io.h"
 #include "btree_iter.h"
@@ -898,10 +899,12 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
        struct btree *parent;
        struct btree_node_iter node_iter;
        struct bkey_packed *k;
-       BKEY_PADDED(k) tmp;
+       struct bkey_buf tmp;
        struct btree *ret = NULL;
        unsigned level = b->c.level;
 
+       bch2_bkey_buf_init(&tmp);
+
        parent = btree_iter_node(iter, level + 1);
        if (!parent)
                return NULL;
@@ -935,9 +938,9 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
        if (!k)
                goto out;
 
-       bch2_bkey_unpack(parent, &tmp.k, k);
+       bch2_bkey_buf_unpack(&tmp, c, parent, k);
 
-       ret = bch2_btree_node_get(c, iter, &tmp.k, level,
+       ret = bch2_btree_node_get(c, iter, tmp.k, level,
                                  SIX_LOCK_intent, _THIS_IP_);
 
        if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) {
@@ -957,7 +960,7 @@ struct btree *bch2_btree_node_get_sibling(struct bch_fs *c,
                if (sib == btree_prev_sib)
                        btree_node_unlock(iter, level);
 
-               ret = bch2_btree_node_get(c, iter, &tmp.k, level,
+               ret = bch2_btree_node_get(c, iter, tmp.k, level,
                                          SIX_LOCK_intent, _THIS_IP_);
 
                /*
@@ -998,6 +1001,8 @@ out:
 
        bch2_btree_trans_verify_locks(trans);
 
+       bch2_bkey_buf_exit(&tmp, c);
+
        return ret;
 }
 
index 6268ea637d19f7a717b1a3fafd776bd306643011..6b06f60799086167a0e643920b1a6430eb779808 100644 (file)
@@ -8,7 +8,7 @@
 #include "alloc_background.h"
 #include "alloc_foreground.h"
 #include "bkey_methods.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
 #include "btree_locking.h"
 #include "btree_update_interior.h"
 #include "btree_io.h"
@@ -132,6 +132,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
                                        ptr->gen)) {
                                g2->_mark.gen   = g->_mark.gen          = ptr->gen;
                                g2->gen_valid   = g->gen_valid          = true;
+                               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
                        }
 
                        if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
@@ -145,6 +146,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
                                g2->_mark.dirty_sectors         = 0;
                                g2->_mark.cached_sectors        = 0;
                                set_bit(BCH_FS_FIXED_GENS, &c->flags);
+                               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
                        }
                }
        }
@@ -233,7 +235,6 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
                        if (max_stale > 64)
                                bch2_btree_node_rewrite(c, iter,
                                                b->data->keys.seq,
-                                               BTREE_INSERT_USE_RESERVE|
                                                BTREE_INSERT_NOWAIT|
                                                BTREE_INSERT_GC_LOCK_HELD);
                        else if (!bch2_btree_gc_rewrite_disabled &&
@@ -268,10 +269,12 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
        struct btree_and_journal_iter iter;
        struct bkey_s_c k;
        struct bpos next_node_start = b->data->min_key;
+       struct bkey_buf tmp;
        u8 max_stale = 0;
        int ret = 0;
 
        bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b);
+       bch2_bkey_buf_init(&tmp);
 
        while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
                bch2_bkey_debugcheck(c, b, k);
@@ -285,10 +288,9 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
 
                if (b->c.level) {
                        struct btree *child;
-                       BKEY_PADDED(k) tmp;
 
-                       bkey_reassemble(&tmp.k, k);
-                       k = bkey_i_to_s_c(&tmp.k);
+                       bch2_bkey_buf_reassemble(&tmp, c, k);
+                       k = bkey_i_to_s_c(tmp.k);
 
                        bch2_btree_and_journal_iter_advance(&iter);
 
@@ -300,7 +302,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
                                break;
 
                        if (b->c.level > target_depth) {
-                               child = bch2_btree_node_get_noiter(c, &tmp.k,
+                               child = bch2_btree_node_get_noiter(c, tmp.k,
                                                        b->c.btree_id, b->c.level - 1);
                                ret = PTR_ERR_OR_ZERO(child);
                                if (ret)
@@ -318,6 +320,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
                }
        }
 
+       bch2_bkey_buf_exit(&tmp, c);
        return ret;
 }
 
@@ -570,7 +573,7 @@ static int bch2_gc_done(struct bch_fs *c,
                        fsck_err(c, _msg ": got %llu, should be %llu"   \
                                , ##__VA_ARGS__, dst->_f, src->_f);     \
                dst->_f = src->_f;                                      \
-               ret = 1;                                                \
+               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);            \
        }
 #define copy_stripe_field(_f, _msg, ...)                               \
        if (dst->_f != src->_f) {                                       \
@@ -581,7 +584,7 @@ static int bch2_gc_done(struct bch_fs *c,
                                dst->_f, src->_f);                      \
                dst->_f = src->_f;                                      \
                dst->dirty = true;                                      \
-               ret = 1;                                                \
+               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);            \
        }
 #define copy_bucket_field(_f)                                          \
        if (dst->b[b].mark._f != src->b[b].mark._f) {                   \
@@ -592,7 +595,7 @@ static int bch2_gc_done(struct bch_fs *c,
                                bch2_data_types[dst->b[b].mark.data_type],\
                                dst->b[b].mark._f, src->b[b].mark._f);  \
                dst->b[b]._mark._f = src->b[b].mark._f;                 \
-               ret = 1;                                                \
+               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);            \
        }
 #define copy_dev_field(_f, _msg, ...)                                  \
        copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
@@ -930,10 +933,10 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
-       struct bkey_on_stack sk;
+       struct bkey_buf sk;
        int ret = 0;
 
-       bkey_on_stack_init(&sk);
+       bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, 0, 0);
 
        iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
@@ -942,7 +945,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
        while ((k = bch2_btree_iter_peek(iter)).k &&
               !(ret = bkey_err(k))) {
                if (gc_btree_gens_key(c, k)) {
-                       bkey_on_stack_reassemble(&sk, c, k);
+                       bch2_bkey_buf_reassemble(&sk, c, k);
                        bch2_extent_normalize(c, bkey_i_to_s(sk.k));
 
                        bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
@@ -962,7 +965,7 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
        }
 
        bch2_trans_exit(&trans);
-       bkey_on_stack_exit(&sk, c);
+       bch2_bkey_buf_exit(&sk, c);
 
        return ret;
 }
@@ -1074,7 +1077,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
                }
 
        if (bch2_keylist_realloc(&keylist, NULL, 0,
-                       (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) {
+                       BKEY_BTREE_PTR_U64s_MAX * nr_old_nodes)) {
                trace_btree_gc_coalesce_fail(c,
                                BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC);
                return;
index 4dde972d353a3f917bf084df250ded7471562fb3..768fc85eaa4e4c32d4d7508ca27670a6e2b710b1 100644 (file)
@@ -1320,12 +1320,13 @@ static void bch2_btree_node_write_error(struct bch_fs *c,
                                        struct btree_write_bio *wbio)
 {
        struct btree *b         = wbio->wbio.bio.bi_private;
-       __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
+       struct bkey_buf k;
        struct bch_extent_ptr *ptr;
        struct btree_trans trans;
        struct btree_iter *iter;
        int ret;
 
+       bch2_bkey_buf_init(&k);
        bch2_trans_init(&trans, c, 0, 0);
 
        iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p,
@@ -1344,21 +1345,22 @@ retry:
 
        BUG_ON(!btree_node_hashed(b));
 
-       bkey_copy(&tmp.k, &b->key);
+       bch2_bkey_buf_copy(&k, c, &b->key);
 
-       bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr,
+       bch2_bkey_drop_ptrs(bkey_i_to_s(k.k), ptr,
                bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
 
-       if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&tmp.k)))
+       if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(k.k)))
                goto err;
 
-       ret = bch2_btree_node_update_key(c, iter, b, &tmp.k);
+       ret = bch2_btree_node_update_key(c, iter, b, k.k);
        if (ret == -EINTR)
                goto retry;
        if (ret)
                goto err;
 out:
        bch2_trans_exit(&trans);
+       bch2_bkey_buf_exit(&k, c);
        bio_put(&wbio->wbio.bio);
        btree_node_write_done(c, b);
        return;
@@ -1476,7 +1478,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
        struct bset *i;
        struct btree_node *bn = NULL;
        struct btree_node_entry *bne = NULL;
-       BKEY_PADDED(key) k;
+       struct bkey_buf k;
        struct bch_extent_ptr *ptr;
        struct sort_iter sort_iter;
        struct nonce nonce;
@@ -1487,6 +1489,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
        bool validate_before_checksum = false;
        void *data;
 
+       bch2_bkey_buf_init(&k);
+
        if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
                return;
 
@@ -1695,15 +1699,16 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
         * just make all btree node writes FUA to keep things sane.
         */
 
-       bkey_copy(&k.key, &b->key);
+       bch2_bkey_buf_copy(&k, c, &b->key);
 
-       bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&k.key)), ptr)
+       bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(k.k)), ptr)
                ptr->offset += b->written;
 
        b->written += sectors_to_write;
 
        /* XXX: submitting IO with btree locks held: */
-       bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, &k.key);
+       bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, k.k);
+       bch2_bkey_buf_exit(&k, c);
        return;
 err:
        set_btree_node_noevict(b);
index 8c35e39ea97fce2a33bc750cc3aa8015ece7c9ea..4d825cac22ce10cfeedc58438599d04ab611492c 100644 (file)
@@ -2,6 +2,7 @@
 
 #include "bcachefs.h"
 #include "bkey_methods.h"
+#include "bkey_buf.h"
 #include "btree_cache.h"
 #include "btree_iter.h"
 #include "btree_key_cache.h"
@@ -1048,27 +1049,31 @@ static void btree_iter_prefetch(struct btree_iter *iter)
        struct btree_iter_level *l = &iter->l[iter->level];
        struct btree_node_iter node_iter = l->iter;
        struct bkey_packed *k;
-       BKEY_PADDED(k) tmp;
+       struct bkey_buf tmp;
        unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
                ? (iter->level > 1 ? 0 :  2)
                : (iter->level > 1 ? 1 : 16);
        bool was_locked = btree_node_locked(iter, iter->level);
 
+       bch2_bkey_buf_init(&tmp);
+
        while (nr) {
                if (!bch2_btree_node_relock(iter, iter->level))
-                       return;
+                       break;
 
                bch2_btree_node_iter_advance(&node_iter, l->b);
                k = bch2_btree_node_iter_peek(&node_iter, l->b);
                if (!k)
                        break;
 
-               bch2_bkey_unpack(l->b, &tmp.k, k);
-               bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1);
+               bch2_bkey_buf_unpack(&tmp, c, l->b, k);
+               bch2_btree_node_prefetch(c, iter, tmp.k, iter->level - 1);
        }
 
        if (!was_locked)
                btree_node_unlock(iter, iter->level);
+
+       bch2_bkey_buf_exit(&tmp, c);
 }
 
 static noinline void btree_node_mem_ptr_set(struct btree_iter *iter,
@@ -1100,30 +1105,34 @@ static __always_inline int btree_iter_down(struct btree_iter *iter,
        struct btree *b;
        unsigned level = iter->level - 1;
        enum six_lock_type lock_type = __btree_lock_want(iter, level);
-       BKEY_PADDED(k) tmp;
+       struct bkey_buf tmp;
+       int ret;
 
        EBUG_ON(!btree_node_locked(iter, iter->level));
 
-       bch2_bkey_unpack(l->b, &tmp.k,
+       bch2_bkey_buf_init(&tmp);
+       bch2_bkey_buf_unpack(&tmp, c, l->b,
                         bch2_btree_node_iter_peek(&l->iter, l->b));
 
-       b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type, trace_ip);
-       if (unlikely(IS_ERR(b)))
-               return PTR_ERR(b);
+       b = bch2_btree_node_get(c, iter, tmp.k, level, lock_type, trace_ip);
+       ret = PTR_ERR_OR_ZERO(b);
+       if (unlikely(ret))
+               goto err;
 
        mark_btree_node_locked(iter, level, lock_type);
        btree_iter_node_set(iter, b);
 
-       if (tmp.k.k.type == KEY_TYPE_btree_ptr_v2 &&
-           unlikely(b != btree_node_mem_ptr(&tmp.k)))
+       if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 &&
+           unlikely(b != btree_node_mem_ptr(tmp.k)))
                btree_node_mem_ptr_set(iter, level + 1, b);
 
        if (iter->flags & BTREE_ITER_PREFETCH)
                btree_iter_prefetch(iter);
 
        iter->level = level;
-
-       return 0;
+err:
+       bch2_bkey_buf_exit(&tmp, c);
+       return ret;
 }
 
 static void btree_iter_up(struct btree_iter *iter)
@@ -2124,9 +2133,12 @@ static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
        iter->flags &= ~BTREE_ITER_USER_FLAGS;
        iter->flags |= flags & BTREE_ITER_USER_FLAGS;
 
-       if (iter->flags & BTREE_ITER_INTENT)
-               bch2_btree_iter_upgrade(iter, 1);
-       else
+       if (iter->flags & BTREE_ITER_INTENT) {
+               if (!iter->locks_want) {
+                       __bch2_btree_iter_unlock(iter);
+                       iter->locks_want = 1;
+               }
+       } else
                bch2_btree_iter_downgrade(iter);
 
        BUG_ON(iter->btree_id != btree_id);
index 1a557b753bc1ed8a03d6fa01cadf0052adb8a050..4357aefdb668fd79b01ae2f7b588c543a42b19bb 100644 (file)
@@ -349,8 +349,6 @@ retry:
                                  BTREE_INSERT_NOUNLOCK|
                                  BTREE_INSERT_NOCHECK_RW|
                                  BTREE_INSERT_NOFAIL|
-                                 BTREE_INSERT_USE_RESERVE|
-                                 BTREE_INSERT_USE_ALLOC_RESERVE|
                                  BTREE_INSERT_JOURNAL_RESERVED|
                                  BTREE_INSERT_JOURNAL_RECLAIM);
 err:
index dc7de27112c66d2c7b4c707eb99b767fd479b101..631bf4694f4dd8fee7f6b13c373b7f8da6892877 100644 (file)
@@ -57,7 +57,7 @@ struct btree_write {
 
 struct btree_alloc {
        struct open_buckets     ob;
-       BKEY_PADDED(k);
+       __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX);
 };
 
 struct btree_bkey_cached_common {
index adb07043cbb3d7f232711c08bc721e5e6c57bc7a..a251380801692fc232ff2944e7d9d127e6b415e1 100644 (file)
@@ -20,7 +20,6 @@ enum btree_insert_flags {
        __BTREE_INSERT_NOCHECK_RW,
        __BTREE_INSERT_LAZY_RW,
        __BTREE_INSERT_USE_RESERVE,
-       __BTREE_INSERT_USE_ALLOC_RESERVE,
        __BTREE_INSERT_JOURNAL_REPLAY,
        __BTREE_INSERT_JOURNAL_RESERVED,
        __BTREE_INSERT_JOURNAL_RECLAIM,
@@ -43,7 +42,6 @@ enum btree_insert_flags {
 
 /* for copygc, or when merging btree nodes */
 #define BTREE_INSERT_USE_RESERVE       (1 << __BTREE_INSERT_USE_RESERVE)
-#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE)
 
 /* Insert is for journal replay - don't get journal reservations: */
 #define BTREE_INSERT_JOURNAL_REPLAY    (1 << __BTREE_INSERT_JOURNAL_REPLAY)
index 8f96756ba648f4718d9e7badefd54c0c0b07a4c4..5bb653298c6ca9b3b52a4e60cda4a276afe2c482 100644 (file)
@@ -195,21 +195,18 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
 {
        struct write_point *wp;
        struct btree *b;
-       BKEY_PADDED(k) tmp;
+       __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
        struct open_buckets ob = { .nr = 0 };
        struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
        unsigned nr_reserve;
        enum alloc_reserve alloc_reserve;
 
-       if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) {
+       if (flags & BTREE_INSERT_USE_RESERVE) {
                nr_reserve      = 0;
-               alloc_reserve   = RESERVE_ALLOC;
-       } else if (flags & BTREE_INSERT_USE_RESERVE) {
-               nr_reserve      = BTREE_NODE_RESERVE / 2;
-               alloc_reserve   = RESERVE_BTREE;
+               alloc_reserve   = RESERVE_BTREE_MOVINGGC;
        } else {
                nr_reserve      = BTREE_NODE_RESERVE;
-               alloc_reserve   = RESERVE_NONE;
+               alloc_reserve   = RESERVE_BTREE;
        }
 
        mutex_lock(&c->btree_reserve_cache_lock);
@@ -577,8 +574,6 @@ static void btree_update_nodes_written(struct btree_update *as)
        bch2_trans_init(&trans, c, 0, 512);
        ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq,
                              BTREE_INSERT_NOFAIL|
-                             BTREE_INSERT_USE_RESERVE|
-                             BTREE_INSERT_USE_ALLOC_RESERVE|
                              BTREE_INSERT_NOCHECK_RW|
                              BTREE_INSERT_JOURNAL_RECLAIM|
                              BTREE_INSERT_JOURNAL_RESERVED,
@@ -1232,6 +1227,9 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
                src = n;
        }
 
+       /* Also clear out the unwritten whiteouts area: */
+       b->whiteout_u64s = 0;
+
        i->u64s = cpu_to_le16((u64 *) dst - i->_data);
        set_btree_bset_end(b, b->set);
 
@@ -1457,15 +1455,6 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter,
        struct btree_update *as;
        struct closure cl;
        int ret = 0;
-       struct btree_insert_entry *i;
-
-       /*
-        * We already have a disk reservation and open buckets pinned; this
-        * allocation must not block:
-        */
-       trans_for_each_update(trans, i)
-               if (btree_node_type_needs_gc(i->iter->btree_id))
-                       flags |= BTREE_INSERT_USE_RESERVE;
 
        closure_init_stack(&cl);
 
@@ -1926,10 +1915,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
 retry:
        as = bch2_btree_update_start(iter->trans, iter->btree_id,
                parent ? btree_update_reserve_required(c, parent) : 0,
-               BTREE_INSERT_NOFAIL|
-               BTREE_INSERT_USE_RESERVE|
-               BTREE_INSERT_USE_ALLOC_RESERVE,
-               &cl);
+               BTREE_INSERT_NOFAIL, &cl);
 
        if (IS_ERR(as)) {
                ret = PTR_ERR(as);
index 64734f9158c391ea9b84487cd7a573ba4188d44a..c490df4709ba18f724ae40651cc5eb69071542e0 100644 (file)
@@ -869,8 +869,8 @@ int __bch2_trans_commit(struct btree_trans *trans)
                trans_trigger_run = false;
 
                trans_for_each_update(trans, i) {
-                       if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK &&
-                                    (ret = bch2_btree_iter_traverse(i->iter)))) {
+                       ret = bch2_btree_iter_traverse(i->iter);
+                       if (unlikely(ret)) {
                                trace_trans_restart_traverse(trans->ip);
                                goto out;
                        }
@@ -879,8 +879,8 @@ int __bch2_trans_commit(struct btree_trans *trans)
                         * We're not using bch2_btree_iter_upgrade here because
                         * we know trans->nounlock can't be set:
                         */
-                       if (unlikely(i->iter->locks_want < 1 &&
-                                    !__bch2_btree_iter_upgrade(i->iter, 1))) {
+                       if (unlikely(!btree_node_intent_locked(i->iter, i->iter->level) &&
+                                    !__bch2_btree_iter_upgrade(i->iter, i->iter->level + 1))) {
                                trace_trans_restart_upgrade(trans->ip);
                                ret = -EINTR;
                                goto out;
@@ -1084,8 +1084,7 @@ int bch2_btree_delete_at(struct btree_trans *trans,
 
        bch2_trans_update(trans, iter, &k, 0);
        return bch2_trans_commit(trans, NULL, NULL,
-                                BTREE_INSERT_NOFAIL|
-                                BTREE_INSERT_USE_RESERVE|flags);
+                                BTREE_INSERT_NOFAIL|flags);
 }
 
 int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
index 1934b845ea15096ee0a5bfa6dd35ec0e70126378..8bbf958d64e41390fee3071e7a82d4b6708a5fdb 100644 (file)
@@ -2192,7 +2192,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
                             ca->mi.bucket_size / c->opts.btree_node_size);
        /* XXX: these should be tunable */
        size_t reserve_none     = max_t(size_t, 1, nbuckets >> 9);
-       size_t copygc_reserve   = max_t(size_t, 2, nbuckets >> 7);
+       size_t copygc_reserve   = max_t(size_t, 2, nbuckets >> 6);
        size_t free_inc_nr      = max(max_t(size_t, 1, nbuckets >> 12),
                                      btree_reserve * 2);
        bool resize = ca->buckets[0] != NULL;
@@ -2209,7 +2209,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
            !(buckets_nouse     = kvpmalloc(BITS_TO_LONGS(nbuckets) *
                                            sizeof(unsigned long),
                                            GFP_KERNEL|__GFP_ZERO)) ||
-           !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) ||
            !init_fifo(&free[RESERVE_MOVINGGC],
                       copygc_reserve, GFP_KERNEL) ||
            !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
index aebf46bb1d21e5fd269e94f3193c73a23f56cc2a..f63651d291e53de8737c52254fe222bf07512ed3 100644 (file)
@@ -336,8 +336,19 @@ static int attempt_compress(struct bch_fs *c,
                ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace,
                        ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams));
 
+               /*
+                * ZSTD requires that when we decompress we pass in the exact
+                * compressed size - rounding it up to the nearest sector
+                * doesn't work, so we use the first 4 bytes of the buffer for
+                * that.
+                *
+                * Additionally, the ZSTD code seems to have a bug where it will
+                * write just past the end of the buffer - so subtract a fudge
+                * factor (7 bytes) from the dst buffer size to account for
+                * that.
+                */
                size_t len = ZSTD_compressCCtx(ctx,
-                               dst + 4,        dst_len - 4,
+                               dst + 4,        dst_len - 4 - 7,
                                src,            src_len,
                                c->zstd_params);
                if (ZSTD_isError(len))
index 8f39c4de6672a49702f1eef235e24d952dc629a5..1c08f563c2a254939d491fdb87db52c5ee1dabac 100644 (file)
@@ -4,7 +4,7 @@
 
 #include "bcachefs.h"
 #include "alloc_foreground.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
 #include "bset.h"
 #include "btree_gc.h"
 #include "btree_update.h"
@@ -200,6 +200,36 @@ static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx)
        return false;
 }
 
+/* Stripe bufs: */
+
+static void ec_stripe_buf_free(struct ec_stripe_buf *stripe)
+{
+       unsigned i;
+
+       for (i = 0; i < stripe->key.v.nr_blocks; i++) {
+               kvpfree(stripe->data[i], stripe->size << 9);
+               stripe->data[i] = NULL;
+       }
+}
+
+static int ec_stripe_buf_alloc(struct ec_stripe_buf *stripe)
+{
+       unsigned i;
+
+       memset(stripe->valid, 0xFF, sizeof(stripe->valid));
+
+       for (i = 0; i < stripe->key.v.nr_blocks; i++) {
+               stripe->data[i] = kvpmalloc(stripe->size << 9, GFP_KERNEL);
+               if (!stripe->data[i])
+                       goto err;
+       }
+
+       return 0;
+err:
+       ec_stripe_buf_free(stripe);
+       return -ENOMEM;
+}
+
 /* Checksumming: */
 
 static void ec_generate_checksums(struct ec_stripe_buf *buf)
@@ -287,14 +317,10 @@ static void ec_generate_ec(struct ec_stripe_buf *buf)
        raid_gen(nr_data, v->nr_redundant, bytes, buf->data);
 }
 
-static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr)
-{
-       return nr - bitmap_weight(buf->valid, nr);
-}
-
 static unsigned ec_nr_failed(struct ec_stripe_buf *buf)
 {
-       return __ec_nr_failed(buf, buf->key.v.nr_blocks);
+       return buf->key.v.nr_blocks -
+               bitmap_weight(buf->valid, buf->key.v.nr_blocks);
 }
 
 static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf)
@@ -757,10 +783,10 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
        struct btree_iter *iter;
        struct bkey_s_c k;
        struct bkey_s_extent e;
-       struct bkey_on_stack sk;
+       struct bkey_buf sk;
        int ret = 0, dev, idx;
 
-       bkey_on_stack_init(&sk);
+       bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
        /* XXX this doesn't support the reflink btree */
@@ -787,7 +813,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
 
                dev = s->key.v.ptrs[idx].dev;
 
-               bkey_on_stack_reassemble(&sk, c, k);
+               bch2_bkey_buf_reassemble(&sk, c, k);
                e = bkey_i_to_s_extent(sk.k);
 
                bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev);
@@ -800,8 +826,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
                bch2_trans_update(&trans, iter, sk.k, 0);
 
                ret = bch2_trans_commit(&trans, NULL, NULL,
-                                       BTREE_INSERT_NOFAIL|
-                                       BTREE_INSERT_USE_RESERVE);
+                                       BTREE_INSERT_NOFAIL);
                if (ret == -EINTR)
                        ret = 0;
                if (ret)
@@ -809,7 +834,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c,
        }
 
        bch2_trans_exit(&trans);
-       bkey_on_stack_exit(&sk, c);
+       bch2_bkey_buf_exit(&sk, c);
 
        return ret;
 }
@@ -823,14 +848,13 @@ static void ec_stripe_create(struct ec_stripe_new *s)
        struct open_bucket *ob;
        struct bkey_i *k;
        struct stripe *m;
-       struct bch_stripe *v = &s->stripe.key.v;
+       struct bch_stripe *v = &s->new_stripe.key.v;
        unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
-       struct closure cl;
        int ret;
 
        BUG_ON(s->h->s == s);
 
-       closure_init_stack(&cl);
+       closure_sync(&s->iodone);
 
        if (s->err) {
                if (s->err != -EROFS)
@@ -838,6 +862,22 @@ static void ec_stripe_create(struct ec_stripe_new *s)
                goto err;
        }
 
+       if (s->have_existing_stripe) {
+               ec_validate_checksums(c, &s->existing_stripe);
+
+               if (ec_do_recov(c, &s->existing_stripe)) {
+                       bch_err(c, "error creating stripe: error reading existing stripe");
+                       goto err;
+               }
+
+               for (i = 0; i < nr_data; i++)
+                       if (stripe_blockcount_get(&s->existing_stripe.key.v, i))
+                               swap(s->new_stripe.data[i],
+                                    s->existing_stripe.data[i]);
+
+               ec_stripe_buf_free(&s->existing_stripe);
+       }
+
        BUG_ON(!s->allocated);
 
        if (!percpu_ref_tryget(&c->writes))
@@ -846,33 +886,31 @@ static void ec_stripe_create(struct ec_stripe_new *s)
        BUG_ON(bitmap_weight(s->blocks_allocated,
                             s->blocks.nr) != s->blocks.nr);
 
-       ec_generate_ec(&s->stripe);
+       ec_generate_ec(&s->new_stripe);
 
-       ec_generate_checksums(&s->stripe);
+       ec_generate_checksums(&s->new_stripe);
 
        /* write p/q: */
        for (i = nr_data; i < v->nr_blocks; i++)
-               ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl);
-
-       closure_sync(&cl);
+               ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone);
+       closure_sync(&s->iodone);
 
-       for (i = nr_data; i < v->nr_blocks; i++)
-               if (!test_bit(i, s->stripe.valid)) {
-                       bch_err(c, "error creating stripe: error writing redundancy buckets");
-                       goto err_put_writes;
-               }
+       if (ec_nr_failed(&s->new_stripe)) {
+               bch_err(c, "error creating stripe: error writing redundancy buckets");
+               goto err_put_writes;
+       }
 
-       ret = s->existing_stripe
-               ? bch2_btree_insert(c, BTREE_ID_EC, &s->stripe.key.k_i,
+       ret = s->have_existing_stripe
+               ? bch2_btree_insert(c, BTREE_ID_EC, &s->new_stripe.key.k_i,
                                    &s->res, NULL, BTREE_INSERT_NOFAIL)
-               : ec_stripe_bkey_insert(c, s, &s->stripe.key);
+               : ec_stripe_bkey_insert(c, s, &s->new_stripe.key);
        if (ret) {
                bch_err(c, "error creating stripe: error creating stripe key");
                goto err_put_writes;
        }
 
        for_each_keylist_key(&s->keys, k) {
-               ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k);
+               ret = ec_stripe_update_ptrs(c, &s->new_stripe, &k->k);
                if (ret) {
                        bch_err(c, "error creating stripe: error %i updating pointers", ret);
                        break;
@@ -880,14 +918,14 @@ static void ec_stripe_create(struct ec_stripe_new *s)
        }
 
        spin_lock(&c->ec_stripes_heap_lock);
-       m = genradix_ptr(&c->stripes[0], s->stripe.key.k.p.offset);
+       m = genradix_ptr(&c->stripes[0], s->new_stripe.key.k.p.offset);
 #if 0
        pr_info("created a %s stripe %llu",
-               s->existing_stripe ? "existing" : "new",
+               s->have_existing_stripe ? "existing" : "new",
                s->stripe.key.k.p.offset);
 #endif
        BUG_ON(m->on_heap);
-       bch2_stripes_heap_insert(c, m, s->stripe.key.k.p.offset);
+       bch2_stripes_heap_insert(c, m, s->new_stripe.key.k.p.offset);
        spin_unlock(&c->ec_stripes_heap_lock);
 err_put_writes:
        percpu_ref_put(&c->writes);
@@ -903,8 +941,9 @@ err:
 
        bch2_keylist_free(&s->keys, s->inline_keys);
 
-       for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
-               kvpfree(s->stripe.data[i], s->stripe.size << 9);
+       ec_stripe_buf_free(&s->existing_stripe);
+       ec_stripe_buf_free(&s->new_stripe);
+       closure_debug_destroy(&s->iodone);
        kfree(s);
 }
 
@@ -981,7 +1020,7 @@ void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp)
        ca      = bch_dev_bkey_exists(c, ob->ptr.dev);
        offset  = ca->mi.bucket_size - ob->sectors_free;
 
-       return ob->ec->stripe.data[ob->ec_idx] + (offset << 9);
+       return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9);
 }
 
 void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp,
@@ -1088,7 +1127,6 @@ static void ec_stripe_key_init(struct bch_fs *c,
 static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
 {
        struct ec_stripe_new *s;
-       unsigned i;
 
        lockdep_assert_held(&h->lock);
 
@@ -1097,6 +1135,7 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
                return -ENOMEM;
 
        mutex_init(&s->lock);
+       closure_init(&s->iodone, NULL);
        atomic_set(&s->pin, 1);
        s->c            = c;
        s->h            = h;
@@ -1106,32 +1145,20 @@ static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
 
        bch2_keylist_init(&s->keys, s->inline_keys);
 
-       s->stripe.offset        = 0;
-       s->stripe.size          = h->blocksize;
-       memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid));
+       s->new_stripe.offset    = 0;
+       s->new_stripe.size      = h->blocksize;
 
-       ec_stripe_key_init(c, &s->stripe.key, s->nr_data,
+       ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data,
                           s->nr_parity, h->blocksize);
 
-       for (i = 0; i < s->stripe.key.v.nr_blocks; i++) {
-               s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL);
-               if (!s->stripe.data[i])
-                       goto err;
-       }
-
        h->s = s;
-
        return 0;
-err:
-       for (i = 0; i < s->stripe.key.v.nr_blocks; i++)
-               kvpfree(s->stripe.data[i], s->stripe.size << 9);
-       kfree(s);
-       return -ENOMEM;
 }
 
 static struct ec_stripe_head *
 ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
-                        unsigned algo, unsigned redundancy)
+                        unsigned algo, unsigned redundancy,
+                        bool copygc)
 {
        struct ec_stripe_head *h;
        struct bch_dev *ca;
@@ -1147,6 +1174,7 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
        h->target       = target;
        h->algo         = algo;
        h->redundancy   = redundancy;
+       h->copygc       = copygc;
 
        rcu_read_lock();
        h->devs = target_rw_devs(c, BCH_DATA_user, target);
@@ -1178,9 +1206,10 @@ void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h)
 }
 
 struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c,
-                                              unsigned target,
-                                              unsigned algo,
-                                              unsigned redundancy)
+                                                unsigned target,
+                                                unsigned algo,
+                                                unsigned redundancy,
+                                                bool copygc)
 {
        struct ec_stripe_head *h;
 
@@ -1191,21 +1220,21 @@ struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c,
        list_for_each_entry(h, &c->ec_stripe_head_list, list)
                if (h->target           == target &&
                    h->algo             == algo &&
-                   h->redundancy       == redundancy) {
+                   h->redundancy       == redundancy &&
+                   h->copygc           == copygc) {
                        mutex_lock(&h->lock);
                        goto found;
                }
 
-       h = ec_new_stripe_head_alloc(c, target, algo, redundancy);
+       h = ec_new_stripe_head_alloc(c, target, algo, redundancy, copygc);
 found:
        mutex_unlock(&c->ec_stripe_head_lock);
        return h;
 }
 
-/*
- * XXX: use a higher watermark for allocating open buckets here:
- */
-static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h)
+static enum bucket_alloc_ret
+new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h,
+                        struct closure *cl)
 {
        struct bch_devs_mask devs;
        struct open_bucket *ob;
@@ -1213,12 +1242,12 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h)
                min_t(unsigned, h->nr_active_devs,
                      BCH_BKEY_PTRS_MAX) - h->redundancy;
        bool have_cache = true;
-       int ret = 0;
+       enum bucket_alloc_ret ret = ALLOC_SUCCESS;
 
        devs = h->devs;
 
        for_each_set_bit(i, h->s->blocks_allocated, BCH_BKEY_PTRS_MAX) {
-               __clear_bit(h->s->stripe.key.v.ptrs[i].dev, devs.d);
+               __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d);
                --nr_data;
        }
 
@@ -1242,9 +1271,11 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h)
                                            h->redundancy,
                                            &nr_have,
                                            &have_cache,
-                                           RESERVE_NONE,
+                                           h->copygc
+                                           ? RESERVE_MOVINGGC
+                                           : RESERVE_NONE,
                                            0,
-                                           NULL);
+                                           cl);
                if (ret)
                        goto err;
        }
@@ -1258,9 +1289,11 @@ static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h)
                                            nr_data,
                                            &nr_have,
                                            &have_cache,
-                                           RESERVE_NONE,
+                                           h->copygc
+                                           ? RESERVE_MOVINGGC
+                                           : RESERVE_NONE,
                                            0,
-                                           NULL);
+                                           cl);
                if (ret)
                        goto err;
        }
@@ -1326,64 +1359,84 @@ static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *strip
 struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
                                               unsigned target,
                                               unsigned algo,
-                                              unsigned redundancy)
+                                              unsigned redundancy,
+                                              bool copygc,
+                                              struct closure *cl)
 {
-       struct closure cl;
        struct ec_stripe_head *h;
        struct open_bucket *ob;
        unsigned i, data_idx = 0;
        s64 idx;
        int ret;
 
-       closure_init_stack(&cl);
-
-       h = __bch2_ec_stripe_head_get(c, target, algo, redundancy);
-       if (!h)
+       h = __bch2_ec_stripe_head_get(c, target, algo, redundancy, copygc);
+       if (!h) {
+               bch_err(c, "no stripe head");
                return NULL;
+       }
 
        if (!h->s) {
                if (ec_new_stripe_alloc(c, h)) {
                        bch2_ec_stripe_head_put(c, h);
+                       bch_err(c, "failed to allocate new stripe");
                        return NULL;
                }
 
                idx = get_existing_stripe(c, target, algo, redundancy);
                if (idx >= 0) {
-                       h->s->existing_stripe = true;
-                       h->s->existing_stripe_idx = idx;
-                       if (get_stripe_key(c, idx, &h->s->stripe)) {
-                               /* btree error */
+                       h->s->have_existing_stripe = true;
+                       ret = get_stripe_key(c, idx, &h->s->existing_stripe);
+                       if (ret) {
+                               bch2_fs_fatal_error(c, "error reading stripe key: %i", ret);
+                               bch2_ec_stripe_head_put(c, h);
+                               return NULL;
+                       }
+
+                       if (ec_stripe_buf_alloc(&h->s->existing_stripe)) {
+                               /*
+                                * this is a problem: we have deleted from the
+                                * stripes heap already
+                                */
                                BUG();
                        }
 
-                       for (i = 0; i < h->s->stripe.key.v.nr_blocks; i++)
-                               if (stripe_blockcount_get(&h->s->stripe.key.v, i)) {
+                       for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) {
+                               if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i))
                                        __set_bit(i, h->s->blocks_allocated);
-                                       ec_block_io(c, &h->s->stripe, READ, i, &cl);
-                               }
+
+                               ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone);
+                       }
+
+                       bkey_copy(&h->s->new_stripe.key.k_i,
+                                 &h->s->existing_stripe.key.k_i);
+               }
+
+               if (ec_stripe_buf_alloc(&h->s->new_stripe)) {
+                       BUG();
                }
        }
 
        if (!h->s->allocated) {
-               if (!h->s->existing_stripe &&
+               if (!h->s->have_existing_stripe &&
                    !h->s->res.sectors) {
                        ret = bch2_disk_reservation_get(c, &h->s->res,
-                                                       h->blocksize,
-                                                       h->s->nr_parity, 0);
+                                       h->blocksize,
+                                       h->s->nr_parity, 0);
                        if (ret) {
-                               /* What should we do here? */
-                               bch_err(c, "unable to create new stripe: %i", ret);
+                               /*
+                                * This means we need to wait for copygc to
+                                * empty out buckets from existing stripes:
+                                */
                                bch2_ec_stripe_head_put(c, h);
                                h = NULL;
                                goto out;
-
                        }
-
                }
 
-               if (new_stripe_alloc_buckets(c, h)) {
+               ret = new_stripe_alloc_buckets(c, h, cl);
+               if (ret) {
                        bch2_ec_stripe_head_put(c, h);
-                       h = NULL;
+                       h = ERR_PTR(-ret);
                        goto out;
                }
 
@@ -1392,19 +1445,18 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c,
                                                      h->s->nr_data, data_idx);
                        BUG_ON(data_idx >= h->s->nr_data);
 
-                       h->s->stripe.key.v.ptrs[data_idx] = ob->ptr;
+                       h->s->new_stripe.key.v.ptrs[data_idx] = ob->ptr;
                        h->s->data_block_idx[i] = data_idx;
                        data_idx++;
                }
 
                open_bucket_for_each(c, &h->s->parity, ob, i)
-                       h->s->stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr;
+                       h->s->new_stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr;
 
                //pr_info("new stripe, blocks_allocated %lx", h->s->blocks_allocated[0]);
                h->s->allocated = true;
        }
 out:
-       closure_sync(&cl);
        return h;
 }
 
index 450bb1a113a30c200db219525076a7522d34b275..97a263cf9c87ee66123b6a1cc28e343dc7004b1c 100644 (file)
@@ -88,6 +88,7 @@ struct ec_stripe_new {
        struct ec_stripe_head   *h;
        struct mutex            lock;
        struct list_head        list;
+       struct closure          iodone;
 
        /* counts in flight writes, stripe is created when pin == 0 */
        atomic_t                pin;
@@ -98,8 +99,7 @@ struct ec_stripe_new {
        u8                      nr_parity;
        bool                    allocated;
        bool                    pending;
-       bool                    existing_stripe;
-       u64                     existing_stripe_idx;
+       bool                    have_existing_stripe;
 
        unsigned long           blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)];
 
@@ -111,7 +111,8 @@ struct ec_stripe_new {
        struct keylist          keys;
        u64                     inline_keys[BKEY_U64s * 8];
 
-       struct ec_stripe_buf    stripe;
+       struct ec_stripe_buf    new_stripe;
+       struct ec_stripe_buf    existing_stripe;
 };
 
 struct ec_stripe_head {
@@ -121,6 +122,7 @@ struct ec_stripe_head {
        unsigned                target;
        unsigned                algo;
        unsigned                redundancy;
+       bool                    copygc;
 
        struct bch_devs_mask    devs;
        unsigned                nr_active_devs;
@@ -145,8 +147,8 @@ void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *);
 int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *);
 
 void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *);
-struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned,
-                                              unsigned, unsigned);
+struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *,
+                       unsigned, unsigned, unsigned, bool, struct closure *);
 
 void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t);
 void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t);
index fd011df3cb9943b2dd9c2ae0f3a80191263c5b41..1faca4bc182567c93a2c799fc418311786775ddc 100644 (file)
@@ -1,6 +1,5 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
-#include "bkey_on_stack.h"
 #include "btree_update.h"
 #include "btree_update_interior.h"
 #include "buckets.h"
index 828ccf07da610eb6faa9d46dbd7ca331dc14ea8b..c0ae31238b488c6e9589b18e3094b1d2f5907a27 100644 (file)
@@ -665,7 +665,7 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c k)
 }
 
 bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
-                               unsigned nr_replicas)
+                               unsigned nr_replicas, bool compressed)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
@@ -683,7 +683,8 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
                if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
                        break;
 
-               if (nr_replicas > bch2_bkey_nr_ptrs_fully_allocated(k)) {
+               if (nr_replicas > bch2_bkey_replicas(c, k) ||
+                   (!compressed && bch2_bkey_sectors_compressed(k))) {
                        ret = false;
                        break;
                }
@@ -693,6 +694,33 @@ bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
        return ret;
 }
 
+unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const union bch_extent_entry *entry;
+       struct extent_ptr_decoded p;
+       unsigned replicas = 0;
+
+       bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
+               if (p.ptr.cached)
+                       continue;
+
+               if (p.has_ec) {
+                       struct stripe *s =
+                               genradix_ptr(&c->stripes[0], p.ec.idx);
+
+                       WARN_ON(!s);
+                       if (s)
+                               replicas += s->nr_redundant;
+               }
+
+               replicas++;
+
+       }
+
+       return replicas;
+}
+
 static unsigned bch2_extent_ptr_durability(struct bch_fs *c,
                                           struct extent_ptr_decoded p)
 {
index 74c7bb8f9104e1603207054eab4d66526c5f7bcc..ebe0a04c785081a65eee3010f7ccb03d2a09bb78 100644 (file)
@@ -538,7 +538,9 @@ unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
 unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
 bool bch2_bkey_is_incompressible(struct bkey_s_c);
 unsigned bch2_bkey_sectors_compressed(struct bkey_s_c);
-bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned);
+bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned, bool);
+
+unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c);
 unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c);
 
 void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s,
index 53c6660e07f8092822373549c40e2bc64c5f9de0..959eff4c925849cd82cd938644261d11c1eb5ad3 100644 (file)
@@ -3,7 +3,7 @@
 
 #include "bcachefs.h"
 #include "alloc_foreground.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
 #include "btree_update.h"
 #include "buckets.h"
 #include "clock.h"
@@ -791,7 +791,7 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter,
                       struct readpages_iter *readpages_iter)
 {
        struct bch_fs *c = trans->c;
-       struct bkey_on_stack sk;
+       struct bkey_buf sk;
        int flags = BCH_READ_RETRY_IF_STALE|
                BCH_READ_MAY_PROMOTE;
        int ret = 0;
@@ -799,7 +799,7 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter,
        rbio->c = c;
        rbio->start_time = local_clock();
 
-       bkey_on_stack_init(&sk);
+       bch2_bkey_buf_init(&sk);
 retry:
        while (1) {
                struct bkey_s_c k;
@@ -817,7 +817,7 @@ retry:
                        bkey_start_offset(k.k);
                sectors = k.k->size - offset_into_extent;
 
-               bkey_on_stack_reassemble(&sk, c, k);
+               bch2_bkey_buf_reassemble(&sk, c, k);
 
                ret = bch2_read_indirect_extent(trans,
                                        &offset_into_extent, &sk);
@@ -862,7 +862,7 @@ retry:
                bio_endio(&rbio->bio);
        }
 
-       bkey_on_stack_exit(&sk, c);
+       bch2_bkey_buf_exit(&sk, c);
 }
 
 void bch2_readahead(struct readahead_control *ractl)
@@ -1863,7 +1863,9 @@ static long bch2_dio_write_loop(struct dio_write *dio)
                                                dio->op.opts.data_replicas, 0);
                if (unlikely(ret) &&
                    !bch2_check_range_allocated(c, dio->op.pos,
-                               bio_sectors(bio), dio->op.opts.data_replicas))
+                               bio_sectors(bio),
+                               dio->op.opts.data_replicas,
+                               dio->op.opts.compression != 0))
                        goto err;
 
                task_io_account_write(bio->bi_iter.bi_size);
@@ -2414,7 +2416,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
 {
        struct bch_fs *c = inode->v.i_sb->s_fs_info;
        struct address_space *mapping = inode->v.i_mapping;
-       struct bkey_on_stack copy;
+       struct bkey_buf copy;
        struct btree_trans trans;
        struct btree_iter *src, *dst;
        loff_t shift, new_size;
@@ -2424,7 +2426,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
        if ((offset | len) & (block_bytes(c) - 1))
                return -EINVAL;
 
-       bkey_on_stack_init(&copy);
+       bch2_bkey_buf_init(&copy);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256);
 
        /*
@@ -2512,7 +2514,7 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
                    bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0)
                        break;
 reassemble:
-               bkey_on_stack_reassemble(&copy, c, k);
+               bch2_bkey_buf_reassemble(&copy, c, k);
 
                if (insert &&
                    bkey_cmp(bkey_start_pos(k.k), move_pos) < 0)
@@ -2589,7 +2591,7 @@ bkey_err:
        }
 err:
        bch2_trans_exit(&trans);
-       bkey_on_stack_exit(&copy, c);
+       bch2_bkey_buf_exit(&copy, c);
        bch2_pagecache_block_put(&inode->ei_pagecache_lock);
        inode_unlock(&inode->v);
        return ret;
index e3edca4d265b60fe094905e3f74b0d415d4fd6a0..9ce031728af5d2cf35211db1a201e7bbf3a50de8 100644 (file)
@@ -3,7 +3,7 @@
 
 #include "bcachefs.h"
 #include "acl.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
 #include "btree_update.h"
 #include "buckets.h"
 #include "chardev.h"
@@ -886,7 +886,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
-       struct bkey_on_stack cur, prev;
+       struct bkey_buf cur, prev;
        struct bpos end = POS(ei->v.i_ino, (start + len) >> 9);
        unsigned offset_into_extent, sectors;
        bool have_extent = false;
@@ -899,8 +899,8 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
        if (start + len < start)
                return -EINVAL;
 
-       bkey_on_stack_init(&cur);
-       bkey_on_stack_init(&prev);
+       bch2_bkey_buf_init(&cur);
+       bch2_bkey_buf_init(&prev);
        bch2_trans_init(&trans, c, 0, 0);
 
        iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
@@ -919,7 +919,7 @@ retry:
                        bkey_start_offset(k.k);
                sectors                 = k.k->size - offset_into_extent;
 
-               bkey_on_stack_reassemble(&cur, c, k);
+               bch2_bkey_buf_reassemble(&cur, c, k);
 
                ret = bch2_read_indirect_extent(&trans,
                                        &offset_into_extent, &cur);
@@ -927,7 +927,7 @@ retry:
                        break;
 
                k = bkey_i_to_s_c(cur.k);
-               bkey_on_stack_realloc(&prev, c, k.k->u64s);
+               bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
 
                sectors = min(sectors, k.k->size - offset_into_extent);
 
@@ -961,8 +961,8 @@ retry:
                                       FIEMAP_EXTENT_LAST);
 
        ret = bch2_trans_exit(&trans) ?: ret;
-       bkey_on_stack_exit(&cur, c);
-       bkey_on_stack_exit(&prev, c);
+       bch2_bkey_buf_exit(&cur, c);
+       bch2_bkey_buf_exit(&prev, c);
        return ret < 0 ? ret : 0;
 }
 
@@ -1007,10 +1007,7 @@ static const struct file_operations bch_file_operations = {
        .open           = generic_file_open,
        .fsync          = bch2_fsync,
        .splice_read    = generic_file_splice_read,
-       /*
-        * Broken, on v5.3:
        .splice_write   = iter_file_splice_write,
-       */
        .fallocate      = bch2_fallocate_dispatch,
        .unlocked_ioctl = bch2_fs_file_ioctl,
 #ifdef CONFIG_COMPAT
index 39f872de0c186d5ec388867d2bc848c8640af080..df0f00f10bd73cad36e9feb2443469df95a1b39d 100644 (file)
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
 #include "btree_update.h"
 #include "dirent.h"
 #include "error.h"
@@ -464,11 +464,11 @@ static int check_extents(struct bch_fs *c)
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
-       struct bkey_on_stack prev;
+       struct bkey_buf prev;
        u64 i_sectors;
        int ret = 0;
 
-       bkey_on_stack_init(&prev);
+       bch2_bkey_buf_init(&prev);
        prev.k->k = KEY(0, 0, 0);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
@@ -500,7 +500,7 @@ retry:
                                        goto err;
                        }
                }
-               bkey_on_stack_reassemble(&prev, c, k);
+               bch2_bkey_buf_reassemble(&prev, c, k);
 
                ret = walk_inode(&trans, &w, k.k->p.inode);
                if (ret)
@@ -569,7 +569,7 @@ err:
 fsck_err:
        if (ret == -EINTR)
                goto retry;
-       bkey_on_stack_exit(&prev, c);
+       bch2_bkey_buf_exit(&prev, c);
        return bch2_trans_exit(&trans) ?: ret;
 }
 
index abf204ef21cac13cca8041890930210ed6614ee8..4c4ba07c10f147350e9244a580eb92c1bcf9ae2d 100644 (file)
@@ -9,7 +9,7 @@
 #include "bcachefs.h"
 #include "alloc_background.h"
 #include "alloc_foreground.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
 #include "bset.h"
 #include "btree_update.h"
 #include "buckets.h"
@@ -183,18 +183,23 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio,
 
 /* Extent update path: */
 
-static int sum_sector_overwrites(struct btree_trans *trans,
-                                struct btree_iter *extent_iter,
-                                struct bkey_i *new,
-                                bool *maybe_extending,
-                                s64 *i_sectors_delta,
-                                s64 *disk_sectors_delta)
+int bch2_sum_sector_overwrites(struct btree_trans *trans,
+                              struct btree_iter *extent_iter,
+                              struct bkey_i *new,
+                              bool *maybe_extending,
+                              bool *should_check_enospc,
+                              s64 *i_sectors_delta,
+                              s64 *disk_sectors_delta)
 {
+       struct bch_fs *c = trans->c;
        struct btree_iter *iter;
        struct bkey_s_c old;
+       unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new));
+       bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new));
        int ret = 0;
 
        *maybe_extending        = true;
+       *should_check_enospc    = false;
        *i_sectors_delta        = 0;
        *disk_sectors_delta     = 0;
 
@@ -213,6 +218,11 @@ static int sum_sector_overwrites(struct btree_trans *trans,
                        (int) (bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)) -
                               bch2_bkey_nr_ptrs_fully_allocated(old));
 
+               if (!*should_check_enospc &&
+                   (new_replicas > bch2_bkey_replicas(c, old) ||
+                    (!new_compressed && bch2_bkey_sectors_compressed(old))))
+                       *should_check_enospc = true;
+
                if (bkey_cmp(old.k->p, new->k.p) >= 0) {
                        /*
                         * Check if there's already data above where we're
@@ -250,7 +260,7 @@ int bch2_extent_update(struct btree_trans *trans,
 {
        /* this must live until after bch2_trans_commit(): */
        struct bkey_inode_buf inode_p;
-       bool extending = false;
+       bool extending = false, should_check_enospc;
        s64 i_sectors_delta = 0, disk_sectors_delta = 0;
        int ret;
 
@@ -258,8 +268,9 @@ int bch2_extent_update(struct btree_trans *trans,
        if (ret)
                return ret;
 
-       ret = sum_sector_overwrites(trans, iter, k,
+       ret = bch2_sum_sector_overwrites(trans, iter, k,
                        &extending,
+                       &should_check_enospc,
                        &i_sectors_delta,
                        &disk_sectors_delta);
        if (ret)
@@ -269,7 +280,8 @@ int bch2_extent_update(struct btree_trans *trans,
            disk_sectors_delta > (s64) disk_res->sectors) {
                ret = bch2_disk_reservation_add(trans->c, disk_res,
                                        disk_sectors_delta - disk_res->sectors,
-                                       0);
+                                       !should_check_enospc
+                                       ? BCH_DISK_RESERVATION_NOFAIL : 0);
                if (ret)
                        return ret;
        }
@@ -320,8 +332,7 @@ int bch2_extent_update(struct btree_trans *trans,
 
        ret = bch2_trans_commit(trans, disk_res, journal_seq,
                                BTREE_INSERT_NOCHECK_RW|
-                               BTREE_INSERT_NOFAIL|
-                               BTREE_INSERT_USE_RESERVE);
+                               BTREE_INSERT_NOFAIL);
        if (ret)
                return ret;
 
@@ -404,14 +415,14 @@ int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end,
 int bch2_write_index_default(struct bch_write_op *op)
 {
        struct bch_fs *c = op->c;
-       struct bkey_on_stack sk;
+       struct bkey_buf sk;
        struct keylist *keys = &op->insert_keys;
        struct bkey_i *k = bch2_keylist_front(keys);
        struct btree_trans trans;
        struct btree_iter *iter;
        int ret;
 
-       bkey_on_stack_init(&sk);
+       bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024);
 
        iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
@@ -423,7 +434,7 @@ int bch2_write_index_default(struct bch_write_op *op)
 
                k = bch2_keylist_front(keys);
 
-               bkey_on_stack_realloc(&sk, c, k->k.u64s);
+               bch2_bkey_buf_realloc(&sk, c, k->k.u64s);
                bkey_copy(sk.k, k);
                bch2_cut_front(iter->pos, sk.k);
 
@@ -440,7 +451,7 @@ int bch2_write_index_default(struct bch_write_op *op)
        } while (!bch2_keylist_empty(keys));
 
        bch2_trans_exit(&trans);
-       bkey_on_stack_exit(&sk, c);
+       bch2_bkey_buf_exit(&sk, c);
 
        return ret;
 }
@@ -1617,14 +1628,14 @@ static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio
 {
        struct btree_trans trans;
        struct btree_iter *iter;
-       struct bkey_on_stack sk;
+       struct bkey_buf sk;
        struct bkey_s_c k;
        int ret;
 
        flags &= ~BCH_READ_LAST_FRAGMENT;
        flags |= BCH_READ_MUST_CLONE;
 
-       bkey_on_stack_init(&sk);
+       bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, 0, 0);
 
        iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
@@ -1636,7 +1647,7 @@ retry:
        if (bkey_err(k))
                goto err;
 
-       bkey_on_stack_reassemble(&sk, c, k);
+       bch2_bkey_buf_reassemble(&sk, c, k);
        k = bkey_i_to_s_c(sk.k);
        bch2_trans_unlock(&trans);
 
@@ -1657,7 +1668,7 @@ retry:
 out:
        bch2_rbio_done(rbio);
        bch2_trans_exit(&trans);
-       bkey_on_stack_exit(&sk, c);
+       bch2_bkey_buf_exit(&sk, c);
        return;
 err:
        rbio->bio.bi_status = BLK_STS_IOERR;
@@ -1670,14 +1681,14 @@ static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio,
 {
        struct btree_trans trans;
        struct btree_iter *iter;
-       struct bkey_on_stack sk;
+       struct bkey_buf sk;
        struct bkey_s_c k;
        int ret;
 
        flags &= ~BCH_READ_LAST_FRAGMENT;
        flags |= BCH_READ_MUST_CLONE;
 
-       bkey_on_stack_init(&sk);
+       bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, 0, 0);
 retry:
        bch2_trans_begin(&trans);
@@ -1687,7 +1698,7 @@ retry:
                           BTREE_ITER_SLOTS, k, ret) {
                unsigned bytes, sectors, offset_into_extent;
 
-               bkey_on_stack_reassemble(&sk, c, k);
+               bch2_bkey_buf_reassemble(&sk, c, k);
 
                offset_into_extent = iter->pos.offset -
                        bkey_start_offset(k.k);
@@ -1736,7 +1747,7 @@ err:
        rbio->bio.bi_status = BLK_STS_IOERR;
 out:
        bch2_trans_exit(&trans);
-       bkey_on_stack_exit(&sk, c);
+       bch2_bkey_buf_exit(&sk, c);
        bch2_rbio_done(rbio);
 }
 
@@ -1807,17 +1818,6 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
        if ((ret = bkey_err(k)))
                goto out;
 
-       /*
-        * going to be temporarily appending another checksum entry:
-        */
-       new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
-                                BKEY_EXTENT_U64s_MAX * 8);
-       if ((ret = PTR_ERR_OR_ZERO(new)))
-               goto out;
-
-       bkey_reassemble(new, k);
-       k = bkey_i_to_s_c(new);
-
        if (bversion_cmp(k.k->version, rbio->version) ||
            !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset))
                goto out;
@@ -1836,6 +1836,16 @@ static int __bch2_rbio_narrow_crcs(struct btree_trans *trans,
                goto out;
        }
 
+       /*
+        * going to be temporarily appending another checksum entry:
+        */
+       new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) +
+                                sizeof(struct bch_extent_crc128));
+       if ((ret = PTR_ERR_OR_ZERO(new)))
+               goto out;
+
+       bkey_reassemble(new, k);
+
        if (!bch2_bkey_narrow_crcs(new, new_crc))
                goto out;
 
@@ -2002,7 +2012,7 @@ static void bch2_read_endio(struct bio *bio)
 
 int __bch2_read_indirect_extent(struct btree_trans *trans,
                                unsigned *offset_into_extent,
-                               struct bkey_on_stack *orig_k)
+                               struct bkey_buf *orig_k)
 {
        struct btree_iter *iter;
        struct bkey_s_c k;
@@ -2029,7 +2039,7 @@ int __bch2_read_indirect_extent(struct btree_trans *trans,
        }
 
        *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k);
-       bkey_on_stack_reassemble(orig_k, trans->c, k);
+       bch2_bkey_buf_reassemble(orig_k, trans->c, k);
 err:
        bch2_trans_iter_put(trans, iter);
        return ret;
@@ -2208,7 +2218,11 @@ get_bio:
 
        bch2_increment_clock(c, bio_sectors(&rbio->bio), READ);
 
-       if (pick.ptr.cached)
+       /*
+        * If it's being moved internally, we don't want to flag it as a cache
+        * hit:
+        */
+       if (pick.ptr.cached && !(flags & BCH_READ_NODECODE))
                bch2_bucket_io_time_reset(trans, pick.ptr.dev,
                        PTR_BUCKET_NR(ca, &pick.ptr), READ);
 
@@ -2290,7 +2304,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
-       struct bkey_on_stack sk;
+       struct bkey_buf sk;
        struct bkey_s_c k;
        unsigned flags = BCH_READ_RETRY_IF_STALE|
                BCH_READ_MAY_PROMOTE|
@@ -2304,7 +2318,7 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode)
        rbio->c = c;
        rbio->start_time = local_clock();
 
-       bkey_on_stack_init(&sk);
+       bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, 0, 0);
 retry:
        bch2_trans_begin(&trans);
@@ -2327,7 +2341,7 @@ retry:
                        bkey_start_offset(k.k);
                sectors = k.k->size - offset_into_extent;
 
-               bkey_on_stack_reassemble(&sk, c, k);
+               bch2_bkey_buf_reassemble(&sk, c, k);
 
                ret = bch2_read_indirect_extent(&trans,
                                        &offset_into_extent, &sk);
@@ -2364,7 +2378,7 @@ retry:
        }
 out:
        bch2_trans_exit(&trans);
-       bkey_on_stack_exit(&sk, c);
+       bch2_bkey_buf_exit(&sk, c);
        return;
 err:
        if (ret == -EINTR)
index e6aac594f3e6a8e0267c0cad3aabde00059b0fae..04f6baa1daf7ddb678737a90d23c459425ee8cc6 100644 (file)
@@ -3,7 +3,7 @@
 #define _BCACHEFS_IO_H
 
 #include "checksum.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
 #include "io_types.h"
 
 #define to_wbio(_bio)                  \
@@ -60,6 +60,8 @@ static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op)
                : op->c->wq;
 }
 
+int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *,
+                              struct bkey_i *, bool *, bool *, s64 *, s64 *);
 int bch2_extent_update(struct btree_trans *, struct btree_iter *,
                       struct bkey_i *, struct disk_reservation *,
                       u64 *, u64, s64 *);
@@ -112,11 +114,11 @@ struct cache_promote_op;
 struct extent_ptr_decoded;
 
 int __bch2_read_indirect_extent(struct btree_trans *, unsigned *,
-                               struct bkey_on_stack *);
+                               struct bkey_buf *);
 
 static inline int bch2_read_indirect_extent(struct btree_trans *trans,
                                            unsigned *offset_into_extent,
-                                           struct bkey_on_stack *k)
+                                           struct bkey_buf *k)
 {
        return k->k->k.type == KEY_TYPE_reflink_p
                ? __bch2_read_indirect_extent(trans, offset_into_extent, k)
index d544248293789acb4cecb00c0ea79f87b0f99bf7..69e487bc29ff0f3f6d31df94926c30d01261f6eb 100644 (file)
@@ -777,7 +777,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
                        }
                } else {
                        rcu_read_lock();
-                       ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC,
+                       ob = bch2_bucket_alloc(c, ca, RESERVE_NONE,
                                               false, cl);
                        rcu_read_unlock();
                        if (IS_ERR(ob)) {
@@ -1095,7 +1095,7 @@ int bch2_fs_journal_init(struct journal *j)
 
        /* Btree roots: */
        j->entry_u64s_reserved +=
-               BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX);
+               BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX);
 
        atomic64_set(&j->reservations.counter,
                ((union journal_res_state)
index 0e6fbe2f6a7542ac24495a8a02864af35638fa48..2a344a04de871b1ffd1cf688f10d5f5b7fd5b915 100644 (file)
@@ -577,8 +577,15 @@ reread:
                        if (bch2_dev_io_err_on(ret, ca,
                                               "journal read error: sector %llu",
                                               offset) ||
-                           bch2_meta_read_fault("journal"))
-                               return -EIO;
+                           bch2_meta_read_fault("journal")) {
+                               /*
+                                * We don't error out of the recovery process
+                                * here, since the relevant journal entry may be
+                                * found on a different device, and missing or
+                                * no journal entries will be handled later
+                                */
+                               return 0;
+                       }
 
                        j = buf->data;
                }
@@ -990,6 +997,8 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w,
 done:
        rcu_read_unlock();
 
+       BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
+
        return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
 }
 
@@ -1050,9 +1059,13 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
                return;
 
        memcpy(new_buf, buf->data, buf->buf_size);
-       kvpfree(buf->data, buf->buf_size);
-       buf->data       = new_buf;
-       buf->buf_size   = new_size;
+
+       spin_lock(&j->lock);
+       swap(buf->data,         new_buf);
+       swap(buf->buf_size,     new_size);
+       spin_unlock(&j->lock);
+
+       kvpfree(new_buf, new_size);
 }
 
 static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
index 67ee47eb17a79c4b1cf5e091f84a0ff5fd587633..9953663e3a634a5606b0359b8870da3e6c81378b 100644 (file)
@@ -20,7 +20,7 @@
 struct journal_buf {
        struct jset             *data;
 
-       BKEY_PADDED(key);
+       __BKEY_PADDED(key, BCH_REPLICAS_MAX);
 
        struct closure_waitlist wait;
 
index 96c8690adc5bf51cfaecc8e8628a45109a07d7a1..6241ff0c129fa0f957e0f41f24eb6234069e5b0e 100644 (file)
@@ -4,7 +4,7 @@
  */
 
 #include "bcachefs.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
 #include "btree_update.h"
 #include "btree_update_interior.h"
 #include "buckets.h"
@@ -41,10 +41,10 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
-       struct bkey_on_stack sk;
+       struct bkey_buf sk;
        int ret = 0;
 
-       bkey_on_stack_init(&sk);
+       bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
        iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
@@ -57,7 +57,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
                        continue;
                }
 
-               bkey_on_stack_reassemble(&sk, c, k);
+               bch2_bkey_buf_reassemble(&sk, c, k);
 
                ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k),
                                    dev_idx, flags, false);
@@ -90,7 +90,7 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
        }
 
        ret = bch2_trans_exit(&trans) ?: ret;
-       bkey_on_stack_exit(&sk, c);
+       bch2_bkey_buf_exit(&sk, c);
 
        BUG_ON(ret == -EINTR);
 
@@ -109,6 +109,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
        struct btree_iter *iter;
        struct closure cl;
        struct btree *b;
+       struct bkey_buf k;
        unsigned id;
        int ret;
 
@@ -116,28 +117,28 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
        if (flags & BCH_FORCE_IF_METADATA_LOST)
                return -EINVAL;
 
+       bch2_bkey_buf_init(&k);
        bch2_trans_init(&trans, c, 0, 0);
        closure_init_stack(&cl);
 
        for (id = 0; id < BTREE_ID_NR; id++) {
                for_each_btree_node(&trans, iter, id, POS_MIN,
                                    BTREE_ITER_PREFETCH, b) {
-                       __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
 retry:
                        if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key),
                                                  dev_idx))
                                continue;
 
-                       bkey_copy(&tmp.k, &b->key);
+                       bch2_bkey_buf_copy(&k, c, &b->key);
 
-                       ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.k),
+                       ret = drop_dev_ptrs(c, bkey_i_to_s(k.k),
                                            dev_idx, flags, true);
                        if (ret) {
                                bch_err(c, "Cannot drop device without losing data");
                                goto err;
                        }
 
-                       ret = bch2_btree_node_update_key(c, iter, b, &tmp.k);
+                       ret = bch2_btree_node_update_key(c, iter, b, k.k);
                        if (ret == -EINTR) {
                                b = bch2_btree_iter_peek_node(iter);
                                goto retry;
@@ -157,6 +158,7 @@ retry:
        ret = 0;
 err:
        ret = bch2_trans_exit(&trans) ?: ret;
+       bch2_bkey_buf_exit(&k, c);
 
        BUG_ON(ret == -EINTR);
 
index 6633d21f604ab00fc476b8530be18ebc5ba13606..9505eab99332800ed6497879008afdf525d6a2f3 100644 (file)
@@ -2,7 +2,7 @@
 
 #include "bcachefs.h"
 #include "alloc_foreground.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
 #include "btree_gc.h"
 #include "btree_update.h"
 #include "btree_update_interior.h"
@@ -61,8 +61,13 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
        struct migrate_write *m =
                container_of(op, struct migrate_write, op);
        struct keylist *keys = &op->insert_keys;
+       struct bkey_buf _new, _insert;
        int ret = 0;
 
+       bch2_bkey_buf_init(&_new);
+       bch2_bkey_buf_init(&_insert);
+       bch2_bkey_buf_realloc(&_insert, c, U8_MAX);
+
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
        iter = bch2_trans_get_iter(&trans, m->btree_id,
@@ -73,21 +78,18 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                struct bkey_s_c k;
                struct bkey_i *insert;
                struct bkey_i_extent *new;
-               BKEY_PADDED(k) _new, _insert;
                const union bch_extent_entry *entry;
                struct extent_ptr_decoded p;
                bool did_work = false;
-               int nr;
+               bool extending = false, should_check_enospc;
+               s64 i_sectors_delta = 0, disk_sectors_delta = 0;
 
                bch2_trans_reset(&trans, 0);
 
                k = bch2_btree_iter_peek_slot(iter);
                ret = bkey_err(k);
-               if (ret) {
-                       if (ret == -EINTR)
-                               continue;
-                       break;
-               }
+               if (ret)
+                       goto err;
 
                new = bkey_i_to_extent(bch2_keylist_front(keys));
 
@@ -95,11 +97,11 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                    !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset))
                        goto nomatch;
 
-               bkey_reassemble(&_insert.k, k);
-               insert = &_insert.k;
+               bkey_reassemble(_insert.k, k);
+               insert = _insert.k;
 
-               bkey_copy(&_new.k, bch2_keylist_front(keys));
-               new = bkey_i_to_extent(&_new.k);
+               bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys));
+               new = bkey_i_to_extent(_new.k);
                bch2_cut_front(iter->pos, &new->k_i);
 
                bch2_cut_front(iter->pos,       insert);
@@ -144,23 +146,21 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                                               op->opts.background_target,
                                               op->opts.data_replicas);
 
-               /*
-                * If we're not fully overwriting @k, and it's compressed, we
-                * need a reservation for all the pointers in @insert
-                */
-               nr = bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(insert)) -
-                        m->nr_ptrs_reserved;
+               ret = bch2_sum_sector_overwrites(&trans, iter, insert,
+                                                &extending,
+                                                &should_check_enospc,
+                                                &i_sectors_delta,
+                                                &disk_sectors_delta);
+               if (ret)
+                       goto err;
 
-               if (insert->k.size < k.k->size &&
-                   bch2_bkey_sectors_compressed(k) &&
-                   nr > 0) {
+               if (disk_sectors_delta > (s64) op->res.sectors) {
                        ret = bch2_disk_reservation_add(c, &op->res,
-                                       keylist_sectors(keys) * nr, 0);
+                                               disk_sectors_delta - op->res.sectors,
+                                               !should_check_enospc
+                                               ? BCH_DISK_RESERVATION_NOFAIL : 0);
                        if (ret)
                                goto out;
-
-                       m->nr_ptrs_reserved += nr;
-                       goto next;
                }
 
                bch2_trans_update(&trans, iter, insert, 0);
@@ -168,8 +168,8 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                ret = bch2_trans_commit(&trans, &op->res,
                                op_journal_seq(op),
                                BTREE_INSERT_NOFAIL|
-                               BTREE_INSERT_USE_RESERVE|
                                m->data_opts.btree_insert_flags);
+err:
                if (!ret)
                        atomic_long_inc(&c->extent_migrate_done);
                if (ret == -EINTR)
@@ -197,6 +197,8 @@ nomatch:
        }
 out:
        bch2_trans_exit(&trans);
+       bch2_bkey_buf_exit(&_insert, c);
+       bch2_bkey_buf_exit(&_new, c);
        BUG_ON(ret == -EINTR);
        return ret;
 }
@@ -516,7 +518,7 @@ static int __bch2_move_data(struct bch_fs *c,
 {
        bool kthread = (current->flags & PF_KTHREAD) != 0;
        struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts);
-       struct bkey_on_stack sk;
+       struct bkey_buf sk;
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
@@ -525,7 +527,7 @@ static int __bch2_move_data(struct bch_fs *c,
        u64 delay, cur_inum = U64_MAX;
        int ret = 0, ret2;
 
-       bkey_on_stack_init(&sk);
+       bch2_bkey_buf_init(&sk);
        bch2_trans_init(&trans, c, 0, 0);
 
        stats->data_type = BCH_DATA_user;
@@ -605,13 +607,19 @@ peek:
                }
 
                /* unlock before doing IO: */
-               bkey_on_stack_reassemble(&sk, c, k);
+               bch2_bkey_buf_reassemble(&sk, c, k);
                k = bkey_i_to_s_c(sk.k);
                bch2_trans_unlock(&trans);
 
                ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k,
                                        data_cmd, data_opts);
                if (ret2) {
+                       if (ret2 == -EINTR) {
+                               bch2_trans_reset(&trans, 0);
+                               bch2_trans_cond_resched(&trans);
+                               continue;
+                       }
+
                        if (ret2 == -ENOMEM) {
                                /* memory allocation failure, wait for some IO to finish */
                                bch2_move_ctxt_wait_for_io(ctxt);
@@ -633,7 +641,7 @@ next_nondata:
        }
 out:
        ret = bch2_trans_exit(&trans) ?: ret;
-       bkey_on_stack_exit(&sk, c);
+       bch2_bkey_buf_exit(&sk, c);
 
        return ret;
 }
index 2c5daed58acafe9061f331c3d27e512c1216b456..efa7f38ecec62375f7641912c32becec63ae6008 100644 (file)
@@ -200,6 +200,11 @@ static int bch2_copygc(struct bch_fs *c)
                return -1;
        }
 
+       /*
+        * Our btree node allocations also come out of RESERVE_MOVINGGC:
+        */
+       sectors_to_move = (sectors_to_move * 3) / 4;
+
        for (i = h->data; i < h->data + h->used; i++)
                sectors_to_move += i->sectors * i->replicas;
 
index 1883a1faf380c9d69bfd0bd1d720a1c127a94bb3..5a43682c26efce43ce2001d0e99fba57fb94d43d 100644 (file)
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "bkey_buf.h"
 #include "alloc_background.h"
 #include "btree_gc.h"
 #include "btree_update.h"
@@ -224,28 +225,29 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b
 
                if (b->c.level) {
                        struct btree *child;
-                       BKEY_PADDED(k) tmp;
+                       struct bkey_buf tmp;
 
-                       bkey_reassemble(&tmp.k, k);
-                       k = bkey_i_to_s_c(&tmp.k);
+                       bch2_bkey_buf_init(&tmp);
+                       bch2_bkey_buf_reassemble(&tmp, c, k);
+                       k = bkey_i_to_s_c(tmp.k);
 
                        bch2_btree_and_journal_iter_advance(&iter);
 
-                       if (b->c.level > 0) {
-                               child = bch2_btree_node_get_noiter(c, &tmp.k,
-                                                       b->c.btree_id, b->c.level - 1);
-                               ret = PTR_ERR_OR_ZERO(child);
-                               if (ret)
-                                       break;
+                       child = bch2_btree_node_get_noiter(c, tmp.k,
+                                               b->c.btree_id, b->c.level - 1);
+                       bch2_bkey_buf_exit(&tmp, c);
 
-                               ret   = (node_fn ? node_fn(c, b) : 0) ?:
-                                       bch2_btree_and_journal_walk_recurse(c, child,
-                                               journal_keys, btree_id, node_fn, key_fn);
-                               six_unlock_read(&child->c.lock);
+                       ret = PTR_ERR_OR_ZERO(child);
+                       if (ret)
+                               break;
 
-                               if (ret)
-                                       break;
-                       }
+                       ret   = (node_fn ? node_fn(c, b) : 0) ?:
+                               bch2_btree_and_journal_walk_recurse(c, child,
+                                       journal_keys, btree_id, node_fn, key_fn);
+                       six_unlock_read(&child->c.lock);
+
+                       if (ret)
+                               break;
                } else {
                        bch2_btree_and_journal_iter_advance(&iter);
                }
@@ -936,7 +938,7 @@ int bch2_fs_recovery(struct bch_fs *c)
        struct bch_sb_field_clean *clean = NULL;
        struct jset *last_journal_entry = NULL;
        u64 blacklist_seq, journal_seq;
-       bool write_sb = false, need_write_alloc = false;
+       bool write_sb = false;
        int ret;
 
        if (c->sb.clean)
@@ -1082,10 +1084,8 @@ use_clean:
                bch_info(c, "starting metadata mark and sweep");
                err = "error in mark and sweep";
                ret = bch2_gc(c, &c->journal_keys, true, true);
-               if (ret < 0)
-                       goto err;
                if (ret)
-                       need_write_alloc = true;
+                       goto err;
                bch_verbose(c, "mark and sweep done");
        }
 
@@ -1095,10 +1095,8 @@ use_clean:
                bch_info(c, "starting mark and sweep");
                err = "error in mark and sweep";
                ret = bch2_gc(c, &c->journal_keys, true, false);
-               if (ret < 0)
-                       goto err;
                if (ret)
-                       need_write_alloc = true;
+                       goto err;
                bch_verbose(c, "mark and sweep done");
        }
 
@@ -1122,7 +1120,8 @@ use_clean:
                goto err;
        bch_verbose(c, "journal replay done");
 
-       if (need_write_alloc && !c->opts.nochanges) {
+       if (test_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags) &&
+           !c->opts.nochanges) {
                /*
                 * note that even when filesystem was clean there might be work
                 * to do here, if we ran gc (because of fsck) which recalculated
@@ -1137,8 +1136,6 @@ use_clean:
                        goto err;
                }
                bch_verbose(c, "alloc write done");
-
-               set_bit(BCH_FS_ALLOC_WRITTEN, &c->flags);
        }
 
        if (!c->sb.clean) {
index 8abcbfb3bd645c4ce337a6619d6944178c0ee41f..930547de33091754aae2a19c62ce2e3fd4ded961 100644 (file)
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
-#include "bkey_on_stack.h"
+#include "bkey_buf.h"
 #include "btree_update.h"
 #include "extents.h"
 #include "inode.h"
@@ -198,8 +198,7 @@ s64 bch2_remap_range(struct bch_fs *c,
        struct btree_trans trans;
        struct btree_iter *dst_iter, *src_iter;
        struct bkey_s_c src_k;
-       BKEY_PADDED(k) new_dst;
-       struct bkey_on_stack new_src;
+       struct bkey_buf new_dst, new_src;
        struct bpos dst_end = dst_start, src_end = src_start;
        struct bpos dst_want, src_want;
        u64 src_done, dst_done;
@@ -216,7 +215,8 @@ s64 bch2_remap_range(struct bch_fs *c,
        dst_end.offset += remap_sectors;
        src_end.offset += remap_sectors;
 
-       bkey_on_stack_init(&new_src);
+       bch2_bkey_buf_init(&new_dst);
+       bch2_bkey_buf_init(&new_src);
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096);
 
        src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start,
@@ -257,7 +257,7 @@ s64 bch2_remap_range(struct bch_fs *c,
                        break;
 
                if (src_k.k->type != KEY_TYPE_reflink_p) {
-                       bkey_on_stack_reassemble(&new_src, c, src_k);
+                       bch2_bkey_buf_reassemble(&new_src, c, src_k);
                        src_k = bkey_i_to_s_c(new_src.k);
 
                        bch2_cut_front(src_iter->pos,   new_src.k);
@@ -275,7 +275,7 @@ s64 bch2_remap_range(struct bch_fs *c,
                        struct bkey_s_c_reflink_p src_p =
                                bkey_s_c_to_reflink_p(src_k);
                        struct bkey_i_reflink_p *dst_p =
-                               bkey_reflink_p_init(&new_dst.k);
+                               bkey_reflink_p_init(new_dst.k);
 
                        u64 offset = le64_to_cpu(src_p.v->idx) +
                                (src_iter->pos.offset -
@@ -286,12 +286,12 @@ s64 bch2_remap_range(struct bch_fs *c,
                        BUG();
                }
 
-               new_dst.k.k.p = dst_iter->pos;
-               bch2_key_resize(&new_dst.k.k,
+               new_dst.k->k.p = dst_iter->pos;
+               bch2_key_resize(&new_dst.k->k,
                                min(src_k.k->p.offset - src_iter->pos.offset,
                                    dst_end.offset - dst_iter->pos.offset));
 
-               ret = bch2_extent_update(&trans, dst_iter, &new_dst.k,
+               ret = bch2_extent_update(&trans, dst_iter, new_dst.k,
                                         NULL, journal_seq,
                                         new_i_size, i_sectors_delta);
                if (ret)
@@ -333,7 +333,8 @@ err:
        } while (ret2 == -EINTR);
 
        ret = bch2_trans_exit(&trans) ?: ret;
-       bkey_on_stack_exit(&new_src, c);
+       bch2_bkey_buf_exit(&new_src, c);
+       bch2_bkey_buf_exit(&new_dst, c);
 
        percpu_ref_put(&c->writes);
 
index cc13fc2581150874c2fa54a34b7ad63d831264d1..bfae0d7142e0901c2545aa0b7b366fe1964a3f45 100644 (file)
@@ -798,7 +798,6 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
 
        pr_buf(out,
                "free_inc:               %zu/%zu\n"
-               "free[RESERVE_BTREE]:    %zu/%zu\n"
                "free[RESERVE_MOVINGGC]: %zu/%zu\n"
                "free[RESERVE_NONE]:     %zu/%zu\n"
                "buckets:\n"
@@ -827,7 +826,6 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
                "open_buckets_user:      %u\n"
                "btree reserve cache:    %u\n",
                fifo_used(&ca->free_inc),               ca->free_inc.size,
-               fifo_used(&ca->free[RESERVE_BTREE]),    ca->free[RESERVE_BTREE].size,
                fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size,
                fifo_used(&ca->free[RESERVE_NONE]),     ca->free[RESERVE_NONE].size,
                ca->mi.nbuckets - ca->mi.first_bucket,