Update bcachefs sources to 710cd382bf bcachefs: Fix for leaking of reflinked extents
author     Kent Overstreet <kent.overstreet@gmail.com>
           Thu, 14 Oct 2021 15:22:05 +0000 (11:22 -0400)
committer  Kent Overstreet <kent.overstreet@gmail.com>
           Thu, 14 Oct 2021 15:22:05 +0000 (11:22 -0400)
.bcachefs_revision
libbcachefs/bcachefs_format.h
libbcachefs/btree_gc.c
libbcachefs/buckets.c
libbcachefs/extents.c
libbcachefs/extents.h
libbcachefs/io.c
libbcachefs/migrate.c
libbcachefs/move.c
libbcachefs/reflink.c
libbcachefs/util.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index d1622d3d7f9f20b495d76006db145c4f044f7c42..978f8ce2f7f5ac7fa97fff2ec74d4eba65357da5 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-07c2895cb3372c0e7b406ab13264de80c7ff19eb
+710cd382bf5f50ab8114a7cc22d78b5b2f574529
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 0b8eabe5eaa47a2d3922e2a2d12aa5deef6df18d..1c4b76ed125cfccde8860e7c5346fc5524f7615f 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -913,18 +913,24 @@ struct bch_stripe {
 struct bch_reflink_p {
        struct bch_val          v;
        __le64                  idx;
-
-       __le32                  reservation_generation;
-       __u8                    nr_replicas;
-       __u8                    pad[3];
-};
+       /*
+        * A reflink pointer might point to an indirect extent which is then
+        * later split (by copygc or rebalance). If we only pointed to part of
+        * the original indirect extent, and then one of the fragments is
+        * outside the range we point to, we'd leak a refcount: so when creating
+        * reflink pointers, we need to store pad values to remember the full
+        * range we were taking a reference on.
+        */
+       __le32                  front_pad;
+       __le32                  back_pad;
+} __attribute__((packed, aligned(8)));
 
 struct bch_reflink_v {
        struct bch_val          v;
        __le64                  refcount;
        union bch_extent_entry  start[0];
        __u64                   _data[0];
-};
+} __attribute__((packed, aligned(8)));
 
 struct bch_indirect_inline_data {
        struct bch_val          v;
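The comment above pins down what front_pad and back_pad mean; the arithmetic they imply is small enough to state directly. A minimal sketch, not part of the commit and with a hypothetical helper name, of recovering the full referenced range in the reflink btree from a bch_reflink_p:

    /*
     * Hypothetical helper: the full range of the reflink btree this
     * pointer holds references on, pads included.  'size' is the bkey's
     * size field, i.e. the part of the range still pointed to.
     */
    static inline void bch2_reflink_p_range(const struct bch_reflink_p *p,
                                            u64 size, u64 *start, u64 *end)
    {
            *start = le64_to_cpu(p->idx) - le32_to_cpu(p->front_pad);
            *end   = le64_to_cpu(p->idx) + size + le32_to_cpu(p->back_pad);
    }

The triggers in buckets.c below compute exactly this range when marking.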
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 079424227a1c9c8d1e00a556a6168e902c379213..236ecbd82a638e1596981ce4983300fc08d0da26 100644
--- a/libbcachefs/btree_gc.c
+++ b/libbcachefs/btree_gc.c
@@ -1767,7 +1767,6 @@ static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
                        bch2_bkey_buf_reassemble(&sk, c, k);
                        bch2_extent_normalize(c, bkey_i_to_s(sk.k));
 
-
                        commit_err =
                                bch2_trans_update(&trans, &iter, sk.k, 0) ?:
                                bch2_trans_commit(&trans, NULL, NULL,
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 5fd3aabb76692b45b74f2d317bbb951f9aa3939b..c45bcd0b43bd78a33ad0f2ee3f6215934ad2a722 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -1143,8 +1143,10 @@ static int bch2_mark_reflink_p(struct bch_fs *c,
        struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
        struct reflink_gc *ref;
        size_t l, r, m;
-       u64 idx = le64_to_cpu(p.v->idx);
-       unsigned sectors = p.k->size;
+       u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
+       u64 sectors = (u64) le32_to_cpu(p.v->front_pad) +
+                           le32_to_cpu(p.v->back_pad) +
+                           p.k->size;
        s64 ret = 0;
 
        BUG_ON((flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE)) ==
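With illustrative values (not from the commit), the widened mark works out as:

    /*
     * Example: idx = 100, size = 8, front_pad = 4, back_pad = 2
     *
     *   idx     = 100 - 4   = 96
     *   sectors = 4 + 2 + 8 = 14
     *
     * so the mark covers reflink btree sectors [96, 110) -- the whole
     * range the pointer took references on, not just the part it still
     * points to.
     */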
@@ -1720,12 +1722,33 @@ static int __bch2_trans_mark_reflink_p(struct btree_trans *trans,
                bch2_fs_inconsistent(c,
                        "%llu:%llu len %u points to nonexistent indirect extent %llu",
                        p.k->p.inode, p.k->p.offset, p.k->size, idx);
-               bch2_inconsistent_error(c);
                ret = -EIO;
                goto err;
        }
 
-       BUG_ON(!*refcount && (flags & BTREE_TRIGGER_OVERWRITE));
+       if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
+               bch2_fs_inconsistent(c,
+                       "%llu:%llu len %u idx %llu indirect extent refcount underflow",
+                       p.k->p.inode, p.k->p.offset, p.k->size, idx);
+               ret = -EIO;
+               goto err;
+       }
+
+       if (flags & BTREE_TRIGGER_INSERT) {
+               struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
+               u64 pad;
+
+               pad = max_t(s64, le32_to_cpu(v->front_pad),
+                           le64_to_cpu(v->idx) - bkey_start_offset(k.k));
+               BUG_ON(pad > U32_MAX);
+               v->front_pad = cpu_to_le32(pad);
+
+               pad = max_t(s64, le32_to_cpu(v->back_pad),
+                           k.k->p.offset - p.k->size - le64_to_cpu(v->idx));
+               BUG_ON(pad > U32_MAX);
+               v->back_pad = cpu_to_le32(pad);
+       }
+
        le64_add_cpu(refcount, add);
 
        if (!*refcount) {
@@ -1748,10 +1771,20 @@ static int bch2_trans_mark_reflink_p(struct btree_trans *trans,
                                     struct bkey_s_c k, unsigned flags)
 {
        struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
-       u64 idx = le64_to_cpu(p.v->idx);
-       unsigned sectors = p.k->size;
+       u64 idx, sectors;
        s64 ret = 0;
 
+       if (flags & BTREE_TRIGGER_INSERT) {
+               struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
+
+               v->front_pad = v->back_pad = 0;
+       }
+
+       idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
+       sectors = (u64) le32_to_cpu(p.v->front_pad) +
+                       le32_to_cpu(p.v->back_pad) +
+                       p.k->size;
+
        while (sectors) {
                ret = __bch2_trans_mark_reflink_p(trans, p, idx, flags);
                if (ret < 0)
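Concretely, here is how the pad bookkeeping on insert plays out, with assumed illustrative values:

    /*
     * A new reflink pointer { idx = 100, size = 8 } covers [100, 108),
     * but the indirect extent it lands on spans [96, 112), and its
     * refcount is taken as a whole.  The insert trigger records that:
     *
     *   front_pad = max(0, 100 - 96)        = 4
     *   back_pad  = max(0, 112 - (100 + 8)) = 4
     *
     * If copygc or rebalance later splits [96, 112), the eventual
     * overwrite still decrements refcounts across all of [96, 112)
     * rather than leaking the [96, 100) and [108, 112) fragments.
     */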
diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c
index 6c2eed77a3267da7fc70ca5df1a2c40a8accf17c..194fbe21c97f173e7ffe8610d079207346e2d0e9 100644
--- a/libbcachefs/extents.c
+++ b/libbcachefs/extents.c
@@ -480,7 +480,7 @@ restart_narrow_pointers:
 
        bkey_for_each_ptr_decode(&k->k, ptrs, p, i)
                if (can_narrow_crc(p.crc, n)) {
-                       bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr);
+                       __bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr);
                        p.ptr.offset += p.crc.offset;
                        p.crc = n;
                        bch2_extent_ptr_decoded_append(k, &p);
@@ -785,41 +785,85 @@ static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs,
        return i;
 }
 
-union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
-                                          struct bch_extent_ptr *ptr)
+static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry)
+{
+       union bch_extent_entry *next = extent_entry_next(entry);
+
+       /* stripes have ptrs, but their layout doesn't work with this code */
+       BUG_ON(k.k->type == KEY_TYPE_stripe);
+
+       memmove_u64s_down(entry, next,
+                         (u64 *) bkey_val_end(k) - (u64 *) next);
+       k.k->u64s -= (u64 *) next - (u64 *) entry;
+}
+
+/*
+ * Returns pointer to the next entry after the one being dropped:
+ */
+union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s k,
+                                            struct bch_extent_ptr *ptr)
 {
        struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
-       union bch_extent_entry *dst, *src, *prev;
+       union bch_extent_entry *entry = to_entry(ptr), *next;
+       union bch_extent_entry *ret = entry;
        bool drop_crc = true;
 
        EBUG_ON(ptr < &ptrs.start->ptr ||
                ptr >= &ptrs.end->ptr);
        EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr);
 
-       src = extent_entry_next(to_entry(ptr));
-       if (src != ptrs.end &&
-           !extent_entry_is_crc(src))
-               drop_crc = false;
-
-       dst = to_entry(ptr);
-       while ((prev = extent_entry_prev(ptrs, dst))) {
-               if (extent_entry_is_ptr(prev))
+       for (next = extent_entry_next(entry);
+            next != ptrs.end;
+            next = extent_entry_next(next)) {
+               if (extent_entry_is_crc(next)) {
                        break;
-
-               if (extent_entry_is_crc(prev)) {
-                       if (drop_crc)
-                               dst = prev;
+               } else if (extent_entry_is_ptr(next)) {
+                       drop_crc = false;
                        break;
                }
+       }
+
+       extent_entry_drop(k, entry);
 
-               dst = prev;
+       while ((entry = extent_entry_prev(ptrs, entry))) {
+               if (extent_entry_is_ptr(entry))
+                       break;
+
+               if ((extent_entry_is_crc(entry) && drop_crc) ||
+                   extent_entry_is_stripe_ptr(entry)) {
+                       ret = (void *) ret - extent_entry_bytes(entry);
+                       extent_entry_drop(k, entry);
+               }
        }
 
-       memmove_u64s_down(dst, src,
-                         (u64 *) ptrs.end - (u64 *) src);
-       k.k->u64s -= (u64 *) src - (u64 *) dst;
+       return ret;
+}
+
+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k,
+                                          struct bch_extent_ptr *ptr)
+{
+       bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr;
+       union bch_extent_entry *ret =
+               __bch2_bkey_drop_ptr(k, ptr);
+
+       /*
+        * If we deleted all the dirty pointers and there's still cached
+        * pointers, we could set the cached pointers to dirty if they're not
+        * stale - but to do that correctly we'd need to grab an open_bucket
+        * reference so that we don't race with bucket reuse:
+        */
+       if (have_dirty &&
+           !bch2_bkey_dirty_devs(k.s_c).nr) {
+               k.k->type = KEY_TYPE_error;
+               set_bkey_val_u64s(k.k, 0);
+               ret = NULL;
+       } else if (!bch2_bkey_nr_ptrs(k.s_c)) {
+               k.k->type = KEY_TYPE_deleted;
+               set_bkey_val_u64s(k.k, 0);
+               ret = NULL;
+       }
 
-       return dst;
+       return ret;
 }
 
 void bch2_bkey_drop_device(struct bkey_s k, unsigned dev)
@@ -889,10 +933,6 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
                ptr->cached &&
                ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr));
 
-       /* will only happen if all pointers were cached: */
-       if (!bch2_bkey_nr_ptrs(k.s_c))
-               k.k->type = KEY_TYPE_deleted;
-
        return bkey_deleted(k.k);
 }
 
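A hedged usage sketch (hypothetical helper, not from the commit): __bch2_bkey_drop_ptr() is the raw variant that only removes the entry and whatever crc/stripe entries belonged to it, for callers such as the crc-narrowing loop above and the migrate path, which re-append a pointer immediately and must not have the key degrade mid-edit; bch2_bkey_drop_ptr() is for dropping a pointer outright:

    /*
     * Hypothetical caller: drop one pointer and report whether the key
     * degraded.  bch2_bkey_drop_ptr() returns NULL after turning a key
     * that lost its last dirty pointer into KEY_TYPE_error, or a key
     * with no pointers left at all into KEY_TYPE_deleted.
     */
    static bool bkey_drop_ptr_degraded(struct bkey_s k,
                                       struct bch_extent_ptr *ptr)
    {
            return !bch2_bkey_drop_ptr(k, ptr);
    }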
diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h
index afd3067bb64eb83be7c16f954094f9c148c72be7..9c2567274a2b8d286707d6b1b3594b3d04007ac3 100644
--- a/libbcachefs/extents.h
+++ b/libbcachefs/extents.h
@@ -78,12 +78,12 @@ static inline size_t extent_entry_u64s(const union bch_extent_entry *entry)
 
 static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
 {
-       switch (extent_entry_type(e)) {
-       case BCH_EXTENT_ENTRY_ptr:
-               return true;
-       default:
-               return false;
-       }
+       return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
+}
+
+static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e)
+{
+       return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
 }
 
 static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
@@ -578,6 +578,8 @@ void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *);
 void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr);
 void bch2_extent_ptr_decoded_append(struct bkey_i *,
                                    struct extent_ptr_decoded *);
+union bch_extent_entry *__bch2_bkey_drop_ptr(struct bkey_s,
+                                            struct bch_extent_ptr *);
 union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s,
                                           struct bch_extent_ptr *);
 
diff --git a/libbcachefs/io.c b/libbcachefs/io.c
index 0bc72d2a4dd4cf09bc6b11b9e22a8afcb43977f8..65f8645c8bd48c75fabf330b0db5ffc16fc0dfb6 100644
--- a/libbcachefs/io.c
+++ b/libbcachefs/io.c
@@ -782,6 +782,8 @@ static struct bio *bch2_write_bio_alloc(struct bch_fs *c,
                                       ? ((unsigned long) buf & (PAGE_SIZE - 1))
                                       : 0), PAGE_SIZE);
 
+       pages = min(pages, BIO_MAX_VECS);
+
        bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write);
        wbio                    = wbio_init(bio);
        wbio->put_bio           = true;
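For scale, with illustrative numbers (4KiB pages, BIO_MAX_VECS of 256 as in current kernels):

    /*
     * An unaligned 2MiB buffer needs
     *   DIV_ROUND_UP(2MiB + offset_in_page, PAGE_SIZE) = 513 pages,
     * but one bio holds at most BIO_MAX_VECS segments, so without the
     * clamp bio_alloc_bioset() would be asked for more vecs than it
     * supports.
     */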
diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c
index 9f9eb799337e2ae22197e0d13aa23febae6ac1be..94d5d99ffd2a4634398af0faae7acfd3f90c78d6 100644
--- a/libbcachefs/migrate.c
+++ b/libbcachefs/migrate.c
@@ -73,6 +73,15 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags
                 */
                bch2_extent_normalize(c, bkey_i_to_s(sk.k));
 
+               /*
+                * Since we're not inserting through an extent iterator
+                * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators),
+                * we aren't using the extent overwrite path to delete, we're
+                * just using the normal key deletion path:
+                */
+               if (bkey_deleted(&sk.k->k))
+                       sk.k->k.size = 0;
+
                ret   = bch2_btree_iter_traverse(&iter) ?:
                        bch2_trans_update(&trans, &iter, sk.k,
                                          BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
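Put differently (a paraphrase, not commit text): the extent update path knows how to apply sized whiteouts, but a plain btree update expects deletions in canonical form, with size 0. A minimal sketch of the invariant, as a hypothetical helper:

    /* Keys inserted via a non-extent iterator must be canonical:
     * a deletion carries no size. */
    static void canonicalize_whiteout(struct bkey_i *k)
    {
            if (bkey_deleted(&k->k))
                    k->k.size = 0;
    }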
diff --git a/libbcachefs/move.c b/libbcachefs/move.c
index 0a8fe7085cc05b5f2afa4253623ce091e744ad5f..790389d485a46c04e18840ef2743aff5b736b155 100644
--- a/libbcachefs/move.c
+++ b/libbcachefs/move.c
@@ -196,7 +196,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op)
                                extent_for_each_ptr(extent_i_to_s(new), new_ptr)
                                        new_ptr->cached = true;
 
-                       bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr);
+                       __bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr);
                }
 
                extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) {
diff --git a/libbcachefs/reflink.c b/libbcachefs/reflink.c
index 92ff609453b8349e3ce6a25277bcaeece461d388..8726943ac1e629efd2fe40ba8063d7d37f350844 100644
--- a/libbcachefs/reflink.c
+++ b/libbcachefs/reflink.c
@@ -32,6 +32,9 @@ const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k)
        if (bkey_val_bytes(p.k) != sizeof(*p.v))
                return "incorrect value size";
 
+       if (le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad))
+               return "idx < front_pad";
+
        return NULL;
 }
 
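This check guards the unsigned arithmetic the triggers now do; with illustrative values:

    /*
     * idx = 100, front_pad = 4 is fine (the padded range starts at 96),
     * but idx = 2, front_pad = 4 would make
     *   idx - front_pad = 2 - 4
     * wrap around as a u64 in bch2_mark_reflink_p(), so such keys are
     * rejected up front.
     */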
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index 9f21f68e84d34333474f89dcf4504b715d02c95e..52de7c49cacb62db540b64b4f51477f8688a49ec 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -525,7 +525,11 @@ int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask)
                if (!page)
                        return -ENOMEM;
 
-               BUG_ON(!bio_add_page(bio, page, len, 0));
+               if (unlikely(!bio_add_page(bio, page, len, 0))) {
+                       __free_page(page);
+                       break;
+               }
+
                size -= len;
        }