git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to bee34d805c bcachefs: Repair bad data pointers
author Kent Overstreet <kent.overstreet@gmail.com>
Thu, 28 Jan 2021 21:16:51 +0000 (16:16 -0500)
committer Kent Overstreet <kent.overstreet@gmail.com>
Thu, 28 Jan 2021 21:16:51 +0000 (16:16 -0500)
19 files changed:
.bcachefs_revision
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/btree_cache.c
libbcachefs/btree_cache.h
libbcachefs/btree_gc.c
libbcachefs/btree_gc.h
libbcachefs/btree_io.c
libbcachefs/btree_update_interior.c
libbcachefs/ec.c
libbcachefs/extents.c
libbcachefs/journal.c
libbcachefs/journal_io.c
libbcachefs/journal_io.h
libbcachefs/recovery.c
libbcachefs/recovery.h
libbcachefs/super-io.c
libbcachefs/super.c
libbcachefs/sysfs.c

index 490264142cf706ea512332db0339a44d91ebcebf..69ea54bb14debbcfdfa9f08efde46af1df3ec9c8 100644 (file)
@@ -1 +1 @@
-ffc900d5936ae538e34d18a6ce739d0a5a9178cf
+bee34d805cf75e57f9380e0ee91771b9d90b2b2d
index 505777ba8b54a40e7a724350704473e1d4f77868..91b9375f7341533f81f20fa2435730cba641a36f 100644 (file)
@@ -509,7 +509,8 @@ enum {
        BCH_FS_ERRORS_FIXED,
 
        /* misc: */
-       BCH_FS_FIXED_GENS,
+       BCH_FS_NEED_ANOTHER_GC,
+       BCH_FS_DELETED_NODES,
        BCH_FS_NEED_ALLOC_WRITE,
        BCH_FS_REBUILD_REPLICAS,
        BCH_FS_HOLD_BTREE_WRITES,
@@ -539,11 +540,13 @@ struct journal_keys {
        struct journal_key {
                enum btree_id   btree_id:8;
                unsigned        level:8;
+               bool            allocated;
                struct bkey_i   *k;
                u32             journal_seq;
                u32             journal_offset;
        }                       *d;
        size_t                  nr;
+       size_t                  size;
        u64                     journal_seq_base;
 };
 
@@ -840,6 +843,7 @@ struct bch_fs {
        struct journal          journal;
        struct list_head        journal_entries;
        struct journal_keys     journal_keys;
+       struct list_head        journal_iters;
 
        u64                     last_bucket_seq_cleanup;
 
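The two new journal_keys fields work together: size records the allocated
capacity of the d array, and allocated marks keys that fsck repair kmalloc'd
itself (rather than keys pointing into the read journal buffers), so they can
be freed individually. A minimal sketch of the grow-on-full pattern the size
field enables, following bch2_journal_key_insert() in recovery.c below:

    if (keys->nr == keys->size) {
            size_t new_size = keys->size * 2;
            struct journal_key *d = kvmalloc(new_size * sizeof(*d), GFP_KERNEL);

            if (!d)
                    return -ENOMEM;

            memcpy(d, keys->d, keys->nr * sizeof(*d));
            kvfree(keys->d);
            keys->d    = d;
            keys->size = new_size;
    }
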
index 307d5523a52d63a58ccd68b701857b06d21e0e19..6dc150cbf2af48784915579afa0f9dd13fbf5232 100644 (file)
@@ -603,13 +603,14 @@ struct bch_btree_ptr_v2 {
        __u64                   mem_ptr;
        __le64                  seq;
        __le16                  sectors_written;
-       /* In case we ever decide to do variable size btree nodes: */
-       __le16                  sectors;
+       __le16                  flags;
        struct bpos             min_key;
        struct bch_extent_ptr   start[0];
        __u64                   _data[0];
 } __attribute__((packed, aligned(8)));
 
+LE16_BITMASK(BTREE_PTR_RANGE_UPDATED,  struct bch_btree_ptr_v2, flags, 0, 1);
+
 struct bch_extent {
        struct bch_val          v;
 
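LE16_BITMASK() declares endian-safe accessors over the new flags field. A
paraphrased sketch of roughly what the declaration above expands to (the real
macro in bcachefs_format.h also emits _OFFSET/_BITS/_MAX helper constants;
this is illustrative, not the exact expansion):

    static inline __u64 BTREE_PTR_RANGE_UPDATED(const struct bch_btree_ptr_v2 *k)
    {
            return (__le16_to_cpu(k->flags) >> 0) & 1;
    }

    static inline void SET_BTREE_PTR_RANGE_UPDATED(struct bch_btree_ptr_v2 *k, __u64 v)
    {
            __u64 f = __le16_to_cpu(k->flags);

            f &= ~(1ULL << 0);
            f |= (v & 1) << 0;
            k->flags = __cpu_to_le16(f);
    }

btree_io.c below reads the bit back with BTREE_PTR_RANGE_UPDATED(bp) to
detect nodes whose key was rewritten by repair.
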
index bebf9fb01fe1ff1cc3301a33ab0005d951623d70..4fa3f80a805e39b6546f2725a8549e4676409da6 100644 (file)
@@ -7,6 +7,7 @@
 #include "btree_iter.h"
 #include "btree_locking.h"
 #include "debug.h"
+#include "error.h"
 
 #include <linux/prefetch.h>
 #include <linux/sched/mm.h>
@@ -812,9 +813,12 @@ lock_node:
                return ERR_PTR(-EIO);
        }
 
-       EBUG_ON(b->c.btree_id != iter->btree_id ||
-               BTREE_NODE_LEVEL(b->data) != level ||
-               bkey_cmp(b->data->max_key, k->k.p));
+       EBUG_ON(b->c.btree_id != iter->btree_id);
+       EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+       EBUG_ON(bkey_cmp(b->data->max_key, k->k.p));
+       EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+               bkey_cmp(b->data->min_key,
+                        bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
 
        return b;
 }
@@ -822,7 +826,8 @@ lock_node:
 struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
                                         const struct bkey_i *k,
                                         enum btree_id btree_id,
-                                        unsigned level)
+                                        unsigned level,
+                                        bool nofill)
 {
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b;
@@ -837,6 +842,9 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *c,
 retry:
        b = btree_cache_find(bc, k);
        if (unlikely(!b)) {
+               if (nofill)
+                       return NULL;
+
                b = bch2_btree_node_fill(c, NULL, k, btree_id,
                                         level, SIX_LOCK_read, true);
 
@@ -883,9 +891,12 @@ lock_node:
                return ERR_PTR(-EIO);
        }
 
-       EBUG_ON(b->c.btree_id != btree_id ||
-               BTREE_NODE_LEVEL(b->data) != level ||
-               bkey_cmp(b->data->max_key, k->k.p));
+       EBUG_ON(b->c.btree_id != btree_id);
+       EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
+       EBUG_ON(bkey_cmp(b->data->max_key, k->k.p));
+       EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+               bkey_cmp(b->data->min_key,
+                        bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
 
        return b;
 }
@@ -995,8 +1006,22 @@ out:
                if (sib != btree_prev_sib)
                        swap(n1, n2);
 
-               BUG_ON(bkey_cmp(bkey_successor(n1->key.k.p),
-                               n2->data->min_key));
+               if (bkey_cmp(bkey_successor(n1->key.k.p),
+                            n2->data->min_key)) {
+                       char buf1[200], buf2[200];
+
+                       bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&n1->key));
+                       bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&n2->key));
+
+                       bch2_fs_inconsistent(c, "btree topology error at btree %s level %u:\n"
+                                            "prev: %s\n"
+                                            "next: %s\n",
+                                            bch2_btree_ids[iter->btree_id], level,
+                                            buf1, buf2);
+
+                       six_unlock_intent(&ret->c.lock);
+                       ret = NULL;
+               }
        }
 
        bch2_btree_trans_verify_locks(trans);
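
The check above downgrades what used to be a BUG_ON() into a recoverable
bch2_fs_inconsistent() error: adjacent siblings must cover the key space
contiguously. A hypothetical helper (not in the tree) stating the invariant
being enforced:

    static bool siblings_contiguous(struct btree *n1, struct btree *n2)
    {
            /* n1 precedes n2: n2 must begin right after n1's last key */
            return !bkey_cmp(bkey_successor(n1->key.k.p), n2->data->min_key);
    }

When the invariant fails the sibling is not returned; the caller gets NULL
instead of a kernel panic.
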
index 0eeca0bcc48ead34e563794f8a4bd1890faa8752..5fffae92effb35bc3ff61793e1b42660bc78a59f 100644 (file)
@@ -26,7 +26,7 @@ struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
                                  enum six_lock_type, unsigned long);
 
 struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
-                                        enum btree_id, unsigned);
+                                        enum btree_id, unsigned, bool);
 
 struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *,
                                struct btree *, enum btree_node_sibling);
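
The new bool parameter ("nofill") lets callers probe the btree node cache
without triggering read I/O. Illustrative call, matching the repair path in
btree_gc.c in this commit:

    /* nofill == true: return the node only if already cached, else NULL */
    n = bch2_btree_node_get_noiter(c, cur.k, b->c.btree_id,
                                   b->c.level - 1, true);
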
index efeaec3d9c0326ac6ac9816d8832d46e9d6102a1..bab5ebd37f04753f9e842ba8aa1b22d153917126 100644 (file)
@@ -50,39 +50,199 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
        __gc_pos_set(c, new_pos);
 }
 
+/*
+ * Missing: if an interior btree node is empty, we need to do something -
+ * perhaps just kill it
+ */
 static int bch2_gc_check_topology(struct bch_fs *c,
-                                 struct bkey_s_c k,
-                                 struct bpos *expected_start,
-                                 struct bpos expected_end,
+                                 struct btree *b,
+                                 struct bkey_buf *prev,
+                                 struct bkey_buf cur,
                                  bool is_last)
 {
+       struct bpos node_start  = b->data->min_key;
+       struct bpos node_end    = b->data->max_key;
+       struct bpos expected_start = bkey_deleted(&prev->k->k)
+               ? node_start
+               : bkey_successor(prev->k->k.p);
+       char buf1[200], buf2[200];
+       bool update_min = false;
+       bool update_max = false;
        int ret = 0;
 
-       if (k.k->type == KEY_TYPE_btree_ptr_v2) {
-               struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
+       if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
+               struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k);
+
+               if (bkey_deleted(&prev->k->k))
+                       scnprintf(buf1, sizeof(buf1), "start of node: %llu:%llu",
+                                 node_start.inode,
+                                 node_start.offset);
+               else
+                       bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k));
+
+               if (fsck_err_on(bkey_cmp(expected_start, bp->v.min_key), c,
+                               "btree node with incorrect min_key at btree %s level %u:\n"
+                               "  prev %s\n"
+                               "  cur %s",
+                               bch2_btree_ids[b->c.btree_id], b->c.level,
+                               buf1,
+                               (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2)))
+                       update_min = true;
+       }
+
+       if (fsck_err_on(is_last &&
+                       bkey_cmp(cur.k->k.p, node_end), c,
+                       "btree node with incorrect max_key at btree %s level %u:\n"
+                       "  %s\n"
+                       "  expected %s",
+                       bch2_btree_ids[b->c.btree_id], b->c.level,
+                       (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1),
+                       (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2)))
+               update_max = true;
+
+       bch2_bkey_buf_copy(prev, c, cur.k);
+
+       if (update_min || update_max) {
+               struct bkey_i *new;
+               struct bkey_i_btree_ptr_v2 *bp = NULL;
+               struct btree *n;
+
+               if (update_max) {
+                       ret = bch2_journal_key_delete(c, b->c.btree_id,
+                                                     b->c.level, cur.k->k.p);
+                       if (ret)
+                               return ret;
+               }
+
+               new = kmalloc(bkey_bytes(&cur.k->k), GFP_KERNEL);
+               if (!new)
+                       return -ENOMEM;
+
+               bkey_copy(new, cur.k);
+
+               if (new->k.type == KEY_TYPE_btree_ptr_v2)
+                       bp = bkey_i_to_btree_ptr_v2(new);
+
+               if (update_min)
+                       bp->v.min_key = expected_start;
+               if (update_max)
+                       new->k.p = node_end;
+               if (bp)
+                       SET_BTREE_PTR_RANGE_UPDATED(&bp->v, true);
 
-               if (fsck_err_on(bkey_cmp(*expected_start, bp.v->min_key), c,
-                               "btree node with incorrect min_key: got %llu:%llu, should be %llu:%llu",
-                               bp.v->min_key.inode,
-                               bp.v->min_key.offset,
-                               expected_start->inode,
-                               expected_start->offset)) {
-                       BUG();
+               ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level, new);
+               if (ret) {
+                       kfree(new);
+                       return ret;
+               }
+
+               n = bch2_btree_node_get_noiter(c, cur.k, b->c.btree_id,
+                                              b->c.level - 1, true);
+               if (n) {
+                       mutex_lock(&c->btree_cache.lock);
+                       bch2_btree_node_hash_remove(&c->btree_cache, n);
+
+                       bkey_copy(&n->key, new);
+                       if (update_min)
+                               n->data->min_key = expected_start;
+                       if (update_max)
+                               n->data->max_key = node_end;
+
+                       ret = __bch2_btree_node_hash_insert(&c->btree_cache, n);
+                       BUG_ON(ret);
+                       mutex_unlock(&c->btree_cache.lock);
+                       six_unlock_read(&n->c.lock);
                }
        }
+fsck_err:
+       return ret;
+}
 
-       *expected_start = bkey_cmp(k.k->p, POS_MAX)
-               ? bkey_successor(k.k->p)
-               : k.k->p;
+static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
+                              unsigned level, bool is_root,
+                              struct bkey_s_c *k)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k);
+       const struct bch_extent_ptr *ptr;
+       bool do_update = false;
+       int ret = 0;
 
-       if (fsck_err_on(is_last &&
-                       bkey_cmp(k.k->p, expected_end), c,
-                       "btree node with incorrect max_key: got %llu:%llu, should be %llu:%llu",
-                       k.k->p.inode,
-                       k.k->p.offset,
-                       expected_end.inode,
-                       expected_end.offset)) {
-               BUG();
+       bkey_for_each_ptr(ptrs, ptr) {
+               struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+               struct bucket *g = PTR_BUCKET(ca, ptr, true);
+               struct bucket *g2 = PTR_BUCKET(ca, ptr, false);
+
+               if (fsck_err_on(!g->gen_valid, c,
+                               "bucket %u:%zu data type %s ptr gen %u missing in alloc btree",
+                               ptr->dev, PTR_BUCKET_NR(ca, ptr),
+                               bch2_data_types[ptr_data_type(k->k, ptr)],
+                               ptr->gen)) {
+                       if (!ptr->cached) {
+                               g2->_mark.gen   = g->_mark.gen          = ptr->gen;
+                               g2->gen_valid   = g->gen_valid          = true;
+                               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+                       } else {
+                               do_update = true;
+                       }
+               }
+
+               if (fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
+                               "bucket %u:%zu data type %s ptr gen in the future: %u > %u",
+                               ptr->dev, PTR_BUCKET_NR(ca, ptr),
+                               bch2_data_types[ptr_data_type(k->k, ptr)],
+                               ptr->gen, g->mark.gen)) {
+                       if (!ptr->cached) {
+                               g2->_mark.gen   = g->_mark.gen  = ptr->gen;
+                               g2->gen_valid   = g->gen_valid  = true;
+                               g2->_mark.data_type             = 0;
+                               g2->_mark.dirty_sectors         = 0;
+                               g2->_mark.cached_sectors        = 0;
+                               set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+                               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
+                       } else {
+                               do_update = true;
+                       }
+               }
+
+               if (fsck_err_on(!ptr->cached &&
+                               gen_cmp(ptr->gen, g->mark.gen) < 0, c,
+                               "bucket %u:%zu data type %s stale dirty ptr: %u < %u",
+                               ptr->dev, PTR_BUCKET_NR(ca, ptr),
+                               bch2_data_types[ptr_data_type(k->k, ptr)],
+                               ptr->gen, g->mark.gen))
+                       do_update = true;
+       }
+
+       if (do_update) {
+               struct bch_extent_ptr *ptr;
+               struct bkey_i *new;
+
+               if (is_root) {
+                       bch_err(c, "cannot update btree roots yet");
+                       return -EINVAL;
+               }
+
+               new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
+               if (!new)
+                       return -ENOMEM;
+
+               bkey_reassemble(new, *k);
+
+               bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
+                       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+                       struct bucket *g = PTR_BUCKET(ca, ptr, true);
+
+                       (ptr->cached &&
+                        (!g->gen_valid || gen_cmp(ptr->gen, g->mark.gen) > 0)) ||
+                       (!ptr->cached &&
+                        gen_cmp(ptr->gen, g->mark.gen) < 0);
+               }));
+
+               ret = bch2_journal_key_insert(c, btree_id, level, new);
+               if (ret)
+                       kfree(new);
+               else
+                       *k = bkey_i_to_s_c(new);
        }
 fsck_err:
        return ret;
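
The gen checks in bch2_check_fix_ptrs() rely on a wraparound-aware
comparison. bcachefs defines gen_cmp() in buckets.h roughly as the signed
difference of two 8-bit generations; sketched here from memory, treat as
illustrative:

    static inline int gen_cmp(u8 a, u8 b)
    {
            return (s8) (a - b);
    }

    /*
     * Interpreting the u8 subtraction as s8 keeps ordering correct across
     * wraparound, e.g.:
     *
     *   gen_cmp(2, 250) == (s8) 8  > 0    -- gen 2 is newer than gen 250
     *   gen_cmp(250, 2) == (s8) -8 < 0
     */
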
@@ -90,7 +250,9 @@ fsck_err:
 
 /* marking of btree keys/nodes: */
 
-static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
+static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id,
+                           unsigned level, bool is_root,
+                           struct bkey_s_c k,
                            u8 *max_stale, bool initial)
 {
        struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
@@ -104,7 +266,6 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
                BUG_ON(bch2_journal_seq_verify &&
                       k.k->version.lo > journal_cur_seq(&c->journal));
 
-               /* XXX change to fsck check */
                if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c,
                                "key version number higher than recorded: %llu > %llu",
                                k.k->version.lo,
@@ -120,35 +281,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
                                return ret;
                }
 
-               bkey_for_each_ptr(ptrs, ptr) {
-                       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-                       struct bucket *g = PTR_BUCKET(ca, ptr, true);
-                       struct bucket *g2 = PTR_BUCKET(ca, ptr, false);
-
-                       if (mustfix_fsck_err_on(!g->gen_valid, c,
-                                       "bucket %u:%zu data type %s ptr gen %u missing in alloc btree",
-                                       ptr->dev, PTR_BUCKET_NR(ca, ptr),
-                                       bch2_data_types[ptr_data_type(k.k, ptr)],
-                                       ptr->gen)) {
-                               g2->_mark.gen   = g->_mark.gen          = ptr->gen;
-                               g2->gen_valid   = g->gen_valid          = true;
-                               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
-                       }
-
-                       if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c,
-                                       "bucket %u:%zu data type %s ptr gen in the future: %u > %u",
-                                       ptr->dev, PTR_BUCKET_NR(ca, ptr),
-                                       bch2_data_types[ptr_data_type(k.k, ptr)],
-                                       ptr->gen, g->mark.gen)) {
-                               g2->_mark.gen   = g->_mark.gen          = ptr->gen;
-                               g2->gen_valid   = g->gen_valid          = true;
-                               g2->_mark.data_type             = 0;
-                               g2->_mark.dirty_sectors         = 0;
-                               g2->_mark.cached_sectors        = 0;
-                               set_bit(BCH_FS_FIXED_GENS, &c->flags);
-                               set_bit(BCH_FS_NEED_ALLOC_WRITE, &c->flags);
-                       }
-               }
+               ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, &k);
        }
 
        bkey_for_each_ptr(ptrs, ptr) {
@@ -169,10 +302,10 @@ fsck_err:
 static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
                              bool initial)
 {
-       struct bpos next_node_start = b->data->min_key;
        struct btree_node_iter iter;
        struct bkey unpacked;
        struct bkey_s_c k;
+       struct bkey_buf prev, cur;
        int ret = 0;
 
        *max_stale = 0;
@@ -181,26 +314,32 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
                return 0;
 
        bch2_btree_node_iter_init_from_start(&iter, b);
+       bch2_bkey_buf_init(&prev);
+       bch2_bkey_buf_init(&cur);
+       bkey_init(&prev.k->k);
 
        while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
                bch2_bkey_debugcheck(c, b, k);
 
-               ret = bch2_gc_mark_key(c, k, max_stale, initial);
+               ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
+                                      k, max_stale, initial);
                if (ret)
                        break;
 
                bch2_btree_node_iter_advance(&iter, b);
 
                if (b->c.level) {
-                       ret = bch2_gc_check_topology(c, k,
-                                       &next_node_start,
-                                       b->data->max_key,
+                       bch2_bkey_buf_reassemble(&cur, c, k);
+
+                       ret = bch2_gc_check_topology(c, b, &prev, cur,
                                        bch2_btree_node_iter_end(&iter));
                        if (ret)
                                break;
                }
        }
 
+       bch2_bkey_buf_exit(&cur, c);
+       bch2_bkey_buf_exit(&prev, c);
        return ret;
 }
 
@@ -253,7 +392,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
        mutex_lock(&c->btree_root_lock);
        b = c->btree_roots[btree_id].b;
        if (!btree_node_fake(b))
-               ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
+               ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true,
+                                      bkey_i_to_s_c(&b->key),
                                       &max_stale, initial);
        gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
        mutex_unlock(&c->btree_root_lock);
@@ -262,18 +402,18 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
 }
 
 static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
-                                     struct journal_keys *journal_keys,
                                      unsigned target_depth)
 {
        struct btree_and_journal_iter iter;
        struct bkey_s_c k;
-       struct bpos next_node_start = b->data->min_key;
-       struct bkey_buf tmp;
+       struct bkey_buf cur, prev;
        u8 max_stale = 0;
        int ret = 0;
 
-       bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b);
-       bch2_bkey_buf_init(&tmp);
+       bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+       bch2_bkey_buf_init(&prev);
+       bch2_bkey_buf_init(&cur);
+       bkey_init(&prev.k->k);
 
        while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
                bch2_bkey_debugcheck(c, b, k);
@@ -281,50 +421,72 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
                BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0);
                BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0);
 
-               ret = bch2_gc_mark_key(c, k, &max_stale, true);
+               ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, false,
+                                      k, &max_stale, true);
                if (ret)
                        break;
 
                if (b->c.level) {
-                       struct btree *child;
-
-                       bch2_bkey_buf_reassemble(&tmp, c, k);
-                       k = bkey_i_to_s_c(tmp.k);
+                       bch2_bkey_buf_reassemble(&cur, c, k);
+                       k = bkey_i_to_s_c(cur.k);
 
                        bch2_btree_and_journal_iter_advance(&iter);
 
-                       ret = bch2_gc_check_topology(c, k,
-                                       &next_node_start,
-                                       b->data->max_key,
+                       ret = bch2_gc_check_topology(c, b,
+                                       &prev, cur,
                                        !bch2_btree_and_journal_iter_peek(&iter).k);
                        if (ret)
                                break;
+               } else {
+                       bch2_btree_and_journal_iter_advance(&iter);
+               }
+       }
 
-                       if (b->c.level > target_depth) {
-                               child = bch2_btree_node_get_noiter(c, tmp.k,
-                                                       b->c.btree_id, b->c.level - 1);
-                               ret = PTR_ERR_OR_ZERO(child);
-                               if (ret)
-                                       break;
+       if (b->c.level > target_depth) {
+               bch2_btree_and_journal_iter_exit(&iter);
+               bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+
+               while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+                       struct btree *child;
+
+                       bch2_bkey_buf_reassemble(&cur, c, k);
+                       bch2_btree_and_journal_iter_advance(&iter);
 
-                               ret = bch2_gc_btree_init_recurse(c, child,
-                                               journal_keys, target_depth);
-                               six_unlock_read(&child->c.lock);
+                       child = bch2_btree_node_get_noiter(c, cur.k,
+                                               b->c.btree_id, b->c.level - 1,
+                                               false);
+                       ret = PTR_ERR_OR_ZERO(child);
 
+                       if (fsck_err_on(ret == -EIO, c,
+                                       "unreadable btree node")) {
+                               ret = bch2_journal_key_delete(c, b->c.btree_id,
+                                                             b->c.level, cur.k->k.p);
                                if (ret)
-                                       break;
+                                       return ret;
+
+                               set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+                               continue;
                        }
-               } else {
-                       bch2_btree_and_journal_iter_advance(&iter);
+
+                       if (ret)
+                               break;
+
+                       ret = bch2_gc_btree_init_recurse(c, child,
+                                                        target_depth);
+                       six_unlock_read(&child->c.lock);
+
+                       if (ret)
+                               break;
                }
        }
-
-       bch2_bkey_buf_exit(&tmp, c);
+fsck_err:
+       bch2_bkey_buf_exit(&cur, c);
+       bch2_bkey_buf_exit(&prev, c);
+       bch2_btree_and_journal_iter_exit(&iter);
        return ret;
 }
 
 static int bch2_gc_btree_init(struct bch_fs *c,
-                             struct journal_keys *journal_keys,
                              enum btree_id btree_id)
 {
        struct btree *b;
@@ -355,11 +517,11 @@ static int bch2_gc_btree_init(struct bch_fs *c,
        }
 
        if (b->c.level >= target_depth)
-               ret = bch2_gc_btree_init_recurse(c, b,
-                                       journal_keys, target_depth);
+               ret = bch2_gc_btree_init_recurse(c, b, target_depth);
 
        if (!ret)
-               ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
+               ret = bch2_gc_mark_key(c, b->c.btree_id, b->c.level, true,
+                                      bkey_i_to_s_c(&b->key),
                                       &max_stale, true);
 fsck_err:
        six_unlock_read(&b->c.lock);
@@ -373,8 +535,7 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
                (int) btree_id_to_gc_phase(r);
 }
 
-static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
-                         bool initial)
+static int bch2_gc_btrees(struct bch_fs *c, bool initial)
 {
        enum btree_id ids[BTREE_ID_NR];
        unsigned i;
@@ -386,8 +547,7 @@ static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys,
        for (i = 0; i < BTREE_ID_NR; i++) {
                enum btree_id id = ids[i];
                int ret = initial
-                       ? bch2_gc_btree_init(c, journal_keys,
-                                            id)
+                       ? bch2_gc_btree_init(c, id)
                        : bch2_gc_btree(c, id, initial);
                if (ret)
                        return ret;
@@ -775,8 +935,7 @@ static int bch2_gc_start(struct bch_fs *c)
  *    move around - if references move backwards in the ordering GC
  *    uses, GC could skip past them
  */
-int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys,
-           bool initial)
+int bch2_gc(struct bch_fs *c, bool initial)
 {
        struct bch_dev *ca;
        u64 start_time = local_clock();
@@ -798,7 +957,7 @@ again:
 
        bch2_mark_superblocks(c);
 
-       ret = bch2_gc_btrees(c, journal_keys, initial);
+       ret = bch2_gc_btrees(c, initial);
        if (ret)
                goto out;
 
@@ -808,16 +967,15 @@ again:
        bch2_mark_allocator_buckets(c);
 
        c->gc_count++;
-out:
-       if (!ret &&
-           (test_bit(BCH_FS_FIXED_GENS, &c->flags) ||
-            (!iter && bch2_test_restart_gc))) {
+
+       if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
+           (!iter && bch2_test_restart_gc)) {
                /*
                 * XXX: make sure gens we fixed got saved
                 */
                if (iter++ <= 2) {
-                       bch_info(c, "Fixed gens, restarting mark and sweep:");
-                       clear_bit(BCH_FS_FIXED_GENS, &c->flags);
+                       bch_info(c, "Second GC pass needed, restarting:");
+                       clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
                        __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
 
                        percpu_down_write(&c->mark_lock);
@@ -832,7 +990,7 @@ out:
                bch_info(c, "Unable to fix bucket gens, looping");
                ret = -EINVAL;
        }
-
+out:
        if (!ret) {
                bch2_journal_block(&c->journal);
 
@@ -1371,7 +1529,7 @@ static int bch2_gc_thread(void *arg)
                 * Full gc is currently incompatible with btree key cache:
                 */
 #if 0
-               ret = bch2_gc(c, NULL, false, false);
+               ret = bch2_gc(c, false, false);
 #else
                ret = bch2_gc_gens(c);
 #endif
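
The renamed BCH_FS_NEED_ANOTHER_GC flag drives the restart logic in bch2_gc()
shown above; condensed to its shape (an illustrative paraphrase, not the
exact code):

    again:
            ret = bch2_gc_btrees(c, initial);

            if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) && iter++ <= 2) {
                    clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
                    __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
                    goto again;
            }

Repair actions such as fixed gens or dropped unreadable nodes can invalidate
marking work done earlier in the same pass, so the pass is redone from
scratch a bounded number of times before giving up with -EINVAL.
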
index f0435a58793be24be2cbfa43e1c012c14084918a..fa604efc70cc555d33a2615e18d3a737a241758d 100644 (file)
@@ -6,8 +6,7 @@
 
 void bch2_coalesce(struct bch_fs *);
 
-struct journal_keys;
-int bch2_gc(struct bch_fs *, struct journal_keys *, bool);
+int bch2_gc(struct bch_fs *, bool);
 int bch2_gc_gens(struct bch_fs *);
 void bch2_gc_thread_stop(struct bch_fs *);
 int bch2_gc_thread_start(struct bch_fs *);
index 65f7e36677b7da68ea84487d503ebe86f7bce38a..91e578b2d8c0daf3359082876015569421ad7564 100644 (file)
@@ -753,6 +753,11 @@ static int validate_bset(struct bch_fs *c, struct btree *b,
                        struct bch_btree_ptr_v2 *bp =
                                &bkey_i_to_btree_ptr_v2(&b->key)->v;
 
+                       if (BTREE_PTR_RANGE_UPDATED(bp)) {
+                               b->data->min_key = bp->min_key;
+                               b->data->max_key = b->key.k.p;
+                       }
+
                        btree_err_on(bkey_cmp(b->data->min_key, bp->min_key),
                                     BTREE_ERR_MUST_RETRY, c, b, NULL,
                                     "incorrect min_key: got %llu:%llu should be %llu:%llu",
index 5bb653298c6ca9b3b52a4e60cda4a276afe2c482..8919ea628138b04b8ea93b16fa0d9e0d11e6a890 100644 (file)
@@ -297,7 +297,6 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
                bp->v.mem_ptr           = 0;
                bp->v.seq               = b->data->keys.seq;
                bp->v.sectors_written   = 0;
-               bp->v.sectors           = cpu_to_le16(c->opts.btree_node_size);
        }
 
        if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))
index 9c7cc78849b940b6491500d7ed71b597a3a071c1..086897c3bdc363d72ae582daea396867d786276b 100644 (file)
@@ -744,7 +744,6 @@ err:
 static int ec_stripe_bkey_update(struct btree_trans *trans,
                                 struct bkey_i_stripe *new)
 {
-       struct bch_fs *c = trans->c;
        struct btree_iter *iter;
        struct bkey_s_c k;
        const struct bch_stripe *existing;
@@ -759,7 +758,7 @@ static int ec_stripe_bkey_update(struct btree_trans *trans,
                goto err;
 
        if (!k.k || k.k->type != KEY_TYPE_stripe) {
-               bch_err(c, "error updating stripe: not found");
+               bch_err(trans->c, "error updating stripe: not found");
                ret = -ENOENT;
                goto err;
        }
@@ -767,7 +766,7 @@ static int ec_stripe_bkey_update(struct btree_trans *trans,
        existing = bkey_s_c_to_stripe(k).v;
 
        if (existing->nr_blocks != new->v.nr_blocks) {
-               bch_err(c, "error updating stripe: nr_blocks does not match");
+               bch_err(trans->c, "error updating stripe: nr_blocks does not match");
                ret = -EINVAL;
                goto err;
        }
index c0ae31238b488c6e9589b18e3094b1d2f5907a27..67ba2c21627efd18a549c2df5efcf93f8d65bacb 100644 (file)
@@ -215,9 +215,8 @@ void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c,
 {
        struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
 
-       pr_buf(out, "seq %llx sectors %u written %u min_key ",
+       pr_buf(out, "seq %llx written %u min_key ",
               le64_to_cpu(bp.v->seq),
-              le16_to_cpu(bp.v->sectors),
               le16_to_cpu(bp.v->sectors_written));
 
        bch2_bpos_to_text(out, bp.v->min_key);
@@ -1082,10 +1081,9 @@ const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k)
        unsigned nonce = UINT_MAX;
        unsigned i;
 
-       if (k.k->type == KEY_TYPE_btree_ptr)
+       if (k.k->type == KEY_TYPE_btree_ptr ||
+           k.k->type == KEY_TYPE_btree_ptr_v2)
                size_ondisk = c->opts.btree_node_size;
-       if (k.k->type == KEY_TYPE_btree_ptr_v2)
-               size_ondisk = le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors);
 
        bkey_extent_entry_for_each(ptrs, entry) {
                if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX)
index d6273c8d7d0cbb34cbac3a6534fa02016b5bf4af..a7c5f5fddedb40c0e150acf8ed18862075f856c0 100644 (file)
@@ -1011,13 +1011,19 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq,
        }
 
        list_for_each_entry(i, journal_entries, list) {
+               unsigned ptr;
+
                seq = le64_to_cpu(i->j.seq);
                BUG_ON(seq >= cur_seq);
 
                if (seq < last_seq)
                        continue;
 
-               journal_seq_pin(j, seq)->devs = i->devs;
+               p = journal_seq_pin(j, seq);
+
+               p->devs.nr = 0;
+               for (ptr = 0; ptr < i->nr_ptrs; ptr++)
+                       bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
        }
 
        spin_lock(&j->lock);
index 750f6fab2e635e0bb7bdb2d1825ff395c06719de..f5264d1433f3a161f3092924f0bf9d0b31ba72e3 100644 (file)
@@ -46,15 +46,16 @@ struct journal_list {
  * be replayed:
  */
 static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
+                            struct bch_extent_ptr entry_ptr,
                             struct journal_list *jlist, struct jset *j,
                             bool bad)
 {
-       struct journal_replay *i, *pos;
-       struct bch_devs_list devs = { .nr = 0 };
+       struct journal_replay *i, *pos, *dup = NULL;
+       struct bch_extent_ptr *ptr;
        struct list_head *where;
        size_t bytes = vstruct_bytes(j);
        u64 last_seq = 0;
-       int ret;
+       int ret = JOURNAL_ENTRY_ADD_OK;
 
        list_for_each_entry_reverse(i, jlist->head, list) {
                if (!JSET_NO_FLUSH(&i->j)) {
@@ -88,28 +89,29 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
 
        where = jlist->head;
 add:
-       i = where->next != jlist->head
+       dup = where->next != jlist->head
                ? container_of(where->next, struct journal_replay, list)
                : NULL;
 
+       if (dup && le64_to_cpu(j->seq) != le64_to_cpu(dup->j.seq))
+               dup = NULL;
+
        /*
         * Duplicate journal entries? If so we want the one that didn't have a
         * checksum error:
         */
-       if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) {
-               if (i->bad) {
-                       devs = i->devs;
-                       __journal_replay_free(i);
+       if (dup) {
+               if (dup->bad) {
+                       /* we'll replace @dup: */
                } else if (bad) {
                        goto found;
                } else {
-                       fsck_err_on(bytes != vstruct_bytes(&i->j) ||
-                                   memcmp(j, &i->j, bytes), c,
+                       fsck_err_on(bytes != vstruct_bytes(&dup->j) ||
+                                   memcmp(j, &dup->j, bytes), c,
                                    "found duplicate but non identical journal entries (seq %llu)",
                                    le64_to_cpu(j->seq));
                        goto found;
                }
-
        }
 
        i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
@@ -118,17 +120,34 @@ add:
                goto out;
        }
 
-       list_add(&i->list, where);
-       i->devs = devs;
-       i->bad  = bad;
-       i->ignore = false;
+       i->nr_ptrs       = 0;
+       i->bad          = bad;
+       i->ignore       = false;
        memcpy(&i->j, j, bytes);
+
+       if (dup) {
+               i->nr_ptrs = dup->nr_ptrs;
+               memcpy(i->ptrs, dup->ptrs, sizeof(dup->ptrs));
+               __journal_replay_free(dup);
+       }
+
+       list_add(&i->list, where);
 found:
-       if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx))
-               bch2_dev_list_add_dev(&i->devs, ca->dev_idx);
-       else
-               fsck_err_on(1, c, "duplicate journal entries on same device");
-       ret = JOURNAL_ENTRY_ADD_OK;
+       for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
+               if (ptr->dev == ca->dev_idx) {
+                       bch_err(c, "duplicate journal entry %llu on same device",
+                               le64_to_cpu(i->j.seq));
+                       goto out;
+               }
+       }
+
+       if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) {
+               bch_err(c, "found too many copies of journal entry %llu",
+                       le64_to_cpu(i->j.seq));
+               goto out;
+       }
+
+       i->ptrs[i->nr_ptrs++] = entry_ptr;
 out:
 fsck_err:
        return ret;
@@ -654,7 +673,10 @@ reread:
                ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
 
                mutex_lock(&jlist->lock);
-               ret = journal_entry_add(c, ca, jlist, j, ret != 0);
+               ret = journal_entry_add(c, ca, (struct bch_extent_ptr) {
+                                       .dev = ca->dev_idx,
+                                       .offset = offset,
+                                       }, jlist, j, ret != 0);
                mutex_unlock(&jlist->lock);
 
                switch (ret) {
@@ -742,6 +764,23 @@ err:
        goto out;
 }
 
+static void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+                                     struct journal_replay *j)
+{
+       unsigned i;
+
+       for (i = 0; i < j->nr_ptrs; i++) {
+               struct bch_dev *ca = c->devs[j->ptrs[i].dev];
+
+               if (i)
+                       pr_buf(out, " ");
+               pr_buf(out, "%u:%llu (offset %llu)",
+                      j->ptrs[i].dev,
+                      (u64) j->ptrs[i].offset,
+                      (u64) j->ptrs[i].offset % ca->mi.bucket_size);
+       }
+}
+
 int bch2_journal_read(struct bch_fs *c, struct list_head *list,
                      u64 *blacklist_seq, u64 *start_seq)
 {
@@ -839,6 +878,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
 
                while (seq < le64_to_cpu(i->j.seq)) {
                        u64 missing_start, missing_end;
+                       char buf1[200], buf2[200];
 
                        while (seq < le64_to_cpu(i->j.seq) &&
                               bch2_journal_seq_is_blacklisted(c, seq, false))
@@ -853,10 +893,23 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
                               !bch2_journal_seq_is_blacklisted(c, seq, false))
                                seq++;
 
+                       if (i->list.prev != list) {
+                               struct printbuf out = PBUF(buf1);
+                               struct journal_replay *p = list_prev_entry(i, list);
+
+                               bch2_journal_ptrs_to_text(&out, c, p);
+                               pr_buf(&out, " size %llu", vstruct_sectors(&p->j, c->block_bits));
+                       } else
+                               sprintf(buf1, "(none)");
+                       bch2_journal_ptrs_to_text(&PBUF(buf2), c, i);
+
                        missing_end = seq - 1;
-                       fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)",
+                       fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n"
+                                "  prev at %s\n"
+                                "  next at %s",
                                 missing_start, missing_end,
-                                last_seq, *blacklist_seq - 1);
+                                last_seq, *blacklist_seq - 1,
+                                buf1, buf2);
                }
 
                seq++;
@@ -865,7 +918,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
        list_for_each_entry(i, list, list) {
                struct jset_entry *entry;
                struct bkey_i *k, *_n;
-               struct bch_replicas_padded replicas;
+               struct bch_replicas_padded replicas = {
+                       .e.data_type = BCH_DATA_journal,
+                       .e.nr_required = 1,
+               };
+               unsigned ptr;
                char buf[80];
 
                if (i->ignore)
@@ -875,13 +932,14 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list,
                if (ret)
                        goto fsck_err;
 
+               for (ptr = 0; ptr < i->nr_ptrs; ptr++)
+                       replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
+
                /*
                 * If we're mounting in degraded mode - if we didn't read all
                 * the devices - this is wrong:
                 */
 
-               bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, i->devs);
-
                if (!degraded &&
                    (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
                     fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
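
bch2_journal_ptrs_to_text() above prints each journal pointer's offset within
its bucket. A worked example with illustrative numbers:

    /* with a 512-sector bucket size, a journal pointer at sector 5130: */
    u64 offset           = 5130;
    u64 bucket_size      = 512;                    /* ca->mi.bucket_size */
    u64 offset_in_bucket = offset % bucket_size;   /* 5130 % 512 == 10 */

so such an entry would print as "0:5130 (offset 10)" for device 0.
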
index 6b4c80968f52064370c4d3b767db29cd3cb59bed..a4931ab93a68dc0a18415a141aee4094d4d87e3e 100644 (file)
@@ -8,7 +8,9 @@
  */
 struct journal_replay {
        struct list_head        list;
-       struct bch_devs_list    devs;
+       struct bch_extent_ptr   ptrs[BCH_REPLICAS_MAX];
+       unsigned                nr_ptrs;
+
        /* checksum error, but we may want to try using it anyways: */
        bool                    bad;
        bool                    ignore;
index 422f2fbe6dfb2d930054b7352f0d1e593c4a58bd..f470e0e233ce949c46480cb57242fae6d34074d3 100644 (file)
@@ -40,78 +40,169 @@ static void drop_alloc_keys(struct journal_keys *keys)
 
 /* iterate over keys read from the journal: */
 
-static struct journal_key *journal_key_search(struct journal_keys *journal_keys,
-                                             enum btree_id id, unsigned level,
-                                             struct bpos pos)
+static int __journal_key_cmp(enum btree_id     l_btree_id,
+                            unsigned           l_level,
+                            struct bpos        l_pos,
+                            struct journal_key *r)
+{
+       return (cmp_int(l_btree_id,     r->btree_id) ?:
+               cmp_int(l_level,        r->level) ?:
+               bkey_cmp(l_pos, r->k->k.p));
+}
+
+static int journal_key_cmp(struct journal_key *l, struct journal_key *r)
+{
+       return (cmp_int(l->btree_id,    r->btree_id) ?:
+               cmp_int(l->level,       r->level) ?:
+               bkey_cmp(l->k->k.p,     r->k->k.p));
+}
+
+static size_t journal_key_search(struct journal_keys *journal_keys,
+                                enum btree_id id, unsigned level,
+                                struct bpos pos)
 {
        size_t l = 0, r = journal_keys->nr, m;
 
        while (l < r) {
                m = l + ((r - l) >> 1);
-               if ((cmp_int(id,        journal_keys->d[m].btree_id) ?:
-                    cmp_int(level,     journal_keys->d[m].level) ?:
-                    bkey_cmp(pos,      journal_keys->d[m].k->k.p)) > 0)
+               if (__journal_key_cmp(id, level, pos, &journal_keys->d[m]) > 0)
                        l = m + 1;
                else
                        r = m;
        }
 
        BUG_ON(l < journal_keys->nr &&
-              (cmp_int(id,     journal_keys->d[l].btree_id) ?:
-               cmp_int(level,  journal_keys->d[l].level) ?:
-               bkey_cmp(pos,   journal_keys->d[l].k->k.p)) > 0);
+              __journal_key_cmp(id, level, pos, &journal_keys->d[l]) > 0);
 
        BUG_ON(l &&
-              (cmp_int(id,     journal_keys->d[l - 1].btree_id) ?:
-               cmp_int(level,  journal_keys->d[l - 1].level) ?:
-               bkey_cmp(pos,   journal_keys->d[l - 1].k->k.p)) <= 0);
+              __journal_key_cmp(id, level, pos, &journal_keys->d[l - 1]) <= 0);
 
-       return l < journal_keys->nr ? journal_keys->d + l : NULL;
+       return l;
+}
+
+static void journal_iter_fix(struct bch_fs *c, struct journal_iter *iter, unsigned idx)
+{
+       struct bkey_i *n = iter->keys->d[idx].k;
+       struct btree_and_journal_iter *biter =
+               container_of(iter, struct btree_and_journal_iter, journal);
+
+       if (iter->idx > idx ||
+           (iter->idx == idx &&
+            biter->last &&
+            bkey_cmp(n->k.p, biter->unpacked.p) <= 0))
+               iter->idx++;
+}
+
+int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
+                           unsigned level, struct bkey_i *k)
+{
+       struct journal_key n = {
+               .btree_id       = id,
+               .level          = level,
+               .k              = k,
+               .allocated      = true
+       };
+       struct journal_keys *keys = &c->journal_keys;
+       struct journal_iter *iter;
+       unsigned idx = journal_key_search(keys, id, level, k->k.p);
+
+       if (idx < keys->nr &&
+           journal_key_cmp(&n, &keys->d[idx]) == 0) {
+               if (keys->d[idx].allocated)
+                       kfree(keys->d[idx].k);
+               keys->d[idx] = n;
+               return 0;
+       }
+
+       if (keys->nr == keys->size) {
+               struct journal_keys new_keys = {
+                       .nr                     = keys->nr,
+                       .size                   = keys->size * 2,
+                       .journal_seq_base       = keys->journal_seq_base,
+               };
+
+               new_keys.d = kvmalloc(sizeof(new_keys.d[0]) * new_keys.size, GFP_KERNEL);
+               if (!new_keys.d)
+                       return -ENOMEM;
+
+               memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
+               kvfree(keys->d);
+               *keys = new_keys;
+       }
+
+       array_insert_item(keys->d, keys->nr, idx, n);
+
+       list_for_each_entry(iter, &c->journal_iters, list)
+               journal_iter_fix(c, iter, idx);
+
+       return 0;
+}
+
+int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
+                           unsigned level, struct bpos pos)
+{
+       struct bkey_i *whiteout =
+               kmalloc(sizeof(struct bkey), GFP_KERNEL);
+       int ret;
+
+       if (!whiteout)
+               return -ENOMEM;
+
+       bkey_init(&whiteout->k);
+       whiteout->k.p = pos;
+
+       ret = bch2_journal_key_insert(c, id, level, whiteout);
+       if (ret)
+               kfree(whiteout);
+       return ret;
 }
 
 static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
 {
-       if (iter->k &&
-           iter->k < iter->keys->d + iter->keys->nr &&
-           iter->k->btree_id   == iter->btree_id &&
-           iter->k->level      == iter->level)
-               return iter->k->k;
+       struct journal_key *k = iter->idx - iter->keys->nr
+               ? iter->keys->d + iter->idx : NULL;
+
+       if (k &&
+           k->btree_id == iter->btree_id &&
+           k->level    == iter->level)
+               return k->k;
 
-       iter->k = NULL;
+       iter->idx = iter->keys->nr;
        return NULL;
 }
 
 static void bch2_journal_iter_advance(struct journal_iter *iter)
 {
-       if (iter->k)
-               iter->k++;
+       if (iter->idx < iter->keys->nr)
+               iter->idx++;
+}
+
+static void bch2_journal_iter_exit(struct journal_iter *iter)
+{
+       list_del(&iter->list);
 }
 
-static void bch2_journal_iter_init(struct journal_iter *iter,
-                                  struct journal_keys *journal_keys,
+static void bch2_journal_iter_init(struct bch_fs *c,
+                                  struct journal_iter *iter,
                                   enum btree_id id, unsigned level,
                                   struct bpos pos)
 {
        iter->btree_id  = id;
        iter->level     = level;
-       iter->keys      = journal_keys;
-       iter->k         = journal_key_search(journal_keys, id, level, pos);
+       iter->keys      = &c->journal_keys;
+       iter->idx       = journal_key_search(&c->journal_keys, id, level, pos);
+       list_add(&iter->list, &c->journal_iters);
 }
 
 static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
 {
-       return iter->btree
-               ? bch2_btree_iter_peek(iter->btree)
-               : bch2_btree_node_iter_peek_unpack(&iter->node_iter,
-                                                  iter->b, &iter->unpacked);
+       return bch2_btree_node_iter_peek_unpack(&iter->node_iter,
+                                               iter->b, &iter->unpacked);
 }
 
 static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter)
 {
-       if (iter->btree)
-               bch2_btree_iter_next(iter->btree);
-       else
-               bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
+       bch2_btree_node_iter_advance(&iter->node_iter, iter->b);
 }
 
 void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
@@ -160,7 +251,7 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
 
                if (iter->b &&
                    bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) {
-                       iter->journal.k = NULL;
+                       iter->journal.idx = iter->journal.keys->nr;
                        iter->last = none;
                        return bkey_s_c_null;
                }
@@ -181,26 +272,20 @@ struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *
        return bch2_btree_and_journal_iter_peek(iter);
 }
 
-void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter,
-                                     struct btree_trans *trans,
-                                     struct journal_keys *journal_keys,
-                                     enum btree_id id, struct bpos pos)
+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
 {
-       memset(iter, 0, sizeof(*iter));
-
-       iter->btree = bch2_trans_get_iter(trans, id, pos, BTREE_ITER_PREFETCH);
-       bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos);
+       bch2_journal_iter_exit(&iter->journal);
 }
 
 void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
-                                               struct journal_keys *journal_keys,
+                                               struct bch_fs *c,
                                                struct btree *b)
 {
        memset(iter, 0, sizeof(*iter));
 
        iter->b = b;
        bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b);
-       bch2_journal_iter_init(&iter->journal, journal_keys,
+       bch2_journal_iter_init(c, &iter->journal,
                               b->c.btree_id, b->c.level, b->data->min_key);
 }
 
@@ -244,7 +329,7 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b
        int ret = 0;
 
        bch2_bkey_buf_init(&tmp);
-       bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b);
+       bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
 
        while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
                ret = key_fn(c, btree_id, b->c.level, k);
@@ -257,7 +342,8 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b
                        bch2_btree_and_journal_iter_advance(&iter);
 
                        child = bch2_btree_node_get_noiter(c, tmp.k,
-                                               b->c.btree_id, b->c.level - 1);
+                                               b->c.btree_id, b->c.level - 1,
+                                               false);
 
                        ret = PTR_ERR_OR_ZERO(child);
                        if (ret)
@@ -277,6 +363,7 @@ static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b
                }
        }
 
+       bch2_btree_and_journal_iter_exit(&iter);
        bch2_bkey_buf_exit(&tmp, c);
        return ret;
 }
@@ -333,6 +420,12 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
 
 void bch2_journal_keys_free(struct journal_keys *keys)
 {
+       struct journal_key *i;
+
+       for (i = keys->d; i < keys->d + keys->nr; i++)
+               if (i->allocated)
+                       kfree(i->k);
+
        kvfree(keys->d);
        keys->d = NULL;
        keys->nr = 0;
@@ -361,7 +454,9 @@ static struct journal_keys journal_keys_sort(struct list_head *journal_entries)
                        nr_keys++;
        }
 
-       keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL);
+       keys.size = roundup_pow_of_two(nr_keys);
+
+       keys.d = kvmalloc(sizeof(keys.d[0]) * keys.size, GFP_KERNEL);
        if (!keys.d)
                goto err;
 
@@ -545,14 +640,16 @@ static int __bch2_journal_replay_key(struct btree_trans *trans,
        return ret;
 }
 
-static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id,
-                                  unsigned level, struct bkey_i *k)
+static int bch2_journal_replay_key(struct bch_fs *c, struct journal_key *k)
 {
-       return bch2_trans_do(c, NULL, NULL,
-                            BTREE_INSERT_NOFAIL|
-                            BTREE_INSERT_LAZY_RW|
-                            BTREE_INSERT_JOURNAL_REPLAY,
-                            __bch2_journal_replay_key(&trans, id, level, k));
+       unsigned commit_flags = BTREE_INSERT_NOFAIL|
+               BTREE_INSERT_LAZY_RW;
+
+       if (!k->allocated)
+               commit_flags |= BTREE_INSERT_JOURNAL_REPLAY;
+
+       return bch2_trans_do(c, NULL, NULL, commit_flags,
+                            __bch2_journal_replay_key(&trans, k->btree_id, k->level, k->k));
 }
 
 static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k)
@@ -628,7 +725,7 @@ static int bch2_journal_replay(struct bch_fs *c,
 
                if (i->level) {
                        j->replay_journal_seq = keys.journal_seq_base + i->journal_seq;
-                       ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
+                       ret = bch2_journal_replay_key(c, i);
                        if (ret)
                                goto err;
                }
@@ -658,7 +755,7 @@ static int bch2_journal_replay(struct bch_fs *c,
 
                ret = i->k->k.size
                        ? bch2_extent_replay_key(c, i->btree_id, i->k)
-                       : bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
+                       : bch2_journal_replay_key(c, i);
                if (ret)
                        goto err;
        }
@@ -670,7 +767,8 @@ static int bch2_journal_replay(struct bch_fs *c,
        bch2_journal_flush_all_pins(j);
        return bch2_journal_error(j);
 err:
-       bch_err(c, "journal replay: error %d while replaying key", ret);
+       bch_err(c, "journal replay: error %d while replaying key at btree %s level %u",
+               ret, bch2_btree_ids[i->btree_id], i->level);
        return ret;
 }
 
@@ -1105,7 +1203,7 @@ use_clean:
            test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) {
                bch_info(c, "starting mark and sweep");
                err = "error in mark and sweep";
-               ret = bch2_gc(c, &c->journal_keys, true);
+               ret = bch2_gc(c, true);
                if (ret)
                        goto err;
                bch_verbose(c, "mark and sweep done");
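
journal_key_search() above is a standard lower-bound binary search over the
sorted key array; the same pattern as a self-contained sketch over a plain
sorted array:

    /* first index l in d[0..nr) with d[l] >= key, or nr if none */
    static size_t lower_bound(const int *d, size_t nr, int key)
    {
            size_t l = 0, r = nr, m;

            while (l < r) {
                    m = l + ((r - l) >> 1);
                    if (d[m] < key)         /* cmp(key, d[m]) > 0 */
                            l = m + 1;
                    else
                            r = m;
            }
            return l;
    }

Returning an index rather than a pointer (the change from the old version) is
what lets bch2_journal_key_insert() splice new keys into the array and then
fix up any live iterators by index.
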
index a66827c9addf71a4b3eaeb08a0151e0d0c5cd9c2..fa91851b9ed7a2e890cb498b9012fe451c327813 100644 (file)
@@ -6,10 +6,11 @@
        for (i = (keys).d; i < (keys).d + (keys).nr; (i)++)
 
 struct journal_iter {
+       struct list_head        list;
        enum btree_id           btree_id;
        unsigned                level;
+       size_t                  idx;
        struct journal_keys     *keys;
-       struct journal_key      *k;
 };
 
 /*
@@ -17,8 +18,6 @@ struct journal_iter {
  */
 
 struct btree_and_journal_iter {
-       struct btree_iter       *btree;
-
        struct btree            *b;
        struct btree_node_iter  node_iter;
        struct bkey             unpacked;
@@ -32,16 +31,18 @@ struct btree_and_journal_iter {
        }                       last;
 };
 
+int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
+                           unsigned, struct bkey_i *);
+int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
+                           unsigned, struct bpos);
+
 void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
 struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
 struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *);
 
-void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *,
-                                     struct btree_trans *,
-                                     struct journal_keys *,
-                                     enum btree_id, struct bpos);
+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
 void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
-                                               struct journal_keys *,
+                                               struct bch_fs *,
                                                struct btree *);
 
 typedef int (*btree_walk_node_fn)(struct bch_fs *c, struct btree *b);
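
Illustrative use of the two new entry points from repair code (arguments as
in the btree_gc.c callers above):

    /* overwrite a btree node's key in the in-memory journal, pre-replay: */
    ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level, new);

    /* or whiteout a key whose node turned out to be unreadable: */
    ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur.k->k.p);
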
index 78835bd2d6bc4e0c95c96f68d9d318eef47652e6..751efd28b672bd3fceedfade1b4d5af95ab1392c 100644 (file)
@@ -276,19 +276,19 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb)
                return "Bad number of member devices";
 
        if (!BCH_SB_META_REPLICAS_WANT(sb) ||
-           BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
+           BCH_SB_META_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX)
                return "Invalid number of metadata replicas";
 
        if (!BCH_SB_META_REPLICAS_REQ(sb) ||
-           BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
+           BCH_SB_META_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX)
                return "Invalid number of metadata replicas";
 
        if (!BCH_SB_DATA_REPLICAS_WANT(sb) ||
-           BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX)
+           BCH_SB_DATA_REPLICAS_WANT(sb) > BCH_REPLICAS_MAX)
                return "Invalid number of data replicas";
 
        if (!BCH_SB_DATA_REPLICAS_REQ(sb) ||
-           BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX)
+           BCH_SB_DATA_REPLICAS_REQ(sb) > BCH_REPLICAS_MAX)
                return "Invalid number of data replicas";
 
        if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR)
index 9f7a6f1f8290738a25edaa66caf705d0e0ebabd7..f3c12d89df58bf6a5739249a8e07cac575c3d2c7 100644 (file)
@@ -684,6 +684,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
                  bch2_blacklist_entries_gc);
 
        INIT_LIST_HEAD(&c->journal_entries);
+       INIT_LIST_HEAD(&c->journal_iters);
 
        INIT_LIST_HEAD(&c->fsck_errors);
        mutex_init(&c->fsck_error_lock);
index 4fc5777ecfb09d8959db36a0fa3c60719b2ddd13..80964bdf6237432af7fe3f91c000720d607f5950 100644 (file)
@@ -475,7 +475,7 @@ STORE(bch2_fs)
                 */
 #if 0
                down_read(&c->state_lock);
-               bch2_gc(c, NULL, false, false);
+               bch2_gc(c, false, false);
                up_read(&c->state_lock);
 #else
                bch2_gc_gens(c);