Update bcachefs sources to 8d3093bd9b bcachefs: Evict btree nodes we're deleting
author    Kent Overstreet <kent.overstreet@gmail.com>
          Sat, 24 Apr 2021 20:33:06 +0000 (16:33 -0400)
committer Kent Overstreet <kent.overstreet@gmail.com>
          Tue, 27 Apr 2021 00:18:47 +0000 (20:18 -0400)
19 files changed:
.bcachefs_revision
include/linux/bsearch.h [new file with mode: 0644]
include/linux/types.h
libbcachefs/bcachefs.h
libbcachefs/bcachefs_format.h
libbcachefs/btree_cache.c
libbcachefs/btree_cache.h
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_io.h
libbcachefs/btree_key_cache.c
libbcachefs/btree_update.h
libbcachefs/btree_update_interior.c
libbcachefs/error.c
libbcachefs/error.h
libbcachefs/fsck.c
libbcachefs/recovery.c
libbcachefs/super-io.c
libbcachefs/super.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index a0a4a3431fa03ab19d9deb3770133187ca805a47..d1024536aa74e22a37fdb81c6484008f59167ac0 100644
@@ -1 +1 @@
-e6fa8eaa1b374fc6262bd088ad1f140f4c5a8b11
+8d3093bd9b9254957badce4a4ff178baeb3632ed
diff --git a/include/linux/bsearch.h b/include/linux/bsearch.h
new file mode 100644
index 0000000..e66b711
--- /dev/null
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_BSEARCH_H
+#define _LINUX_BSEARCH_H
+
+#include <linux/types.h>
+
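+/*
+ * Binary search of a sorted array: cmp() must match the order the array was
+ * sorted in. Returns a pointer to a matching element, or NULL if none is
+ * found.
+ */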
+static __always_inline
+void *__inline_bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp)
+{
+       const char *pivot;
+       int result;
+
+       while (num > 0) {
+               pivot = base + (num >> 1) * size;
+               result = cmp(key, pivot);
+
+               if (result == 0)
+                       return (void *)pivot;
+
+               if (result > 0) {
+                       base = pivot + size;
+                       num--;
+               }
+               num >>= 1;
+       }
+
+       return NULL;
+}
+
+extern void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp);
+
+#endif /* _LINUX_BSEARCH_H */
diff --git a/include/linux/types.h b/include/linux/types.h
index c9886cbaadcc891f6f1dcdd3369ca415b1cbeaf7..77f967377757530f64238075483a4e6e6a3df365 100644
@@ -76,4 +76,6 @@ typedef __u64 __bitwise __be64;
 
 typedef u64 sector_t;
 
+typedef int (*cmp_func_t)(const void *a, const void *b);
+
 #endif /* _TOOLS_LINUX_TYPES_H_ */
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index ce058d55eb348c1a2324f971a1bb8d5eef048393..f2d2c7bbc29be54be0f88b7dcd229e0cb89331f7 100644
@@ -485,10 +485,12 @@ enum {
        BCH_FS_ALLOCATOR_RUNNING,
        BCH_FS_ALLOCATOR_STOPPING,
        BCH_FS_INITIAL_GC_DONE,
+       BCH_FS_INITIAL_GC_UNFIXED,
        BCH_FS_BTREE_INTERIOR_REPLAY_DONE,
        BCH_FS_FSCK_DONE,
        BCH_FS_STARTED,
        BCH_FS_RW,
+       BCH_FS_WAS_RW,
 
        /* shutdown: */
        BCH_FS_STOPPING,
@@ -497,7 +499,9 @@ enum {
 
        /* errors: */
        BCH_FS_ERROR,
+       BCH_FS_TOPOLOGY_ERROR,
        BCH_FS_ERRORS_FIXED,
+       BCH_FS_ERRORS_NOT_FIXED,
 
        /* misc: */
        BCH_FS_NEED_ANOTHER_GC,
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index ead7268bf8984052d0c2935100286980ee1cbcbb..d640a3115adc0cda2d7d68077031bedfcc0abc4d 100644
@@ -1313,12 +1313,10 @@ LE64_BITMASK(BCH_SB_GRPQUOTA,           struct bch_sb, flags[0], 58, 59);
 LE64_BITMASK(BCH_SB_PRJQUOTA,          struct bch_sb, flags[0], 59, 60);
 
 LE64_BITMASK(BCH_SB_HAS_ERRORS,                struct bch_sb, flags[0], 60, 61);
+LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62);
 
-/* bit 61 was reflink option */
 LE64_BITMASK(BCH_SB_BIG_ENDIAN,                struct bch_sb, flags[0], 62, 63);
 
-/* 61-64 unused */
-
 LE64_BITMASK(BCH_SB_STR_HASH_TYPE,     struct bch_sb, flags[1],  0,  4);
 LE64_BITMASK(BCH_SB_COMPRESSION_TYPE,  struct bch_sb, flags[1],  4,  8);
 LE64_BITMASK(BCH_SB_INODE_32BIT,       struct bch_sb, flags[1],  8,  9);
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index edc3c5edb62ba3e3cb9b7514298934030924f76b..f8692f792dd40e97afb08b321ffc509ac3b6f649 100644
@@ -958,6 +958,36 @@ void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter,
        bch2_btree_node_fill(c, iter, k, btree_id, level, SIX_LOCK_read, false);
 }
 
+void bch2_btree_node_evict(struct bch_fs *c, const struct bkey_i *k)
+{
+       struct btree_cache *bc = &c->btree_cache;
+       struct btree *b;
+
+       b = btree_cache_find(bc, k);
+       if (!b)
+               return;
+
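+       /*
+        * Take intent + write locks, so no one else can use this node while
+        * we're tearing it down:
+        */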
+       six_lock_intent(&b->c.lock, NULL, NULL);
+       six_lock_write(&b->c.lock, NULL, NULL);
+
+       wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight,
+                      TASK_UNINTERRUPTIBLE);
+       __bch2_btree_node_write(c, b);
+
+       /* wait for any in flight btree write */
+       btree_node_wait_on_io(b);
+
+       BUG_ON(btree_node_dirty(b));
+
+       mutex_lock(&bc->lock);
+       btree_node_data_free(c, b);
+       bch2_btree_node_hash_remove(bc, b);
+       mutex_unlock(&bc->lock);
+
+       six_unlock_write(&b->c.lock);
+       six_unlock_intent(&b->c.lock);
+}
+
 void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c,
                             struct btree *b)
 {
diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h
index c517cc02945405f5bc1a77a31f7292f504833cfd..40dd263a7caa7b043ab75792cb30a94660399572 100644
@@ -30,6 +30,8 @@ struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *,
 void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *,
                              const struct bkey_i *, enum btree_id, unsigned);
 
+void bch2_btree_node_evict(struct bch_fs *, const struct bkey_i *);
+
 void bch2_fs_btree_cache_exit(struct bch_fs *);
 int bch2_fs_btree_cache_init(struct bch_fs *);
 void bch2_fs_btree_cache_init_early(struct btree_cache *);
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 864931eaf616369b1e0e3e2a19a37f83bdc7678b..24fa279d1cdbcb146c6b7151fd131fc376826d64 100644
@@ -66,8 +66,6 @@ static int bch2_gc_check_topology(struct bch_fs *c,
                ? node_start
                : bpos_successor(prev->k->k.p);
        char buf1[200], buf2[200];
-       bool update_min = false;
-       bool update_max = false;
        int ret = 0;
 
        if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
@@ -81,83 +79,340 @@ static int bch2_gc_check_topology(struct bch_fs *c,
                        bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev->k));
                }
 
-               if (fsck_err_on(bpos_cmp(expected_start, bp->v.min_key), c,
-                               "btree node with incorrect min_key at btree %s level %u:\n"
-                               "  prev %s\n"
-                               "  cur %s",
-                               bch2_btree_ids[b->c.btree_id], b->c.level,
-                               buf1,
-                               (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2)))
-                       update_min = true;
+               if (bpos_cmp(expected_start, bp->v.min_key)) {
+                       bch2_topology_error(c);
+
+                       if (fsck_err(c, "btree node with incorrect min_key at btree %s level %u:\n"
+                                    "  prev %s\n"
+                                    "  cur %s",
+                                    bch2_btree_ids[b->c.btree_id], b->c.level,
+                                    buf1,
+                                    (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(cur.k)), buf2))) {
+                               bch_info(c, "Halting mark and sweep to start topology repair pass");
+                               return FSCK_ERR_START_TOPOLOGY_REPAIR;
+                       } else {
+                               set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
+                       }
+               }
        }
 
-       if (fsck_err_on(is_last &&
-                       bpos_cmp(cur.k->k.p, node_end), c,
+       if (is_last && bpos_cmp(cur.k->k.p, node_end)) {
+               bch2_topology_error(c);
+
+               if (fsck_err(c, "btree node with incorrect max_key at btree %s level %u:\n"
+                            "  %s\n"
+                            "  expected %s",
+                            bch2_btree_ids[b->c.btree_id], b->c.level,
+                            (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1),
+                            (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2))) {
+                       bch_info(c, "Halting mark and sweep to start topology repair pass");
+                       return FSCK_ERR_START_TOPOLOGY_REPAIR;
+               } else {
+                       set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
+               }
+       }
+
+       bch2_bkey_buf_copy(prev, c, cur.k);
+fsck_err:
+       return ret;
+}
+
+static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
+{
+       switch (b->key.k.type) {
+       case KEY_TYPE_btree_ptr: {
+               struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key);
+
+               dst->k.p                = src->k.p;
+               dst->v.mem_ptr          = 0;
+               dst->v.seq              = b->data->keys.seq;
+               dst->v.sectors_written  = 0;
+               dst->v.flags            = 0;
+               dst->v.min_key          = b->data->min_key;
+               set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k));
+               memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k));
+               break;
+       }
+       case KEY_TYPE_btree_ptr_v2:
+               bkey_copy(&dst->k_i, &b->key);
+               break;
+       default:
+               BUG();
+       }
+}
+
+static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
+{
+       struct bkey_i_btree_ptr_v2 *new;
+       int ret;
+
+       new = kmalloc(BKEY_BTREE_PTR_U64s_MAX * sizeof(u64), GFP_KERNEL);
+       if (!new)
+               return -ENOMEM;
+
+       btree_ptr_to_v2(b, new);
+       b->data->min_key        = new_min;
+       new->v.min_key          = new_min;
+       SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
+
+       ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i);
+       if (ret) {
+               kfree(new);
+               return ret;
+       }
+
+       bch2_btree_node_drop_keys_outside_node(b);
+
+       return 0;
+}
+
+static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
+{
+       struct bkey_i_btree_ptr_v2 *new;
+       int ret;
+
+       ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
+       if (ret)
+               return ret;
+
+       new = kmalloc(BKEY_BTREE_PTR_U64s_MAX * sizeof(u64), GFP_KERNEL);
+       if (!new)
+               return -ENOMEM;
+
+       btree_ptr_to_v2(b, new);
+       b->data->max_key        = new_max;
+       new->k.p                = new_max;
+       SET_BTREE_PTR_RANGE_UPDATED(&new->v, true);
+
+       ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level + 1, &new->k_i);
+       if (ret) {
+               kfree(new);
+               return ret;
+       }
+
+       bch2_btree_node_drop_keys_outside_node(b);
+
+       mutex_lock(&c->btree_cache.lock);
+       bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+       bkey_copy(&b->key, &new->k_i);
+       ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
+       BUG_ON(ret);
+       mutex_unlock(&c->btree_cache.lock);
+       return 0;
+}
+
+static int btree_repair_node_start(struct bch_fs *c, struct btree *b,
+                                  struct btree *prev, struct btree *cur)
+{
+       struct bpos expected_start = !prev
+               ? b->data->min_key
+               : bpos_successor(prev->key.k.p);
+       char buf1[200], buf2[200];
+       int ret = 0;
+
+       if (!prev) {
+               struct printbuf out = PBUF(buf1);
+               pr_buf(&out, "start of node: ");
+               bch2_bpos_to_text(&out, b->data->min_key);
+       } else {
+               bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&prev->key));
+       }
+
+       if (mustfix_fsck_err_on(bpos_cmp(expected_start, cur->data->min_key), c,
+                       "btree node with incorrect min_key at btree %s level %u:\n"
+                       "  prev %s\n"
+                       "  cur %s",
+                       bch2_btree_ids[b->c.btree_id], b->c.level,
+                       buf1,
+                       (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(&cur->key)), buf2))) {
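+               /*
+                * If the nodes overlap and cur was written more recently than
+                * prev, trust cur's min_key and shrink prev instead; otherwise
+                * repair cur's min_key:
+                */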
+               if (prev &&
+                   bpos_cmp(expected_start, cur->data->min_key) > 0 &&
+                   BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data))
+                       ret = set_node_max(c, prev,
+                               bpos_predecessor(cur->data->min_key));
+               else
+                       ret = set_node_min(c, cur, expected_start);
+               if (ret)
+                       return ret;
+       }
+fsck_err:
+       return ret;
+}
+
+static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
+                                struct btree *child)
+{
+       char buf1[200], buf2[200];
+       int ret = 0;
+
+       if (mustfix_fsck_err_on(bpos_cmp(child->key.k.p, b->key.k.p), c,
                        "btree node with incorrect max_key at btree %s level %u:\n"
                        "  %s\n"
                        "  expected %s",
                        bch2_btree_ids[b->c.btree_id], b->c.level,
-                       (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(cur.k)), buf1),
-                       (bch2_bpos_to_text(&PBUF(buf2), node_end), buf2)))
-               update_max = true;
+                       (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(&child->key)), buf1),
+                       (bch2_bpos_to_text(&PBUF(buf2), b->key.k.p), buf2))) {
+               ret = set_node_max(c, child, b->key.k.p);
+               if (ret)
+                       return ret;
+       }
+fsck_err:
+       return ret;
+}
 
-       bch2_bkey_buf_copy(prev, c, cur.k);
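+/* returned when repair finds an interior node with no live children: */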
+#define DROP_THIS_NODE         10
 
-       if (update_min || update_max) {
-               struct bkey_i *new;
-               struct bkey_i_btree_ptr_v2 *bp = NULL;
-               struct btree *n;
+static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b)
+{
+       struct btree_and_journal_iter iter;
+       struct bkey_s_c k;
+       struct bkey_buf tmp;
+       struct btree *prev = NULL, *cur = NULL;
+       bool have_child, dropped_children = false;
+       char buf[200];
+       int ret = 0;
+
+       if (!b->c.level)
+               return 0;
+again:
+       have_child = dropped_children = false;
+       bch2_bkey_buf_init(&tmp);
+       bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
 
-               if (update_max) {
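+       /* First pass: repair the min/max keys of this node's children: */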
+       while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+               bch2_btree_and_journal_iter_advance(&iter);
+               bch2_bkey_buf_reassemble(&tmp, c, k);
+
+               cur = bch2_btree_node_get_noiter(c, tmp.k,
+                                       b->c.btree_id, b->c.level - 1,
+                                       false);
+               ret = PTR_ERR_OR_ZERO(cur);
+
+               if (mustfix_fsck_err_on(ret == -EIO, c,
+                               "Unreadable btree node at btree %s level %u:\n"
+                               "  %s",
+                               bch2_btree_ids[b->c.btree_id],
+                               b->c.level - 1,
+                               (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(tmp.k)), buf))) {
+                       bch2_btree_node_evict(c, tmp.k);
                        ret = bch2_journal_key_delete(c, b->c.btree_id,
-                                                     b->c.level, cur.k->k.p);
+                                                     b->c.level, tmp.k->k.p);
                        if (ret)
-                               return ret;
+                               goto err;
+                       continue;
                }
 
-               new = kmalloc(bkey_bytes(&cur.k->k), GFP_KERNEL);
-               if (!new) {
-                       bch_err(c, "%s: error allocating new key", __func__);
-                       return -ENOMEM;
+               if (ret) {
+                       bch_err(c, "%s: error %i getting btree node",
+                               __func__, ret);
+                       break;
                }
 
-               bkey_copy(new, cur.k);
+               ret = btree_repair_node_start(c, b, prev, cur);
+               if (prev)
+                       six_unlock_read(&prev->c.lock);
+               prev = cur;
+               cur = NULL;
+
+               if (ret)
+                       break;
+       }
+
+       if (!ret && !IS_ERR_OR_NULL(prev)) {
+               BUG_ON(cur);
+               ret = btree_repair_node_end(c, b, prev);
+       }
 
-               if (new->k.type == KEY_TYPE_btree_ptr_v2)
-                       bp = bkey_i_to_btree_ptr_v2(new);
+       if (!IS_ERR_OR_NULL(prev))
+               six_unlock_read(&prev->c.lock);
+       prev = NULL;
+       if (!IS_ERR_OR_NULL(cur))
+               six_unlock_read(&cur->c.lock);
+       cur = NULL;
 
-               if (update_min)
-                       bp->v.min_key = expected_start;
-               if (update_max)
-                       new->k.p = node_end;
-               if (bp)
-                       SET_BTREE_PTR_RANGE_UPDATED(&bp->v, true);
+       if (ret)
+               goto err;
+
+       bch2_btree_and_journal_iter_exit(&iter);
+       bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+
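+       /* Second pass: recurse into the (now consistent) children: */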
+       while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
+               bch2_bkey_buf_reassemble(&tmp, c, k);
+               bch2_btree_and_journal_iter_advance(&iter);
+
+               cur = bch2_btree_node_get_noiter(c, tmp.k,
+                                       b->c.btree_id, b->c.level - 1,
+                                       false);
+               ret = PTR_ERR_OR_ZERO(cur);
 
-               ret = bch2_journal_key_insert(c, b->c.btree_id, b->c.level, new);
                if (ret) {
-                       kfree(new);
-                       return ret;
+                       bch_err(c, "%s: error %i getting btree node",
+                               __func__, ret);
+                       goto err;
                }
 
-               n = bch2_btree_node_get_noiter(c, cur.k, b->c.btree_id,
-                                              b->c.level - 1, true);
-               if (n) {
-                       mutex_lock(&c->btree_cache.lock);
-                       bch2_btree_node_hash_remove(&c->btree_cache, n);
-
-                       bkey_copy(&n->key, new);
-                       if (update_min)
-                               n->data->min_key = expected_start;
-                       if (update_max)
-                               n->data->max_key = node_end;
-
-                       ret = __bch2_btree_node_hash_insert(&c->btree_cache, n);
-                       BUG_ON(ret);
-                       mutex_unlock(&c->btree_cache.lock);
-                       six_unlock_read(&n->c.lock);
+               ret = bch2_btree_repair_topology_recurse(c, cur);
+               six_unlock_read(&cur->c.lock);
+               cur = NULL;
+
+               if (ret == DROP_THIS_NODE) {
+                       bch2_btree_node_evict(c, tmp.k);
+                       ret = bch2_journal_key_delete(c, b->c.btree_id,
+                                                     b->c.level, tmp.k->k.p);
+                       dropped_children = true;
                }
+
+               if (ret)
+                       goto err;
+
+               have_child = true;
        }
+
+       if (mustfix_fsck_err_on(!have_child, c,
+                       "empty interior btree node at btree %s level %u\n"
+                       "  %s",
+                       bch2_btree_ids[b->c.btree_id],
+                       b->c.level,
+                       (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key)), buf)))
+               ret = DROP_THIS_NODE;
+err:
 fsck_err:
+       if (!IS_ERR_OR_NULL(prev))
+               six_unlock_read(&prev->c.lock);
+       if (!IS_ERR_OR_NULL(cur))
+               six_unlock_read(&cur->c.lock);
+
+       bch2_btree_and_journal_iter_exit(&iter);
+       bch2_bkey_buf_exit(&tmp, c);
+
+       if (!ret && dropped_children)
+               goto again;
+
+       return ret;
+}
+
+static int bch2_repair_topology(struct bch_fs *c)
+{
+       struct btree *b;
+       unsigned i;
+       int ret = 0;
+
+       for (i = 0; i < BTREE_ID_NR && !ret; i++) {
+               b = c->btree_roots[i].b;
+               if (btree_node_fake(b))
+                       continue;
+
+               six_lock_read(&b->c.lock, NULL, NULL);
+               ret = bch2_btree_repair_topology_recurse(c, b);
+               six_unlock_read(&b->c.lock);
+
+               if (ret == DROP_THIS_NODE) {
+                       bch_err(c, "empty btree root - repair unimplemented");
+                       ret = FSCK_ERR_EXIT;
+               }
+       }
+
        return ret;
 }
 
@@ -483,6 +738,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
        struct bkey_s_c k;
        struct bkey_buf cur, prev;
        u8 max_stale = 0;
+       char buf[200];
        int ret = 0;
 
        bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
@@ -498,7 +754,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
                                       &k, &max_stale, true);
                if (ret) {
                        bch_err(c, "%s: error %i from bch2_gc_mark_key", __func__, ret);
-                       break;
+                       goto fsck_err;
                }
 
                if (b->c.level) {
@@ -511,7 +767,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
                                        &prev, cur,
                                        !bch2_btree_and_journal_iter_peek(&iter).k);
                        if (ret)
-                               break;
+                               goto fsck_err;
                } else {
                        bch2_btree_and_journal_iter_advance(&iter);
                }
@@ -532,18 +788,25 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
                                                false);
                        ret = PTR_ERR_OR_ZERO(child);
 
-                       if (fsck_err_on(ret == -EIO, c,
-                                       "unreadable btree node")) {
-                               ret = bch2_journal_key_delete(c, b->c.btree_id,
-                                                             b->c.level, cur.k->k.p);
-                               if (ret)
-                                       return ret;
-
-                               set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
-                               continue;
-                       }
-
-                       if (ret) {
+                       if (ret == -EIO) {
+                               bch2_topology_error(c);
+
+                               if (fsck_err(c, "Unreadable btree node at btree %s level %u:\n"
+                                       "  %s",
+                                       bch2_btree_ids[b->c.btree_id],
+                                       b->c.level - 1,
+                                       (bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(cur.k)), buf))) {
+                                       ret = FSCK_ERR_START_TOPOLOGY_REPAIR;
+                                       bch_info(c, "Halting mark and sweep to start topology repair pass");
+                                       goto fsck_err;
+                               } else {
+                                       /* Continue marking when opted to not
+                                        * fix the error: */
+                                       ret = 0;
+                                       set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
+                                       continue;
+                               }
+                       } else if (ret) {
                                bch_err(c, "%s: error %i getting btree node",
                                        __func__, ret);
                                break;
@@ -583,16 +846,20 @@ static int bch2_gc_btree_init(struct bch_fs *c,
                return 0;
 
        six_lock_read(&b->c.lock, NULL, NULL);
-       if (fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c,
+       if (mustfix_fsck_err_on(bpos_cmp(b->data->min_key, POS_MIN), c,
                        "btree root with incorrect min_key: %s",
                        (bch2_bpos_to_text(&PBUF(buf), b->data->min_key), buf))) {
-               BUG();
+               bch_err(c, "repair unimplemented");
+               ret = FSCK_ERR_EXIT;
+               goto fsck_err;
        }
 
-       if (fsck_err_on(bpos_cmp(b->data->max_key, POS_MAX), c,
+       if (mustfix_fsck_err_on(bpos_cmp(b->data->max_key, POS_MAX), c,
                        "btree root with incorrect max_key: %s",
                        (bch2_bpos_to_text(&PBUF(buf), b->data->max_key), buf))) {
-               BUG();
+               bch_err(c, "repair unimplemented");
+               ret = FSCK_ERR_EXIT;
+               goto fsck_err;
        }
 
        if (b->c.level >= target_depth)
@@ -607,7 +874,7 @@ static int bch2_gc_btree_init(struct bch_fs *c,
 fsck_err:
        six_unlock_read(&b->c.lock);
 
-       if (ret)
+       if (ret < 0)
                bch_err(c, "%s: ret %i", __func__, ret);
        return ret;
 }
@@ -622,23 +889,20 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
 {
        enum btree_id ids[BTREE_ID_NR];
        unsigned i;
+       int ret = 0;
 
        for (i = 0; i < BTREE_ID_NR; i++)
                ids[i] = i;
        bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp);
 
-       for (i = 0; i < BTREE_ID_NR; i++) {
-               enum btree_id id = ids[i];
-               int ret = initial
-                       ? bch2_gc_btree_init(c, id, metadata_only)
-                       : bch2_gc_btree(c, id, initial, metadata_only);
-               if (ret) {
-                       bch_err(c, "%s: ret %i", __func__, ret);
-                       return ret;
-               }
-       }
+       for (i = 0; i < BTREE_ID_NR && !ret; i++)
+               ret = initial
+                       ? bch2_gc_btree_init(c, ids[i], metadata_only)
+                       : bch2_gc_btree(c, ids[i], initial, metadata_only);
 
-       return 0;
+       if (ret < 0)
+               bch_err(c, "%s: ret %i", __func__, ret);
+       return ret;
 }
 
 static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca,
@@ -1025,7 +1289,27 @@ again:
 
        bch2_mark_superblocks(c);
 
+       if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags) &&
+           !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) &&
+           c->opts.fix_errors != FSCK_OPT_NO) {
+               bch_info(c, "starting topology repair pass");
+               ret = bch2_repair_topology(c);
+               if (ret)
+                       goto out;
+               bch_info(c, "topology repair pass done");
+       }
+
        ret = bch2_gc_btrees(c, initial, metadata_only);
+
+       if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR &&
+           !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {
+               set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+               ret = 0;
+       }
+
+       if (ret == FSCK_ERR_START_TOPOLOGY_REPAIR)
+               ret = FSCK_ERR_EXIT;
+
        if (ret)
                goto out;
 
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 2de31a6b9661983917769fbd77047c52ac31085f..e609bc49cefe55d27e7e3fba994d89a38d1d5de2 100644
@@ -558,6 +558,46 @@ out:                                                                       \
 
 #define btree_err_on(cond, ...)        ((cond) ? btree_err(__VA_ARGS__) : false)
 
+/*
+ * When btree topology repair changes the start or end of a node, that might
+ * mean we have to drop keys that are no longer inside the node:
+ */
+void bch2_btree_node_drop_keys_outside_node(struct btree *b)
+{
+       struct bset_tree *t;
+
+       for_each_bset(b, t) {
+               struct bset *i = bset(b, t);
+               struct bkey_packed *k;
+
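+               /* find the first key at or after the node's new min_key: */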
+               for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
+                       if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0)
+                               break;
+
+               if (k != i->start) {
+                       unsigned shift = (u64 *) k - (u64 *) i->start;
+
+                       memmove_u64s_down(i->start, k,
+                                         (u64 *) vstruct_end(i) - (u64 *) k);
+                       i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift);
+                       set_btree_bset_end(b, t);
+                       bch2_bset_set_no_aux_tree(b, t);
+               }
+
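+               /* find the first key past the node's new max_key: */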
+               for (k = i->start; k != vstruct_last(i); k = bkey_next(k))
+                       if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0)
+                               break;
+
+               if (k != vstruct_last(i)) {
+                       i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start);
+                       set_btree_bset_end(b, t);
+                       bch2_bset_set_no_aux_tree(b, t);
+               }
+       }
+
+       bch2_btree_build_aux_trees(b);
+}
+
 static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
                         struct btree *b, struct bset *i,
                         unsigned sectors, int write, bool have_retry)
@@ -680,6 +720,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
 {
        unsigned version = le16_to_cpu(i->version);
        struct bkey_packed *k, *prev = NULL;
+       bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+               BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
        int ret = 0;
 
        for (k = i->start;
@@ -713,7 +755,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
                u = __bkey_disassemble(b, k, &tmp);
 
                invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b)) ?:
-                       bch2_bkey_in_btree_node(b, u.s_c) ?:
+                       (!updated_range ?  bch2_bkey_in_btree_node(b, u.s_c) : NULL) ?:
                        (write ? bch2_bkey_val_invalid(c, u.s_c) : NULL);
                if (invalid) {
                        char buf[160];
@@ -770,6 +812,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
        struct bch_extent_ptr *ptr;
        struct bset *i;
        bool used_mempool, blacklisted;
+       bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+               BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v);
        unsigned u64s;
        int ret, retry_read = 0, write = READ;
 
@@ -917,6 +961,9 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 
        btree_bounce_free(c, btree_bytes(c), used_mempool, sorted);
 
+       if (updated_range)
+               bch2_btree_node_drop_keys_outside_node(b);
+
        i = &b->data->keys;
        for (k = i->start; k != vstruct_last(i);) {
                struct bkey tmp;
@@ -986,6 +1033,7 @@ static void btree_node_read_work(struct work_struct *work)
        struct bch_io_failures failed = { .nr = 0 };
        char buf[200];
        struct printbuf out;
+       bool saw_error = false;
        bool can_retry;
 
        goto start;
@@ -1023,6 +1071,8 @@ start:
                    !bch2_btree_node_read_done(c, ca, b, can_retry))
                        break;
 
+               saw_error = true;
+
                if (!can_retry) {
                        set_btree_node_read_error(b);
                        break;
@@ -1032,6 +1082,10 @@ start:
        bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read],
                               rb->start_time);
        bio_put(&rb->bio);
+
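+       /*
+        * The read eventually succeeded, but only after retrying: rewrite the
+        * node so we aren't left depending on a bad copy:
+        */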
+       if (saw_error && !btree_node_read_error(b))
+               bch2_btree_node_rewrite_async(c, b);
+
        clear_btree_node_read_in_flight(b);
        wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
 }
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index c8a8b05a19b0f1b1bc74fe254e28facd6a98b4a6..cadcf7f886d73759167ce8f177e0e55723ebf9a5 100644
@@ -131,6 +131,8 @@ static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offse
 
 void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
 
+void bch2_btree_node_drop_keys_outside_node(struct btree *);
+
 void bch2_btree_build_aux_trees(struct btree *);
 void bch2_btree_init_next(struct bch_fs *, struct btree *,
                         struct btree_iter *);
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index a5181a96397a24637c9fe95c31e9fae1cccd875b..a0ff0c3ceb90c52271f0d73106e125167b711435 100644
@@ -682,7 +682,9 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
                kmem_cache_free(bch2_key_cache, ck);
        }
 
-       BUG_ON(atomic_long_read(&bc->nr_dirty) && !bch2_journal_error(&c->journal));
+       BUG_ON(atomic_long_read(&bc->nr_dirty) &&
+              !bch2_journal_error(&c->journal) &&
+              test_bit(BCH_FS_WAS_RW, &c->flags));
        BUG_ON(atomic_long_read(&bc->nr_keys));
 
        mutex_unlock(&bc->lock);
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
index 0c7caa7e91a0ac7836a638e599a82b628580aef2..56131ac516ce4b74e5f357f1f923b4ab7cdad7fd 100644
@@ -72,6 +72,7 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
 
 int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *,
                            __le64, unsigned);
+void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *);
 int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *,
                               struct btree *, struct bkey_i *);
 
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 5c86e76f5079c90d79edb1253e61fa25120a3a14..b9e0ff97a41bb7a265c6db83bf8d9691a2f83075 100644
@@ -1154,6 +1154,27 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
        set_btree_node_need_write(b);
 }
 
+static void
+__bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
+                                 struct btree_iter *iter, struct keylist *keys,
+                                 struct btree_node_iter node_iter)
+{
+       struct bkey_i *insert = bch2_keylist_front(keys);
+       struct bkey_packed *k;
+
+       BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);
+
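+       /*
+        * btree_split(), btree_gc_coalesce() will insert keys before
+        * the iterator's current position - they know the keys go in
+        * the node the iterator points to:
+        */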
+       while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
+              (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
+               ;
+
+       while (!bch2_keylist_empty(keys)) {
+               bch2_insert_fixup_btree_ptr(as, b, iter,
+                               bch2_keylist_front(keys), &node_iter);
+               bch2_keylist_pop_front(keys);
+       }
+}
+
 /*
  * Move keys from n1 (original replacement node, now lower node) to n2 (higher
  * node)
@@ -1284,16 +1305,9 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
        struct bkey_packed *src, *dst, *n;
        struct bset *i;
 
-       BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);
-
        bch2_btree_node_iter_init(&node_iter, b, &k->k.p);
 
-       while (!bch2_keylist_empty(keys)) {
-               k = bch2_keylist_front(keys);
-
-               bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter);
-               bch2_keylist_pop_front(keys);
-       }
+       __bch2_btree_insert_keys_interior(as, b, iter, keys, node_iter);
 
        /*
         * We can't tolerate whiteouts here - with whiteouts there can be
@@ -1439,24 +1453,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
                                struct btree_iter *iter, struct keylist *keys)
 {
        struct btree_iter *linked;
-       struct btree_node_iter node_iter;
-       struct bkey_i *insert = bch2_keylist_front(keys);
-       struct bkey_packed *k;
 
-       /* Don't screw up @iter's position: */
-       node_iter = iter->l[b->c.level].iter;
-
-       /*
-        * btree_split(), btree_gc_coalesce() will insert keys before
-        * the iterator's current position - they know the keys go in
-        * the node the iterator points to:
-        */
-       while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
-              (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
-               ;
-
-       for_each_keylist_key(keys, insert)
-               bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter);
+       __bch2_btree_insert_keys_interior(as, b, iter, keys, iter->l[b->c.level].iter);
 
        btree_update_updated_node(as, b);
 
@@ -1611,11 +1609,12 @@ retry:
 
                bch2_bpos_to_text(&PBUF(buf1), prev->data->max_key);
                bch2_bpos_to_text(&PBUF(buf2), next->data->min_key);
-               bch2_fs_inconsistent(c,
-                                    "btree topology error in btree merge:\n"
-                                    "prev ends at   %s\n"
-                                    "next starts at %s\n",
-                                    buf1, buf2);
+               bch_err(c,
+                       "btree topology error in btree merge:\n"
+                       "  prev ends at   %s\n"
+                       "  next starts at %s",
+                       buf1, buf2);
+               bch2_topology_error(c);
                ret = -EIO;
                goto err;
        }
@@ -1797,6 +1796,56 @@ out:
        return ret;
 }
 
+struct async_btree_rewrite {
+       struct bch_fs           *c;
+       struct work_struct      work;
+       enum btree_id           btree_id;
+       unsigned                level;
+       struct bpos             pos;
+       __le64                  seq;
+};
+
+static void async_btree_node_rewrite_work(struct work_struct *work)
+{
+       struct async_btree_rewrite *a =
+               container_of(work, struct async_btree_rewrite, work);
+       struct bch_fs *c = a->c;
+       struct btree_trans trans;
+       struct btree_iter *iter;
+
+       bch2_trans_init(&trans, c, 0, 0);
+       iter = bch2_trans_get_node_iter(&trans, a->btree_id, a->pos,
+                                       BTREE_MAX_DEPTH, a->level, 0);
+       bch2_btree_node_rewrite(c, iter, a->seq, 0);
+       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_exit(&trans);
+       percpu_ref_put(&c->writes);
+       kfree(a);
+}
+
+void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
+{
+       struct async_btree_rewrite *a;
+
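+       /*
+        * Guard against the filesystem going read-only before the work item
+        * runs:
+        */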
+       if (!percpu_ref_tryget(&c->writes))
+               return;
+
+       a = kmalloc(sizeof(*a), GFP_NOFS);
+       if (!a) {
+               percpu_ref_put(&c->writes);
+               return;
+       }
+
+       a->c            = c;
+       a->btree_id     = b->c.btree_id;
+       a->level        = b->c.level;
+       a->pos          = b->key.k.p;
+       a->seq          = b->data->keys.seq;
+
+       INIT_WORK(&a->work, async_btree_node_rewrite_work);
+       queue_work(system_long_wq, &a->work);
+}
+
 static void __bch2_btree_node_update_key(struct bch_fs *c,
                                         struct btree_update *as,
                                         struct btree_iter *iter,
diff --git a/libbcachefs/error.c b/libbcachefs/error.c
index a8ee1db8aa3917851dfdd011e9d00e63bf8a84bd..90c3b986c264020253a2938941c6c9e6b1ae9bdf 100644
@@ -25,6 +25,13 @@ bool bch2_inconsistent_error(struct bch_fs *c)
        }
 }
 
+void bch2_topology_error(struct bch_fs *c)
+{
+       set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags);
+       if (test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
+               bch2_inconsistent_error(c);
+}
+
 void bch2_fatal_error(struct bch_fs *c)
 {
        if (bch2_fs_emergency_read_only(c))
@@ -74,9 +81,13 @@ enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags,
                vprintk(fmt, args);
                va_end(args);
 
-               return bch2_inconsistent_error(c)
-                       ? FSCK_ERR_EXIT
-                       : FSCK_ERR_FIX;
+               if (c->opts.errors == BCH_ON_ERROR_continue) {
+                       bch_err(c, "fixing");
+                       return FSCK_ERR_FIX;
+               } else {
+                       bch2_inconsistent_error(c);
+                       return FSCK_ERR_EXIT;
+               }
        }
 
        mutex_lock(&c->fsck_error_lock);
@@ -146,6 +157,7 @@ print:
                set_bit(BCH_FS_ERRORS_FIXED, &c->flags);
                return FSCK_ERR_FIX;
        } else {
+               set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags);
                set_bit(BCH_FS_ERROR, &c->flags);
                return c->opts.fix_errors == FSCK_OPT_EXIT ||
                        !(flags & FSCK_CAN_IGNORE)
diff --git a/libbcachefs/error.h b/libbcachefs/error.h
index 0e49fd728e440cb5be02bf1da3e399fa52e3e9f0..d8cd19b3f63c83c73b675b54ed767607eb638cf9 100644
@@ -29,6 +29,8 @@ struct work_struct;
 
 bool bch2_inconsistent_error(struct bch_fs *);
 
+void bch2_topology_error(struct bch_fs *);
+
 #define bch2_fs_inconsistent(c, ...)                                   \
 ({                                                                     \
        bch_err(c, __VA_ARGS__);                                        \
@@ -88,6 +90,7 @@ enum fsck_err_ret {
        FSCK_ERR_IGNORE = 0,
        FSCK_ERR_FIX    = 1,
        FSCK_ERR_EXIT   = 2,
+       FSCK_ERR_START_TOPOLOGY_REPAIR = 3,
 };
 
 struct fsck_err_state {
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 26fbd8c2f03df7f6b90e30fc3e6ce88887b22fa3..338d50bec7e5ebdfbd9cf2719ac75754487f7a80 100644
@@ -12,8 +12,8 @@
 #include "super.h"
 #include "xattr.h"
 
+#include <linux/bsearch.h>
 #include <linux/dcache.h> /* struct qstr */
-#include <linux/generic-radix-tree.h>
 
 #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
 
@@ -290,21 +290,24 @@ static int hash_redo_key(struct btree_trans *trans,
                         struct bch_hash_info *hash_info,
                         struct btree_iter *k_iter, struct bkey_s_c k)
 {
-       struct bkey_i delete;
+       struct bkey_i *delete;
        struct bkey_i *tmp;
 
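+       /*
+        * Keys passed to bch2_trans_update() must stay valid until the
+        * transaction commits, so allocate the deletion key in the
+        * transaction rather than on the stack:
+        */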
+       delete = bch2_trans_kmalloc(trans, sizeof(*delete));
+       if (IS_ERR(delete))
+               return PTR_ERR(delete);
+
        tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k));
        if (IS_ERR(tmp))
                return PTR_ERR(tmp);
 
        bkey_reassemble(tmp, k);
 
-       bkey_init(&delete.k);
-       delete.k.p = k_iter->pos;
-       bch2_trans_update(trans, k_iter, &delete, 0);
+       bkey_init(&delete->k);
+       delete->k.p = k_iter->pos;
+       bch2_trans_update(trans, k_iter, delete, 0);
 
-       return bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode,
-                            tmp, 0);
+       return bch2_hash_set(trans, desc, hash_info, k_iter->pos.inode, tmp, 0);
 }
 
 static int fsck_hash_delete_at(struct btree_trans *trans,
@@ -377,9 +380,8 @@ static int hash_check_key(struct btree_trans *trans,
        return ret;
 bad_hash:
        if (fsck_err(c, "hash table key at wrong offset: btree %u inode %llu offset %llu, "
-                    "hashed to %llu should be at %llu\n%s",
-                    desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset,
-                    hash, iter->pos.offset,
+                    "hashed to %llu\n%s",
+                    desc.btree_id, hash_k.k->p.inode, hash_k.k->p.offset, hash,
                     (bch2_bkey_val_to_text(&PBUF(buf), c, hash_k), buf)) == FSCK_ERR_IGNORE)
                return 0;
 
@@ -1130,38 +1132,120 @@ static int check_directory_structure(struct bch_fs *c)
        return bch2_trans_exit(&trans) ?: ret;
 }
 
-struct nlink {
-       u32     count;
-};
+struct nlink_table {
+       size_t          nr;
+       size_t          size;
 
-typedef GENRADIX(struct nlink) nlink_table;
+       struct nlink {
+               u64     inum;
+               u32     snapshot;
+               u32     count;
+       }               *d;
+};
 
-static void inc_link(struct bch_fs *c, nlink_table *links,
-                    u64 range_start, u64 *range_end, u64 inum)
+static int add_nlink(struct nlink_table *t, u64 inum, u32 snapshot)
 {
-       struct nlink *link;
+       if (t->nr == t->size) {
+               size_t new_size = max_t(size_t, 128UL, t->size * 2);
+               void *d = kvmalloc(new_size * sizeof(t->d[0]), GFP_KERNEL);
+               if (!d)
+                       return -ENOMEM;
 
-       if (inum < range_start || inum >= *range_end)
-               return;
+               memcpy(d, t->d, t->size * sizeof(t->d[0]));
+               kvfree(t->d);
 
-       if (inum - range_start >= SIZE_MAX / sizeof(struct nlink)) {
-               *range_end = inum;
-               return;
+               t->d = d;
+               t->size = new_size;
        }
 
-       link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL);
-       if (!link) {
-               bch_verbose(c, "allocation failed during fsck - will need another pass");
-               *range_end = inum;
+       t->d[t->nr++] = (struct nlink) {
+               .inum           = inum,
+               .snapshot       = snapshot,
+       };
+
+       return 0;
+}
+
+static int nlink_cmp(const void *_l, const void *_r)
+{
+       const struct nlink *l = _l;
+       const struct nlink *r = _r;
+
+       return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot);
+}
+
+static void inc_link(struct bch_fs *c, struct nlink_table *links,
+                    u64 range_start, u64 range_end, u64 inum)
+{
+       struct nlink *link, key = {
+               .inum = inum, .snapshot = U32_MAX,
+       };
+
+       if (inum < range_start || inum >= range_end)
                return;
+
+       link = __inline_bsearch(&key, links->d, links->nr,
+                               sizeof(links->d[0]), nlink_cmp);
+       if (link)
+               link->count++;
+}
+
+noinline_for_stack
+static int check_nlinks_find_hardlinks(struct bch_fs *c,
+                                      struct nlink_table *t,
+                                      u64 start, u64 *end)
+{
+       struct btree_trans trans;
+       struct btree_iter *iter;
+       struct bkey_s_c k;
+       struct bkey_s_c_inode inode;
+       struct bch_inode_unpacked u;
+       int ret = 0;
+
+       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+
+       for_each_btree_key(&trans, iter, BTREE_ID_inodes,
+                          POS(0, start), 0, k, ret) {
+               if (k.k->type != KEY_TYPE_inode)
+                       continue;
+
+               inode = bkey_s_c_to_inode(k);
+
+               /*
+                * Backpointer and directory structure checks are sufficient for
+                * directories, since they can't have hardlinks:
+                */
+               if (S_ISDIR(le16_to_cpu(inode.v->bi_mode)))
+                       continue;
+
+               /* Should never fail, checked by bch2_inode_invalid: */
+               BUG_ON(bch2_inode_unpack(inode, &u));
+
+               if (!u.bi_nlink)
+                       continue;
+
+               ret = add_nlink(t, k.k->p.offset, k.k->p.snapshot);
+               if (ret) {
+                       *end = k.k->p.offset;
+                       ret = 0;
+                       break;
+               }
+
        }
+       bch2_trans_iter_put(&trans, iter);
+       bch2_trans_exit(&trans);
 
-       link->count++;
+       if (ret)
+               bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
+
+       return ret;
 }
 
 noinline_for_stack
-static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links,
-                              u64 range_start, u64 *range_end)
+static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
+                                    u64 range_start, u64 range_end)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
@@ -1193,80 +1277,58 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links,
        return ret;
 }
 
-static int check_inode_nlink(struct btree_trans *trans,
-                            struct btree_iter *iter,
-                            struct bkey_s_c_inode inode,
-                            unsigned nlink)
-{
-       struct bch_fs *c = trans->c;
-       struct bch_inode_unpacked u;
-       int ret = 0;
-
-       /*
-        * Backpointer and directory structure checks are sufficient for
-        * directories, since they can't have hardlinks:
-        */
-       if (S_ISDIR(le16_to_cpu(inode.v->bi_mode)))
-               return 0;
-
-       if (!nlink) {
-               bch_err(c, "no links found to inode %llu", inode.k->p.offset);
-               return -EINVAL;
-       }
-
-       ret = bch2_inode_unpack(inode, &u);
-
-       /* Should never happen, checked by bch2_inode_invalid: */
-       if (bch2_fs_inconsistent_on(ret, c,
-                        "error unpacking inode %llu in fsck",
-                        inode.k->p.inode))
-               return ret;
-
-       if (fsck_err_on(bch2_inode_nlink_get(&u) != nlink, c,
-                       "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)",
-                       u.bi_inum, mode_to_type(u.bi_mode),
-                       bch2_inode_nlink_get(&u), nlink)) {
-               bch2_inode_nlink_set(&u, nlink);
-
-               ret = __bch2_trans_do(trans, NULL, NULL,
-                                     BTREE_INSERT_NOFAIL|
-                                     BTREE_INSERT_LAZY_RW,
-                               bch2_inode_write(trans, iter, &u));
-               if (ret)
-                       bch_err(c, "error in fsck: error %i updating inode", ret);
-       }
-fsck_err:
-       return ret;
-}
-
 noinline_for_stack
-static int bch2_gc_walk_inodes(struct bch_fs *c,
-                              nlink_table *links,
+static int check_nlinks_update_hardlinks(struct bch_fs *c,
+                              struct nlink_table *links,
                               u64 range_start, u64 range_end)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
        struct bkey_s_c k;
-       struct nlink *link;
+       struct bkey_s_c_inode inode;
+       struct bch_inode_unpacked u;
+       struct nlink *link = links->d;
        int ret = 0;
 
        bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
        for_each_btree_key(&trans, iter, BTREE_ID_inodes,
                           POS(0, range_start), 0, k, ret) {
-               if (!k.k || k.k->p.offset >= range_end)
+               if (k.k->p.offset >= range_end)
                        break;
 
                if (k.k->type != KEY_TYPE_inode)
                        continue;
 
-               link = genradix_ptr(links, k.k->p.offset - range_start);
-               ret = check_inode_nlink(&trans, iter,
-                                       bkey_s_c_to_inode(k), link ? link->count : 0);
-               if (ret)
-                       break;
+               inode = bkey_s_c_to_inode(k);
+               if (S_ISDIR(le16_to_cpu(inode.v->bi_mode)))
+                       continue;
+
+               BUG_ON(bch2_inode_unpack(inode, &u));
 
+               if (!u.bi_nlink)
+                       continue;
+
+               while (link->inum < k.k->p.offset) {
+                       link++;
+                       BUG_ON(link >= links->d + links->nr);
+               }
+
+               if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c,
+                               "inode %llu has wrong i_nlink (type %u i_nlink %u, should be %u)",
+                               u.bi_inum, mode_to_type(u.bi_mode),
+                               bch2_inode_nlink_get(&u), link->count)) {
+                       bch2_inode_nlink_set(&u, link->count);
+
+                       ret = __bch2_trans_do(&trans, NULL, NULL,
+                                             BTREE_INSERT_NOFAIL|
+                                             BTREE_INSERT_LAZY_RW,
+                                       bch2_inode_write(&trans, iter, &u));
+                       if (ret)
+                               bch_err(c, "error in fsck: error %i updating inode", ret);
+               }
        }
+fsck_err:
        bch2_trans_iter_put(&trans, iter);
        bch2_trans_exit(&trans);
 
@@ -1279,34 +1341,36 @@ static int bch2_gc_walk_inodes(struct bch_fs *c,
 noinline_for_stack
 static int check_nlinks(struct bch_fs *c)
 {
-       nlink_table links;
+       struct nlink_table links = { 0 };
        u64 this_iter_range_start, next_iter_range_start = 0;
        int ret = 0;
 
        bch_verbose(c, "checking inode nlinks");
 
-       genradix_init(&links);
-
        do {
                this_iter_range_start = next_iter_range_start;
                next_iter_range_start = U64_MAX;
 
-               ret = bch2_gc_walk_dirents(c, &links,
+               ret = check_nlinks_find_hardlinks(c, &links,
+                                                 this_iter_range_start,
+                                                 &next_iter_range_start);
+               if (ret)
+                       break;
+
+               ret = check_nlinks_walk_dirents(c, &links,
                                          this_iter_range_start,
-                                         &next_iter_range_start);
+                                         next_iter_range_start);
                if (ret)
                        break;
 
-               ret = bch2_gc_walk_inodes(c, &links,
+               ret = check_nlinks_update_hardlinks(c, &links,
                                         this_iter_range_start,
                                         next_iter_range_start);
                if (ret)
                        break;
 
-               genradix_free(&links);
+               links.nr = 0;
        } while (next_iter_range_start != U64_MAX);
 
-       genradix_free(&links);
+       kvfree(links.d);
 
        return ret;
 }
diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c
index 2dc3dee4efc8a8a38787cc06d2349473998766fe..a9ccd14effe7297fc61cf2dc9217719bdc342d90 100644
@@ -973,7 +973,7 @@ int bch2_fs_recovery(struct bch_fs *c)
        struct jset *last_journal_entry = NULL;
        u64 blacklist_seq, journal_seq;
        bool write_sb = false;
-       int ret;
+       int ret = 0;
 
        if (c->sb.clean)
                clean = read_superblock_clean(c);
@@ -1241,8 +1241,9 @@ use_clean:
 
        if (c->opts.fsck &&
            !test_bit(BCH_FS_ERROR, &c->flags) &&
-           BCH_SB_HAS_ERRORS(c->disk_sb.sb)) {
+           !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) {
                SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
+               SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0);
                write_sb = true;
        }
 
@@ -1253,10 +1254,9 @@ use_clean:
        if (c->journal_seq_blacklist_table &&
            c->journal_seq_blacklist_table->nr > 128)
                queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work);
-out:
+
        ret = 0;
-err:
-fsck_err:
+out:
        set_bit(BCH_FS_FSCK_DONE, &c->flags);
        bch2_flush_fsck_errs(c);
 
@@ -1270,6 +1270,10 @@ fsck_err:
        else
                bch_verbose(c, "ret %i", ret);
        return ret;
+err:
+fsck_err:
+       bch2_fs_emergency_read_only(c);
+       goto out;
 }
 
 int bch2_fs_initialize(struct bch_fs *c)
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index de8d49e3ef02d7e1fbdded96ed8ff3bcc49f11d7..11d7167b01294002d98e6d930b236e233cfe3c4a 100644
@@ -433,6 +433,11 @@ int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src)
 
        __copy_super(&c->disk_sb, src);
 
+       if (BCH_SB_HAS_ERRORS(c->disk_sb.sb))
+               set_bit(BCH_FS_ERROR, &c->flags);
+       if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb))
+               set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags);
+
        ret = bch2_sb_replicas_to_cpu_replicas(c);
        if (ret)
                return ret;
@@ -713,6 +718,8 @@ int bch2_write_super(struct bch_fs *c)
 
        if (test_bit(BCH_FS_ERROR, &c->flags))
                SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1);
+       if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags))
+               SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1);
 
        SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
 
diff --git a/libbcachefs/super.c b/libbcachefs/super.c
index b6e449a7a4d8785417746015db2f2da0dd97f70d..fd8a29911254a7ff5b6718bc54b42a35a6671b0e 100644
@@ -381,6 +381,11 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
        unsigned i;
        int ret;
 
+       if (test_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags)) {
+               bch_err(c, "cannot go rw, unfixed btree errors");
+               return -EROFS;
+       }
+
        if (test_bit(BCH_FS_RW, &c->flags))
                return 0;
 
@@ -440,6 +445,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
 
        percpu_ref_reinit(&c->writes);
        set_bit(BCH_FS_RW, &c->flags);
+       set_bit(BCH_FS_WAS_RW, &c->flags);
        return 0;
 err:
        __bch2_fs_read_only(c);