git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to 3c41353bc1 bcachefs: Fix bch2_verify_keylist_sorted
author Kent Overstreet <kent.overstreet@gmail.com>
Wed, 21 Apr 2021 22:13:43 +0000 (18:13 -0400)
committer Kent Overstreet <kent.overstreet@gmail.com>
Sat, 24 Apr 2021 05:37:09 +0000 (01:37 -0400)
25 files changed:
.bcachefs_revision
libbcachefs/bcachefs.h
libbcachefs/bkey_methods.c
libbcachefs/btree_cache.c
libbcachefs/btree_cache.h
libbcachefs/btree_gc.c
libbcachefs/btree_io.c
libbcachefs/btree_io.h
libbcachefs/btree_iter.c
libbcachefs/btree_key_cache.c
libbcachefs/btree_types.h
libbcachefs/btree_update_interior.c
libbcachefs/btree_update_leaf.c
libbcachefs/buckets.c
libbcachefs/debug.c
libbcachefs/debug.h
libbcachefs/ec.c
libbcachefs/fsck.c
libbcachefs/journal_reclaim.c
libbcachefs/keylist.c
libbcachefs/move.c
libbcachefs/movinggc.c
libbcachefs/replicas.c
libbcachefs/replicas.h
libbcachefs/super.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 82c9b19fb83f8082ffdf64748e75df6e14d66c73..feafaff4009835d776df05058f69a8c0486dac37 100644 (file)
@@ -1 +1 @@
-fe72e70682cd2430a099c08c3135253675030d28
+3c41353bc185e0a0da4c6f63b1203575c41a2da1
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index aade562444224ebb34e0a84ee9df67033ee84982..ce058d55eb348c1a2324f971a1bb8d5eef048393 100644 (file)
@@ -259,7 +259,11 @@ do {                                                                       \
        BCH_DEBUG_PARAM(btree_gc_rewrite_disabled,                      \
                "Disables rewriting of btree nodes during mark and sweep")\
        BCH_DEBUG_PARAM(btree_shrinker_disabled,                        \
-               "Disables the shrinker callback for the btree node cache")
+               "Disables the shrinker callback for the btree node cache")\
+       BCH_DEBUG_PARAM(verify_btree_ondisk,                            \
+               "Reread btree nodes at various points to verify the "   \
+               "mergesort in the read path against modifications "     \
+               "done in memory")
 
 /* Parameters that should only be compiled in in debug mode: */
 #define BCH_DEBUG_PARAMS_DEBUG()                                       \
@@ -273,10 +277,6 @@ do {                                                                       \
                "information) when iterating over keys")                \
        BCH_DEBUG_PARAM(debug_check_btree_accounting,                   \
                "Verify btree accounting for keys within a node")       \
-       BCH_DEBUG_PARAM(verify_btree_ondisk,                            \
-               "Reread btree nodes at various points to verify the "   \
-               "mergesort in the read path against modifications "     \
-               "done in memory")                                       \
        BCH_DEBUG_PARAM(journal_seq_verify,                             \
                "Store the journal sequence number in the version "     \
                "number of every btree key, and verify that btree "     \
@@ -545,6 +545,8 @@ struct btree_iter_buf {
        struct btree_iter       *iter;
 };
 
+#define REPLICAS_DELTA_LIST_MAX        (1U << 16)
+
 struct bch_fs {
        struct closure          cl;
 
@@ -572,6 +574,7 @@ struct bch_fs {
        struct bch_replicas_cpu replicas;
        struct bch_replicas_cpu replicas_gc;
        struct mutex            replicas_gc_lock;
+       mempool_t               replicas_delta_pool;
 
        struct journal_entry_res btree_root_journal_res;
        struct journal_entry_res replicas_journal_res;
@@ -644,6 +647,7 @@ struct bch_fs {
        struct mutex            btree_trans_lock;
        struct list_head        btree_trans_list;
        mempool_t               btree_iters_pool;
+       mempool_t               btree_trans_mem_pool;
        struct btree_iter_buf  __percpu *btree_iters_bufs;
 
        struct srcu_struct      btree_trans_barrier;
@@ -813,11 +817,9 @@ struct bch_fs {
        /* DEBUG JUNK */
        struct dentry           *debug;
        struct btree_debug      btree_debug[BTREE_ID_NR];
-#ifdef CONFIG_BCACHEFS_DEBUG
        struct btree            *verify_data;
        struct btree_node       *verify_ondisk;
        struct mutex            verify_lock;
-#endif
 
        u64                     *unused_inode_hints;
        unsigned                inode_shard_bits;
diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c
index 450b613dcf5cdf9fe699c1b689f71db0125edd5c..9f869bed9f1c1458449af5ef8dfb8514607785ad 100644 (file)
@@ -100,7 +100,6 @@ const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k)
 
 static unsigned bch2_key_types_allowed[] = {
        [BKEY_TYPE_extents] =
-               (1U << KEY_TYPE_discard)|
                (1U << KEY_TYPE_error)|
                (1U << KEY_TYPE_extent)|
                (1U << KEY_TYPE_reservation)|
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 9f963179c0750ada9b4ba9f11ba93607780b9ecc..edc3c5edb62ba3e3cb9b7514298934030924f76b 100644 (file)
@@ -33,21 +33,21 @@ static inline unsigned btree_cache_can_free(struct btree_cache *bc)
        return max_t(int, 0, bc->used - bc->reserve);
 }
 
-static void __btree_node_data_free(struct bch_fs *c, struct btree *b)
+static void btree_node_data_free(struct bch_fs *c, struct btree *b)
 {
+       struct btree_cache *bc = &c->btree_cache;
+
        EBUG_ON(btree_node_write_in_flight(b));
 
        kvpfree(b->data, btree_bytes(c));
        b->data = NULL;
+#ifdef __KERNEL__
        vfree(b->aux_data);
+#else
+       munmap(b->aux_data, btree_aux_data_bytes(b));
+#endif
        b->aux_data = NULL;
-}
 
-static void btree_node_data_free(struct bch_fs *c, struct btree *b)
-{
-       struct btree_cache *bc = &c->btree_cache;
-
-       __btree_node_data_free(c, b);
        bc->used--;
        list_move(&b->list, &bc->freed);
 }
@@ -75,8 +75,13 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
        b->data = kvpmalloc(btree_bytes(c), gfp);
        if (!b->data)
                return -ENOMEM;
-
+#ifdef __KERNEL__
        b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp);
+#else
+       b->aux_data = mmap(NULL, btree_aux_data_bytes(b),
+                          PROT_READ|PROT_WRITE|PROT_EXEC,
+                          MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
+#endif
        if (!b->aux_data) {
                kvpfree(b->data, btree_bytes(c));
                b->data = NULL;
@@ -100,7 +105,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c)
        return b;
 }
 
-static struct btree *btree_node_mem_alloc(struct bch_fs *c)
+struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
 {
        struct btree_cache *bc = &c->btree_cache;
        struct btree *b = __btree_node_mem_alloc(c);
@@ -360,12 +365,10 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
        flags = memalloc_nofs_save();
        mutex_lock(&bc->lock);
 
-#ifdef CONFIG_BCACHEFS_DEBUG
        if (c->verify_data)
                list_move(&c->verify_data->list, &bc->live);
 
        kvpfree(c->verify_ondisk, btree_bytes(c));
-#endif
 
        for (i = 0; i < BTREE_ID_NR; i++)
                if (c->btree_roots[i].b)
@@ -419,31 +422,15 @@ int bch2_fs_btree_cache_init(struct bch_fs *c)
        bch2_recalc_btree_reserve(c);
 
        for (i = 0; i < bc->reserve; i++)
-               if (!btree_node_mem_alloc(c)) {
+               if (!__bch2_btree_node_mem_alloc(c)) {
                        ret = -ENOMEM;
                        goto out;
                }
 
        list_splice_init(&bc->live, &bc->freeable);
 
-#ifdef CONFIG_BCACHEFS_DEBUG
        mutex_init(&c->verify_lock);
 
-       c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
-       if (!c->verify_ondisk) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       c->verify_data = btree_node_mem_alloc(c);
-       if (!c->verify_data) {
-               ret = -ENOMEM;
-               goto out;
-       }
-
-       list_del_init(&c->verify_data->list);
-#endif
-
        bc->shrink.count_objects        = bch2_btree_cache_count;
        bc->shrink.scan_objects         = bch2_btree_cache_scan;
        bc->shrink.seeks                = 4;
@@ -703,6 +690,41 @@ static int lock_node_check_fn(struct six_lock *lock, void *p)
        return b->hash_val == btree_ptr_hash_val(k) ? 0 : -1;
 }
 
+static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
+{
+       char buf1[100], buf2[100], buf3[100], buf4[100];
+
+       if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags))
+               return;
+
+       bch2_bpos_to_text(&PBUF(buf1), b->key.k.type == KEY_TYPE_btree_ptr_v2
+               ? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key
+               : POS_MIN);
+       bch2_bpos_to_text(&PBUF(buf2), b->data->min_key);
+
+       bch2_bpos_to_text(&PBUF(buf3), b->key.k.p);
+       bch2_bpos_to_text(&PBUF(buf4), b->data->max_key);
+       bch2_fs_inconsistent(c, "btree node header doesn't match ptr\n"
+                            "btree: ptr %u header %llu\n"
+                            "level: ptr %u header %llu\n"
+                            "min ptr %s node header %s\n"
+                            "max ptr %s node header %s",
+                            b->c.btree_id,     BTREE_NODE_ID(b->data),
+                            b->c.level,        BTREE_NODE_LEVEL(b->data),
+                            buf1, buf2, buf3, buf4);
+}
+
+static inline void btree_check_header(struct bch_fs *c, struct btree *b)
+{
+       if (b->c.btree_id != BTREE_NODE_ID(b->data) ||
+           b->c.level != BTREE_NODE_LEVEL(b->data) ||
+           bpos_cmp(b->data->max_key, b->key.k.p) ||
+           (b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+            bpos_cmp(b->data->min_key,
+                     bkey_i_to_btree_ptr_v2(&b->key)->v.min_key)))
+               btree_bad_header(c, b);
+}
+
 /**
  * bch_btree_node_get - find a btree node in the cache and lock it, reading it
  * in from disk if necessary.
@@ -833,10 +855,7 @@ lock_node:
 
        EBUG_ON(b->c.btree_id != iter->btree_id);
        EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
-       EBUG_ON(bpos_cmp(b->data->max_key, k->k.p));
-       EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
-               bpos_cmp(b->data->min_key,
-                        bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
+       btree_check_header(c, b);
 
        return b;
 }
@@ -916,10 +935,7 @@ lock_node:
 
        EBUG_ON(b->c.btree_id != btree_id);
        EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
-       EBUG_ON(bpos_cmp(b->data->max_key, k->k.p));
-       EBUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
-               bpos_cmp(b->data->min_key,
-                        bkey_i_to_btree_ptr_v2(&b->key)->v.min_key));
+       btree_check_header(c, b);
 out:
        bch2_btree_cache_cannibalize_unlock(c);
        return b;
diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h
index 4791c3b64452d915486a2a5754210fcc1098f0ab..c517cc02945405f5bc1a77a31f7292f504833cfd 100644 (file)
@@ -17,6 +17,7 @@ int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
 void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
 int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
 
+struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
 struct btree *bch2_btree_node_mem_alloc(struct bch_fs *);
 
 struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *,
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 536947ccaf919a40813e327cc94f28d479411c63..864931eaf616369b1e0e3e2a19a37f83bdc7678b 100644 (file)
@@ -330,6 +330,10 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id,
                BUG_ON(bch2_journal_seq_verify &&
                       k->k->version.lo > journal_cur_seq(&c->journal));
 
+               ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k);
+               if (ret)
+                       goto err;
+
                if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c,
                                "key version number higher than recorded: %llu > %llu",
                                k->k->version.lo,
@@ -346,8 +350,6 @@ static int bch2_gc_mark_key(struct bch_fs *c, enum btree_id btree_id,
                                goto err;
                        }
                }
-
-               ret = bch2_check_fix_ptrs(c, btree_id, level, is_root, k);
        }
 
        ptrs = bch2_bkey_ptrs_c(*k);
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index c8d8df9637db182519eed427e32109b52bc3ecb1..2de31a6b9661983917769fbd77047c52ac31085f 100644 (file)
@@ -1340,6 +1340,13 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
        return ret;
 }
 
+static void btree_write_submit(struct work_struct *work)
+{
+       struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
+
+       bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &wbio->key);
+}
+
 void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
 {
        struct btree_write_bio *wbio;
@@ -1347,7 +1354,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
        struct bset *i;
        struct btree_node *bn = NULL;
        struct btree_node_entry *bne = NULL;
-       struct bkey_buf k;
        struct bch_extent_ptr *ptr;
        struct sort_iter sort_iter;
        struct nonce nonce;
@@ -1358,8 +1364,6 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
        bool validate_before_checksum = false;
        void *data;
 
-       bch2_bkey_buf_init(&k);
-
        if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
                return;
 
@@ -1536,6 +1540,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
        wbio_init(&wbio->wbio.bio);
        wbio->data                      = data;
        wbio->bytes                     = bytes;
+       wbio->wbio.c                    = c;
        wbio->wbio.used_mempool         = used_mempool;
        wbio->wbio.bio.bi_opf           = REQ_OP_WRITE|REQ_META;
        wbio->wbio.bio.bi_end_io        = btree_node_write_endio;
@@ -1558,9 +1563,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
         * just make all btree node writes FUA to keep things sane.
         */
 
-       bch2_bkey_buf_copy(&k, c, &b->key);
+       bkey_copy(&wbio->key, &b->key);
 
-       bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(k.k)), ptr)
+       bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&wbio->key)), ptr)
                ptr->offset += b->written;
 
        b->written += sectors_to_write;
@@ -1568,9 +1573,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
        atomic64_inc(&c->btree_writes_nr);
        atomic64_add(sectors_to_write, &c->btree_writes_sectors);
 
-       /* XXX: submitting IO with btree locks held: */
-       bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, k.k);
-       bch2_bkey_buf_exit(&k, c);
+       INIT_WORK(&wbio->work, btree_write_submit);
+       schedule_work(&wbio->work);
        return;
 err:
        set_btree_node_noevict(b);
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 95c351611045a861cf9d90f87d860883df2b6dbd..c8a8b05a19b0f1b1bc74fe254e28facd6a98b4a6 100644 (file)
@@ -42,6 +42,7 @@ struct btree_read_bio {
 
 struct btree_write_bio {
        struct work_struct      work;
+       __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
        void                    *data;
        unsigned                bytes;
        struct bch_write_bio    wbio;
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index c8f527bc2ea2759ee3879770b0687c3cfda157a1..93194e62ab2a152a3363c67a161647a8bd924bd8 100644 (file)
@@ -2145,7 +2145,16 @@ void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
        if (new_top > trans->mem_bytes) {
                size_t old_bytes = trans->mem_bytes;
                size_t new_bytes = roundup_pow_of_two(new_top);
-               void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
+               void *new_mem;
+
+               WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
+
+               new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);
+               if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
+                       new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
+                       new_bytes = BTREE_TRANS_MEM_MAX;
+                       kfree(trans->mem);
+               }
 
                if (!new_mem)
                        return ERR_PTR(-ENOMEM);
@@ -2249,6 +2258,11 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
        if (expected_mem_bytes) {
                trans->mem_bytes = roundup_pow_of_two(expected_mem_bytes);
                trans->mem = kmalloc(trans->mem_bytes, GFP_KERNEL|__GFP_NOFAIL);
+
+               if (!unlikely(trans->mem)) {
+                       trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
+                       trans->mem_bytes = BTREE_TRANS_MEM_MAX;
+               }
        }
 
        trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
@@ -2290,8 +2304,19 @@ int bch2_trans_exit(struct btree_trans *trans)
 
        bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);
 
-       kfree(trans->fs_usage_deltas);
-       kfree(trans->mem);
+       if (trans->fs_usage_deltas) {
+               if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
+                   REPLICAS_DELTA_LIST_MAX)
+                       mempool_free(trans->fs_usage_deltas,
+                                    &trans->c->replicas_delta_pool);
+               else
+                       kfree(trans->fs_usage_deltas);
+       }
+
+       if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
+               mempool_free(trans->mem, &trans->c->btree_trans_mem_pool);
+       else
+               kfree(trans->mem);
 
 #ifdef __KERNEL__
        /*
@@ -2299,6 +2324,7 @@ int bch2_trans_exit(struct btree_trans *trans)
         */
        trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters);
 #endif
+
        if (trans->iters)
                mempool_free(trans->iters, &trans->c->btree_iters_pool);
 
@@ -2392,6 +2418,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
 
 void bch2_fs_btree_iter_exit(struct bch_fs *c)
 {
+       mempool_exit(&c->btree_trans_mem_pool);
        mempool_exit(&c->btree_iters_pool);
        cleanup_srcu_struct(&c->btree_trans_barrier);
 }
@@ -2407,5 +2434,7 @@ int bch2_fs_btree_iter_init(struct bch_fs *c)
                mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
                        sizeof(struct btree_iter) * nr +
                        sizeof(struct btree_insert_entry) * nr +
-                       sizeof(struct btree_insert_entry) * nr);
+                       sizeof(struct btree_insert_entry) * nr) ?:
+               mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1,
+                                         BTREE_TRANS_MEM_MAX);
 }
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index 53191c99e5906227ed1f009adff7b9eb7d0ce0f2..a5181a96397a24637c9fe95c31e9fae1cccd875b 100644 (file)
@@ -218,8 +218,14 @@ static int btree_key_cache_fill(struct btree_trans *trans,
                goto err;
        }
 
-       if (k.k->u64s > ck->u64s) {
-               new_u64s = roundup_pow_of_two(k.k->u64s);
+       /*
+        * bch2_varint_decode can read past the end of the buffer by at
+        * most 7 bytes (it won't be used):
+        */
+       new_u64s = k.k->u64s + 1;
+
+       if (new_u64s > ck->u64s) {
+               new_u64s = roundup_pow_of_two(new_u64s);
                new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS);
                if (!new_k) {
                        ret = -ENOMEM;
@@ -385,12 +391,18 @@ retry:
                goto evict;
        }
 
+       /*
+        * Since journal reclaim depends on us making progress here, and the
+        * allocator/copygc depend on journal reclaim making progress, we need
+        * to be using alloc reserves:
+        * */
        ret   = bch2_btree_iter_traverse(b_iter) ?:
                bch2_trans_update(trans, b_iter, ck->k, BTREE_TRIGGER_NORUN) ?:
                bch2_trans_commit(trans, NULL, NULL,
                                  BTREE_INSERT_NOUNLOCK|
                                  BTREE_INSERT_NOCHECK_RW|
                                  BTREE_INSERT_NOFAIL|
+                                 BTREE_INSERT_USE_RESERVE|
                                  (ck->journal.seq == journal_last_seq(j)
                                   ? BTREE_INSERT_JOURNAL_RESERVED
                                   : 0)|
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index f942ccf62ff428a07bca323703e8e0a3b04542d8..06a2c412db7a2bedad619f6498dd10656683925c 100644 (file)
@@ -352,6 +352,8 @@ struct btree_trans_commit_hook {
        struct btree_trans_commit_hook  *next;
 };
 
+#define BTREE_TRANS_MEM_MAX    4096
+
 struct btree_trans {
        struct bch_fs           *c;
 #ifdef CONFIG_BCACHEFS_DEBUG
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 07c925345675369fa62b81d8f66e074e5cad0b3c..87426d171687e61c2747320a195524115fd4c59b 100644 (file)
@@ -887,6 +887,14 @@ void bch2_btree_interior_update_will_free_node(struct btree_update *as,
        btree_update_drop_new_node(c, b);
 
        btree_update_will_delete_key(as, &b->key);
+
+       /*
+        * XXX: Waiting on io with btree node locks held, we don't want to be
+        * doing this. We can't have btree writes happening after the space has
+        * been freed, but we really only need to block before
+        * btree_update_nodes_written_trans() happens.
+        */
+       btree_node_wait_on_io(b);
 }
 
 void bch2_btree_update_done(struct btree_update *as)
@@ -1146,6 +1154,24 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
        set_btree_node_need_write(b);
 }
 
+static void
+__bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
+                                 struct btree_iter *iter, struct keylist *keys,
+                                 struct btree_node_iter node_iter)
+{
+       struct bkey_i *insert = bch2_keylist_front(keys);
+       struct bkey_packed *k;
+
+       BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);
+
+       while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
+              (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
+               ;
+
+       for_each_keylist_key(keys, insert)
+               bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter);
+}
+
 /*
  * Move keys from n1 (original replacement node, now lower node) to n2 (higher
  * node)
@@ -1276,16 +1302,9 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
        struct bkey_packed *src, *dst, *n;
        struct bset *i;
 
-       BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);
-
        bch2_btree_node_iter_init(&node_iter, b, &k->k.p);
 
-       while (!bch2_keylist_empty(keys)) {
-               k = bch2_keylist_front(keys);
-
-               bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter);
-               bch2_keylist_pop_front(keys);
-       }
+       __bch2_btree_insert_keys_interior(as, b, iter, keys, node_iter);
 
        /*
         * We can't tolerate whiteouts here - with whiteouts there can be
@@ -1431,24 +1450,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b,
                                struct btree_iter *iter, struct keylist *keys)
 {
        struct btree_iter *linked;
-       struct btree_node_iter node_iter;
-       struct bkey_i *insert = bch2_keylist_front(keys);
-       struct bkey_packed *k;
-
-       /* Don't screw up @iter's position: */
-       node_iter = iter->l[b->c.level].iter;
-
-       /*
-        * btree_split(), btree_gc_coalesce() will insert keys before
-        * the iterator's current position - they know the keys go in
-        * the node the iterator points to:
-        */
-       while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
-              (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
-               ;
 
-       for_each_keylist_key(keys, insert)
-               bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter);
+       __bch2_btree_insert_keys_interior(as, b, iter, keys, iter->l[b->c.level].iter);
 
        btree_update_updated_node(as, b);
 
@@ -1598,7 +1601,19 @@ retry:
                next = m;
        }
 
-       BUG_ON(bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key));
+       if (bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)) {
+               char buf1[100], buf2[100];
+
+               bch2_bpos_to_text(&PBUF(buf1), prev->data->max_key);
+               bch2_bpos_to_text(&PBUF(buf2), next->data->min_key);
+               bch2_fs_inconsistent(c,
+                                    "btree topology error in btree merge:\n"
+                                    "prev ends at   %s\n"
+                                    "next starts at %s\n",
+                                    buf1, buf2);
+               ret = -EIO;
+               goto err;
+       }
 
        bch2_bkey_format_init(&new_s);
        bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index afdcc98dfb83985ad2033c49feb18ef32c1c3af8..b793ab77e452a21bf8a200d2bd9a63a4135bda0e 100644 (file)
@@ -293,6 +293,12 @@ btree_key_can_insert_cached(struct btree_trans *trans,
            !(trans->flags & BTREE_INSERT_JOURNAL_RECLAIM))
                return BTREE_INSERT_NEED_JOURNAL_RECLAIM;
 
+       /*
+        * bch2_varint_decode can read past the end of the buffer by at most 7
+        * bytes (it won't be used):
+        */
+       u64s += 1;
+
        if (u64s <= ck->u64s)
                return BTREE_INSERT_OK;
 
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 6b99f12708200d53d352a5323e9dbe9e179b208b..c3ad0bc85e78a21fd33b62e22b38514bf3d099e1 100644 (file)
@@ -396,20 +396,22 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
                bch2_wake_allocator(ca);
 }
 
-static inline void update_replicas(struct bch_fs *c,
+static inline int update_replicas(struct bch_fs *c,
                                   struct bch_fs_usage *fs_usage,
                                   struct bch_replicas_entry *r,
                                   s64 sectors)
 {
        int idx = bch2_replicas_entry_idx(c, r);
 
-       BUG_ON(idx < 0);
+       if (idx < 0)
+               return -1;
 
        fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
        fs_usage->replicas[idx]         += sectors;
+       return 0;
 }
 
-static inline void update_cached_sectors(struct bch_fs *c,
+static inline int update_cached_sectors(struct bch_fs *c,
                                         struct bch_fs_usage *fs_usage,
                                         unsigned dev, s64 sectors)
 {
@@ -417,7 +419,7 @@ static inline void update_cached_sectors(struct bch_fs *c,
 
        bch2_replicas_entry_cached(&r.e, dev);
 
-       update_replicas(c, fs_usage, &r.e, sectors);
+       return update_replicas(c, fs_usage, &r.e, sectors);
 }
 
 static struct replicas_delta_list *
@@ -425,10 +427,26 @@ replicas_deltas_realloc(struct btree_trans *trans, unsigned more)
 {
        struct replicas_delta_list *d = trans->fs_usage_deltas;
        unsigned new_size = d ? (d->size + more) * 2 : 128;
+       unsigned alloc_size = sizeof(*d) + new_size;
+
+       WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX);
 
        if (!d || d->used + more > d->size) {
-               d = krealloc(d, sizeof(*d) + new_size, GFP_NOIO|__GFP_ZERO);
-               BUG_ON(!d);
+               d = krealloc(d, alloc_size, GFP_NOIO|__GFP_ZERO);
+
+               BUG_ON(!d && alloc_size > REPLICAS_DELTA_LIST_MAX);
+
+               if (!d) {
+                       d = mempool_alloc(&trans->c->replicas_delta_pool, GFP_NOIO);
+                       memset(d, 0, REPLICAS_DELTA_LIST_MAX);
+
+                       if (trans->fs_usage_deltas)
+                               memcpy(d, trans->fs_usage_deltas,
+                                      trans->fs_usage_deltas->size + sizeof(*d));
+
+                       new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d);
+                       kfree(trans->fs_usage_deltas);
+               }
 
                d->size = new_size;
                trans->fs_usage_deltas = d;
@@ -553,8 +571,12 @@ static int bch2_mark_alloc(struct bch_fs *c,
 
        if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
            old_m.cached_sectors) {
-               update_cached_sectors(c, fs_usage, ca->dev_idx,
-                                     -old_m.cached_sectors);
+               if (update_cached_sectors(c, fs_usage, ca->dev_idx,
+                                     -old_m.cached_sectors)) {
+                       bch2_fs_fatal_error(c, "bch2_mark_alloc(): no replicas entry while updating cached sectors");
+                       return -1;
+               }
+
                trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset),
                                 old_m.cached_sectors);
        }
@@ -936,8 +958,12 @@ static int bch2_mark_extent(struct bch_fs *c,
 
                if (p.ptr.cached) {
                        if (!stale)
-                               update_cached_sectors(c, fs_usage, p.ptr.dev,
-                                                     disk_sectors);
+                               if (update_cached_sectors(c, fs_usage, p.ptr.dev,
+                                                         disk_sectors)) {
+                                       bch2_fs_fatal_error(c, "bch2_mark_extent(): no replicas entry while updating cached sectors");
+                                       return -1;
+
+                               }
                } else if (!p.has_ec) {
                        dirty_sectors          += disk_sectors;
                        r.e.devs[r.e.nr_devs++] = p.ptr.dev;
@@ -956,8 +982,15 @@ static int bch2_mark_extent(struct bch_fs *c,
                }
        }
 
-       if (r.e.nr_devs)
-               update_replicas(c, fs_usage, &r.e, dirty_sectors);
+       if (r.e.nr_devs) {
+               if (update_replicas(c, fs_usage, &r.e, dirty_sectors)) {
+                       char buf[200];
+
+                       bch2_bkey_val_to_text(&PBUF(buf), c, k);
+                       bch2_fs_fatal_error(c, "no replicas entry for %s", buf);
+                       return -1;
+               }
+       }
 
        return 0;
 }
@@ -1031,8 +1064,14 @@ static int bch2_mark_stripe(struct bch_fs *c,
                                return ret;
                }
 
-               update_replicas(c, fs_usage, &m->r.e,
-                               ((s64) m->sectors * m->nr_redundant));
+               if (update_replicas(c, fs_usage, &m->r.e,
+                               ((s64) m->sectors * m->nr_redundant))) {
+                       char buf[200];
+
+                       bch2_bkey_val_to_text(&PBUF(buf), c, new);
+                       bch2_fs_fatal_error(c, "no replicas entry for %s", buf);
+                       return -1;
+               }
        }
 
        return 0;
@@ -1292,7 +1331,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
                        added += d->delta;
                }
 
-               update_replicas(c, dst, &d->r, d->delta);
+               BUG_ON(update_replicas(c, dst, &d->r, d->delta));
        }
 
        dst->nr_inodes += deltas->nr_inodes;
diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c
index 90364b55aa40a29a8d8bfce01c0e573ec9723c5b..4215c119e0a27a9f31bc226e2c7074d721e1879d 100644 (file)
 
 static struct dentry *bch_debug;
 
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
+static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
+                                     struct extent_ptr_decoded pick)
 {
        struct btree *v = c->verify_data;
-       struct btree_node *n_ondisk, *n_sorted, *n_inmemory;
-       struct bset *sorted, *inmemory;
-       struct extent_ptr_decoded pick;
-       struct bch_dev *ca;
+       struct btree_node *n_ondisk = c->verify_ondisk;
+       struct btree_node *n_sorted = c->verify_data->data;
+       struct bset *sorted, *inmemory = &b->data->keys;
+       struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev);
        struct bio *bio;
+       bool failed = false;
 
-       if (c->opts.nochanges)
-               return;
-
-       btree_node_io_lock(b);
-       mutex_lock(&c->verify_lock);
-
-       n_ondisk = c->verify_ondisk;
-       n_sorted = c->verify_data->data;
-       n_inmemory = b->data;
-
-       bkey_copy(&v->key, &b->key);
-       v->written      = 0;
-       v->c.level      = b->c.level;
-       v->c.btree_id   = b->c.btree_id;
-       bch2_btree_keys_init(v);
-
-       if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key),
-                                      NULL, &pick) <= 0)
-               return;
-
-       ca = bch_dev_bkey_exists(c, pick.ptr.dev);
        if (!bch2_dev_get_ioref(ca, READ))
-               return;
+               return false;
 
        bio = bio_alloc_bioset(GFP_NOIO,
                        buf_pages(n_sorted, btree_bytes(c)),
@@ -79,12 +58,12 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
 
        memcpy(n_ondisk, n_sorted, btree_bytes(c));
 
+       v->written = 0;
        if (bch2_btree_node_read_done(c, ca, v, false))
-               goto out;
+               return false;
 
        n_sorted = c->verify_data->data;
        sorted = &n_sorted->keys;
-       inmemory = &n_inmemory->keys;
 
        if (inmemory->u64s != sorted->u64s ||
            memcmp(inmemory->start,
@@ -102,8 +81,8 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
                printk(KERN_ERR "*** read back in:\n");
                bch2_dump_bset(c, v, sorted, 0);
 
-               while (offset < b->written) {
-                       if (!offset ) {
+               while (offset < v->written) {
+                       if (!offset) {
                                i = &n_ondisk->keys;
                                sectors = vstruct_blocks(n_ondisk, c->block_bits) <<
                                        c->block_bits;
@@ -122,25 +101,84 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
                        offset += sectors;
                }
 
-               printk(KERN_ERR "*** block %u/%u not written\n",
-                      offset >> c->block_bits, btree_blocks(c));
-
                for (j = 0; j < le16_to_cpu(inmemory->u64s); j++)
                        if (inmemory->_data[j] != sorted->_data[j])
                                break;
 
-               printk(KERN_ERR "b->written %u\n", b->written);
-
                console_unlock();
-               panic("verify failed at %u\n", j);
+               bch_err(c, "verify failed at key %u", j);
+
+               failed = true;
+       }
+
+       if (v->written != b->written) {
+               bch_err(c, "written wrong: expected %u, got %u",
+                       b->written, v->written);
+               failed = true;
+       }
+
+       return failed;
+}
+
+void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
+{
+       struct bkey_ptrs_c ptrs;
+       struct extent_ptr_decoded p;
+       const union bch_extent_entry *entry;
+       struct btree *v;
+       struct bset *inmemory = &b->data->keys;
+       struct bkey_packed *k;
+       bool failed = false;
+
+       if (c->opts.nochanges)
+               return;
+
+       btree_node_io_lock(b);
+       mutex_lock(&c->verify_lock);
+
+       if (!c->verify_ondisk) {
+               c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
+               if (!c->verify_ondisk)
+                       goto out;
+       }
+
+       if (!c->verify_data) {
+               c->verify_data = __bch2_btree_node_mem_alloc(c);
+               if (!c->verify_data)
+                       goto out;
+
+               list_del_init(&c->verify_data->list);
+       }
+
+       BUG_ON(b->nsets != 1);
+
+       for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_next(k))
+               if (k->type == KEY_TYPE_btree_ptr_v2) {
+                       struct bch_btree_ptr_v2 *v = (void *) bkeyp_val(&b->format, k);
+                       v->mem_ptr = 0;
+               }
+
+       v = c->verify_data;
+       bkey_copy(&v->key, &b->key);
+       v->c.level      = b->c.level;
+       v->c.btree_id   = b->c.btree_id;
+       bch2_btree_keys_init(v);
+
+       ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key));
+       bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry)
+               failed |= bch2_btree_verify_replica(c, b, p);
+
+       if (failed) {
+               char buf[200];
+
+               bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(&b->key));
+               bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf);
        }
 out:
        mutex_unlock(&c->verify_lock);
        btree_node_io_unlock(b);
 }
 
-#endif
-
 #ifdef CONFIG_DEBUG_FS
 
 /* XXX: bch_fs refcounting */
index 7ac1615e9447db326533ac6db5c1b129f02481a2..0b86736e5e1bea3ae631f1484a6e83fc7e5deba8 100644 (file)
@@ -8,11 +8,7 @@ struct bio;
 struct btree;
 struct bch_fs;
 
-#ifdef CONFIG_BCACHEFS_DEBUG
 void __bch2_btree_verify(struct bch_fs *, struct btree *);
-#else
-static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {}
-#endif
 
 static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b)
 {
index f712f685dd0e09122dd4b91a4a63ac2e9fd83cd2..7062ab9c58f18ad89a7ff893fcd43337dffc4f6e 100644 (file)
@@ -1621,6 +1621,7 @@ int bch2_stripes_write(struct bch_fs *c, unsigned flags)
                if (ret)
                        break;
        }
+       bch2_trans_iter_put(&trans, iter);
 
        bch2_trans_exit(&trans);
 
index eb8ac164b0b618d3ad1eb0d1224a0a0caeab1588..26fbd8c2f03df7f6b90e30fc3e6ce88887b22fa3 100644 (file)
@@ -38,9 +38,9 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum)
        return ret ?: sectors;
 }
 
-static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
-                       struct bch_inode_unpacked *inode,
-                       u32 *snapshot)
+static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
+                         struct bch_inode_unpacked *inode,
+                         u32 *snapshot)
 {
        struct btree_iter *iter;
        struct bkey_s_c k;
@@ -63,19 +63,34 @@ err:
        return ret;
 }
 
-static int write_inode(struct btree_trans *trans,
-                      struct bch_inode_unpacked *inode,
-                      u32 snapshot)
+static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
+                       struct bch_inode_unpacked *inode,
+                       u32 *snapshot)
+{
+       return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot));
+}
+
+static int __write_inode(struct btree_trans *trans,
+                        struct bch_inode_unpacked *inode,
+                        u32 snapshot)
 {
        struct btree_iter *inode_iter =
                bch2_trans_get_iter(trans, BTREE_ID_inodes,
                                    SPOS(0, inode->bi_inum, snapshot),
                                    BTREE_ITER_INTENT);
+       int ret = bch2_inode_write(trans, inode_iter, inode);
+       bch2_trans_iter_put(trans, inode_iter);
+       return ret;
+}
+
+static int write_inode(struct btree_trans *trans,
+                      struct bch_inode_unpacked *inode,
+                      u32 snapshot)
+{
        int ret = __bch2_trans_do(trans, NULL, NULL,
                                  BTREE_INSERT_NOFAIL|
                                  BTREE_INSERT_LAZY_RW,
-                                 bch2_inode_write(trans, inode_iter, inode));
-       bch2_trans_iter_put(trans, inode_iter);
+                                 __write_inode(trans, inode, snapshot));
        if (ret)
                bch_err(trans->c, "error in fsck: error %i updating inode", ret);
        return ret;
@@ -114,57 +129,101 @@ static int remove_dirent(struct btree_trans *trans, struct bpos pos)
        return ret;
 }
 
-static int __reattach_inode(struct btree_trans *trans,
-                           struct bch_inode_unpacked *lostfound,
-                           u64 inum)
+/* Get lost+found, create if it doesn't exist: */
+static int lookup_lostfound(struct btree_trans *trans,
+                           struct bch_inode_unpacked *lostfound)
 {
-       struct bch_hash_info dir_hash =
-               bch2_hash_info_init(trans->c, lostfound);
-       struct bch_inode_unpacked inode_u;
+       struct bch_fs *c = trans->c;
+       struct bch_inode_unpacked root;
+       struct bch_hash_info root_hash_info;
+       struct qstr lostfound_str = QSTR("lost+found");
+       u64 inum;
+       u32 snapshot;
+       int ret;
+
+       ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root, &snapshot);
+       if (ret && ret != -ENOENT)
+               return ret;
+
+       root_hash_info = bch2_hash_info_init(c, &root);
+       inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info,
+                                 &lostfound_str);
+       if (!inum) {
+               bch_notice(c, "creating lost+found");
+               goto create_lostfound;
+       }
+
+       ret = lookup_inode(trans, inum, lostfound, &snapshot);
+       if (ret && ret != -ENOENT) {
+               /*
+                * The check_dirents pass has already run, dangling dirents
+                * shouldn't exist here:
+                */
+               bch_err(c, "error looking up lost+found: %i", ret);
+               return ret;
+       }
+
+       if (ret == -ENOENT) {
+create_lostfound:
+               bch2_inode_init_early(c, lostfound);
+
+               ret = __bch2_trans_do(trans, NULL, NULL,
+                                     BTREE_INSERT_NOFAIL|
+                                     BTREE_INSERT_LAZY_RW,
+                       bch2_create_trans(trans,
+                                         BCACHEFS_ROOT_INO, &root,
+                                         lostfound,
+                                         &lostfound_str,
+                                         0, 0, S_IFDIR|0700, 0, NULL, NULL));
+               if (ret)
+                       bch_err(c, "error creating lost+found: %i", ret);
+       }
+
+       return 0;
+}
+
+static int reattach_inode(struct btree_trans *trans,
+                         struct bch_inode_unpacked *inode)
+{
+       struct bch_hash_info dir_hash;
+       struct bch_inode_unpacked lostfound;
        char name_buf[20];
        struct qstr name;
        u64 dir_offset = 0;
-       u32 snapshot;
        int ret;
 
-       snprintf(name_buf, sizeof(name_buf), "%llu", inum);
-       name = (struct qstr) QSTR(name_buf);
-
-       ret = lookup_inode(trans, inum, &inode_u, &snapshot);
+       ret = lookup_lostfound(trans, &lostfound);
        if (ret)
                return ret;
 
-       if (S_ISDIR(inode_u.bi_mode)) {
-               lostfound->bi_nlink++;
+       if (S_ISDIR(inode->bi_mode)) {
+               lostfound.bi_nlink++;
 
-               ret = write_inode(trans, lostfound, U32_MAX);
+               ret = write_inode(trans, &lostfound, U32_MAX);
                if (ret)
                        return ret;
        }
 
-       ret = bch2_dirent_create(trans, lostfound->bi_inum, &dir_hash,
-                                mode_to_type(inode_u.bi_mode),
-                                &name, inum, &dir_offset,
-                                BCH_HASH_SET_MUST_CREATE);
-       if (ret)
-               return ret;
+       dir_hash = bch2_hash_info_init(trans->c, &lostfound);
 
-       inode_u.bi_dir          = lostfound->bi_inum;
-       inode_u.bi_dir_offset   = dir_offset;
+       snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
+       name = (struct qstr) QSTR(name_buf);
 
-       return write_inode(trans, &inode_u, U32_MAX);
-}
+       ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
+               bch2_dirent_create(trans, lostfound.bi_inum, &dir_hash,
+                                  mode_to_type(inode->bi_mode),
+                                  &name, inode->bi_inum, &dir_offset,
+                                  BCH_HASH_SET_MUST_CREATE));
+       if (ret) {
+               bch_err(trans->c, "error %i reattaching inode %llu",
+                       ret, inode->bi_inum);
+               return ret;
+       }
 
-static int reattach_inode(struct btree_trans *trans,
-                         struct bch_inode_unpacked *lostfound,
-                         u64 inum)
-{
-       int ret = __bch2_trans_do(trans, NULL, NULL, BTREE_INSERT_LAZY_RW,
-                             __reattach_inode(trans, lostfound, inum));
-       if (ret)
-               bch_err(trans->c, "error %i reattaching inode %llu", ret, inum);
+       inode->bi_dir           = lostfound.bi_inum;
+       inode->bi_dir_offset    = dir_offset;
 
-       return ret;
+       return write_inode(trans, inode, U32_MAX);
 }
 
 static int remove_backpointer(struct btree_trans *trans,
@@ -931,58 +990,6 @@ create_root:
                                 BTREE_INSERT_LAZY_RW);
 }
 
-/* Get lost+found, create if it doesn't exist: */
-static int check_lostfound(struct bch_fs *c,
-                          struct bch_inode_unpacked *root_inode,
-                          struct bch_inode_unpacked *lostfound_inode)
-{
-       struct qstr lostfound = QSTR("lost+found");
-       struct bch_hash_info root_hash_info =
-               bch2_hash_info_init(c, root_inode);
-       u64 inum;
-       u32 snapshot;
-       int ret;
-
-       bch_verbose(c, "checking lost+found");
-
-       inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info,
-                                &lostfound);
-       if (!inum) {
-               bch_notice(c, "creating lost+found");
-               goto create_lostfound;
-       }
-
-       ret = bch2_trans_do(c, NULL, NULL, 0,
-               lookup_inode(&trans, inum, lostfound_inode, &snapshot));
-       if (ret && ret != -ENOENT)
-               return ret;
-
-       if (fsck_err_on(ret, c, "lost+found missing"))
-               goto create_lostfound;
-
-       if (fsck_err_on(!S_ISDIR(lostfound_inode->bi_mode), c,
-                       "lost+found inode not a directory"))
-               goto create_lostfound;
-
-       return 0;
-fsck_err:
-       return ret;
-create_lostfound:
-       bch2_inode_init_early(c, lostfound_inode);
-
-       ret = bch2_trans_do(c, NULL, NULL,
-                           BTREE_INSERT_NOFAIL|
-                           BTREE_INSERT_LAZY_RW,
-               bch2_create_trans(&trans,
-                                 BCACHEFS_ROOT_INO, root_inode,
-                                 lostfound_inode, &lostfound,
-                                 0, 0, S_IFDIR|0700, 0, NULL, NULL));
-       if (ret)
-               bch_err(c, "error creating lost+found: %i", ret);
-
-       return ret;
-}
-
 struct pathbuf {
        size_t          nr;
        size_t          size;
@@ -1014,7 +1021,6 @@ static int path_down(struct pathbuf *p, u64 inum)
 }
 
 static int check_path(struct btree_trans *trans,
-                     struct bch_inode_unpacked *lostfound,
                      struct pathbuf *p,
                      struct bch_inode_unpacked *inode)
 {
@@ -1038,7 +1044,7 @@ static int check_path(struct btree_trans *trans,
                                     inode->bi_nlink,
                                     inode->bi_dir,
                                     inode->bi_dir_offset))
-                               ret = reattach_inode(trans, lostfound, inode->bi_inum);
+                               ret = reattach_inode(trans, inode);
                        break;
                }
                ret = 0;
@@ -1067,12 +1073,11 @@ static int check_path(struct btree_trans *trans,
                                break;
                        }
 
-                       ret = reattach_inode(trans, lostfound, inode->bi_inum);
+                       ret = reattach_inode(trans, inode);
                        break;
                }
 
-               ret = lockrestart_do(trans,
-                               lookup_inode(trans, inode->bi_dir, inode, &snapshot));
+               ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
                if (ret) {
                        /* Should have been caught in dirents pass */
                        bch_err(c, "error looking up parent directory: %i", ret);
@@ -1090,8 +1095,7 @@ fsck_err:
  * After check_dirents(), if an inode backpointer doesn't exist that means it's
  * unreachable:
  */
-static int check_directory_structure(struct bch_fs *c,
-                                    struct bch_inode_unpacked *lostfound)
+static int check_directory_structure(struct bch_fs *c)
 {
        struct btree_trans trans;
        struct btree_iter *iter;
@@ -1113,7 +1117,7 @@ static int check_directory_structure(struct bch_fs *c,
                        break;
                }
 
-               ret = check_path(&trans, lostfound, &path, &u);
+               ret = check_path(&trans, &path, &u);
                if (ret)
                        break;
        }
@@ -1190,7 +1194,6 @@ static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links,
 }
 
 static int check_inode_nlink(struct btree_trans *trans,
-                            struct bch_inode_unpacked *lostfound_inode,
                             struct btree_iter *iter,
                             struct bkey_s_c_inode inode,
                             unsigned nlink)
@@ -1238,7 +1241,6 @@ fsck_err:
 
 noinline_for_stack
 static int bch2_gc_walk_inodes(struct bch_fs *c,
-                              struct bch_inode_unpacked *lostfound_inode,
                               nlink_table *links,
                               u64 range_start, u64 range_end)
 {
@@ -1259,7 +1261,7 @@ static int bch2_gc_walk_inodes(struct bch_fs *c,
                        continue;
 
                link = genradix_ptr(links, k.k->p.offset - range_start);
-               ret = check_inode_nlink(&trans, lostfound_inode, iter,
+               ret = check_inode_nlink(&trans, iter,
                                        bkey_s_c_to_inode(k), link ? link->count : 0);
                if (ret)
                        break;
@@ -1275,8 +1277,7 @@ static int bch2_gc_walk_inodes(struct bch_fs *c,
 }
 
 noinline_for_stack
-static int check_nlinks(struct bch_fs *c,
-                             struct bch_inode_unpacked *lostfound_inode)
+static int check_nlinks(struct bch_fs *c)
 {
        nlink_table links;
        u64 this_iter_range_start, next_iter_range_start = 0;
@@ -1296,7 +1297,7 @@ static int check_nlinks(struct bch_fs *c,
                if (ret)
                        break;
 
-               ret = bch2_gc_walk_inodes(c, lostfound_inode, &links,
+               ret = bch2_gc_walk_inodes(c, &links,
                                         this_iter_range_start,
                                         next_iter_range_start);
                if (ret)
@@ -1316,16 +1317,15 @@ static int check_nlinks(struct bch_fs *c,
  */
 int bch2_fsck_full(struct bch_fs *c)
 {
-       struct bch_inode_unpacked root_inode, lostfound_inode;
+       struct bch_inode_unpacked root_inode;
 
        return  check_inodes(c, true) ?:
                check_extents(c) ?:
                check_dirents(c) ?:
                check_xattrs(c) ?:
                check_root(c, &root_inode) ?:
-               check_lostfound(c, &root_inode, &lostfound_inode) ?:
-               check_directory_structure(c, &lostfound_inode) ?:
-               check_nlinks(c, &lostfound_inode);
+               check_directory_structure(c) ?:
+               check_nlinks(c);
 }
 
 int bch2_fsck_walk_inodes_only(struct bch_fs *c)
index f117d361d58479c81f7d3e9c491505ada9f40cec..24d04e51fb618d5bc5e7e3307ed1050d683f6c49 100644 (file)
@@ -634,7 +634,7 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
                               msecs_to_jiffies(j->reclaim_delay_ms)))
                        min_nr = 1;
 
-               if (j->prereserved.reserved * 2 > j->prereserved.remaining)
+               if (j->prereserved.reserved * 4 > j->prereserved.remaining)
                        min_nr = 1;
 
                if (fifo_free(&j->pin) <= 32)
index 864dfaa67b7a4cb255724409297adae74743b849..cda77835b9ea62381f3962a1d0029d463fe3b2b1 100644 (file)
@@ -62,6 +62,6 @@ void bch2_verify_keylist_sorted(struct keylist *l)
 
        for_each_keylist_key(l, k)
                BUG_ON(bkey_next(k) != l->top &&
-                      bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0);
+                      bpos_cmp(k->k.p, bkey_next(k)->k.p) >= 0);
 }
 #endif
index aa8e8c25402fa55ba8b392cd257441f10fd61108..778ff72cf5b257200dc831331f7c6f2eaf8e0695 100644 (file)
@@ -762,7 +762,7 @@ static int bch2_move_btree(struct bch_fs *c,
                                    id == start_btree_id ? start_pos : POS_MIN,
                                    BTREE_ITER_PREFETCH, b) {
                        if (kthread && kthread_should_stop())
-                               goto out;
+                               break;
 
                        if ((cmp_int(id, end_btree_id) ?:
                             bkey_cmp(b->key.k.p, end_pos)) > 0)
@@ -789,8 +789,10 @@ next:
                }
 
                ret = bch2_trans_iter_free(&trans, iter) ?: ret;
+               if (kthread && kthread_should_stop())
+                       break;
        }
-out:
+
        bch2_trans_exit(&trans);
 
        if (ret)
index 80772cff0f9dd84740cdc8dc9b72bcbba45bd04a..4ac7e61fb8413fd04e211222fff1793fa80335cd 100644 (file)
@@ -87,9 +87,20 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg,
                if (i >= 0 &&
                    p.ptr.offset < h->data[i].offset + ca->mi.bucket_size &&
                    p.ptr.gen == h->data[i].gen) {
+                       /*
+                        * We need to use the journal reserve here, because
+                        *  - journal reclaim depends on btree key cache
+                        *    flushing to make forward progress,
+                        *  - which has to make forward progress when the
+                        *    journal is pre-reservation full,
+                        *  - and depends on allocation - meaning allocator and
+                        *    copygc
+                        */
+
                        data_opts->target               = io_opts->background_target;
                        data_opts->nr_replicas          = 1;
-                       data_opts->btree_insert_flags   = BTREE_INSERT_USE_RESERVE;
+                       data_opts->btree_insert_flags   = BTREE_INSERT_USE_RESERVE|
+                               BTREE_INSERT_JOURNAL_RESERVED;
                        data_opts->rewrite_dev          = p.ptr.dev;
 
                        if (p.has_ec)
index 4128a1b3ad0014b5f6c6758a6f31e2e9a523f636..8e6cccd3938373bd2cb5d531931d71c04cff398a 100644 (file)
@@ -1063,11 +1063,27 @@ unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca)
        return ret;
 }
 
+void bch2_fs_replicas_exit(struct bch_fs *c)
+{
+       unsigned i;
+
+       kfree(c->usage_scratch);
+       for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+               free_percpu(c->usage[i]);
+       kfree(c->usage_base);
+       kfree(c->replicas.entries);
+       kfree(c->replicas_gc.entries);
+
+       mempool_exit(&c->replicas_delta_pool);
+}
+
 int bch2_fs_replicas_init(struct bch_fs *c)
 {
        bch2_journal_entry_res_resize(&c->journal,
                        &c->replicas_journal_res,
                        reserve_journal_replicas(c, &c->replicas));
 
-       return replicas_table_update(c, &c->replicas);
+       return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1,
+                                        REPLICAS_DELTA_LIST_MAX) ?:
+               replicas_table_update(c, &c->replicas);
 }
index c77e873efc340555368b30786f74b308543556d2..72ac544f16d8d43cf0d456fba1716e79c3929347 100644 (file)
@@ -102,6 +102,7 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *);
 extern const struct bch_sb_field_ops bch_sb_field_ops_replicas;
 extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0;
 
+void bch2_fs_replicas_exit(struct bch_fs *);
 int bch2_fs_replicas_init(struct bch_fs *);
 
 #endif /* _BCACHEFS_REPLICAS_H */
index 61fd11446df7e77c3fc04581385acf9ad2cf1d64..b6e449a7a4d8785417746015db2f2da0dd97f70d 100644 (file)
@@ -477,6 +477,7 @@ static void __bch2_fs_free(struct bch_fs *c)
        bch2_fs_btree_iter_exit(c);
        bch2_fs_btree_key_cache_exit(&c->btree_key_cache);
        bch2_fs_btree_cache_exit(c);
+       bch2_fs_replicas_exit(c);
        bch2_fs_journal_exit(&c->journal);
        bch2_io_clock_exit(&c->io_clock[WRITE]);
        bch2_io_clock_exit(&c->io_clock[READ]);
@@ -484,10 +485,6 @@ static void __bch2_fs_free(struct bch_fs *c)
        bch2_journal_keys_free(&c->journal_keys);
        bch2_journal_entries_free(&c->journal_entries);
        percpu_free_rwsem(&c->mark_lock);
-       kfree(c->usage_scratch);
-       for (i = 0; i < ARRAY_SIZE(c->usage); i++)
-               free_percpu(c->usage[i]);
-       kfree(c->usage_base);
 
        if (c->btree_iters_bufs)
                for_each_possible_cpu(cpu)
@@ -501,8 +498,6 @@ static void __bch2_fs_free(struct bch_fs *c)
        bioset_exit(&c->btree_bio);
        mempool_exit(&c->fill_iter);
        percpu_ref_exit(&c->writes);
-       kfree(c->replicas.entries);
-       kfree(c->replicas_gc.entries);
        kfree(rcu_dereference_protected(c->disk_groups, 1));
        kfree(c->journal_seq_blacklist_table);
        kfree(c->unused_inode_hints);