git.sesse.net Git - bcachefs-tools-debian/commitdiff
Update bcachefs sources to 787de128a5 bcachefs: Improvements to fsck check_dirents()
author Kent Overstreet <kent.overstreet@gmail.com>
Tue, 13 Jul 2021 20:31:40 +0000 (16:31 -0400)
committer Kent Overstreet <kent.overstreet@gmail.com>
Thu, 15 Jul 2021 00:31:38 +0000 (20:31 -0400)
21 files changed:
.bcachefs_revision
include/linux/blkdev.h
libbcachefs/alloc_background.c
libbcachefs/alloc_background.h
libbcachefs/bcachefs_format.h
libbcachefs/btree_cache.c
libbcachefs/btree_io.c
libbcachefs/btree_io.h
libbcachefs/btree_iter.c
libbcachefs/btree_iter.h
libbcachefs/btree_key_cache.c
libbcachefs/btree_update_interior.c
libbcachefs/dirent.c
libbcachefs/fs-io.c
libbcachefs/fsck.c
libbcachefs/inode.c
libbcachefs/opts.c
libbcachefs/opts.h
libbcachefs/sysfs.c
libbcachefs/varint.c
libbcachefs/varint.h

diff --git a/.bcachefs_revision b/.bcachefs_revision
index 06ebd7da7856e400358644295c9046daa3af268c..46d09322771f1cd061c5152098abea04efe19c22 100644 (file)
@@ -1 +1 @@
-400c2f8d960ac55105bd22905a6ea1a40daa7f4f
+787de128a5caf209845e5a8d0f14f24e1a42492c
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f60972c76b14488770eb1068d9133d04e946edbe..35082ae30460911d26fc11bb1f052724c1d62378 100644 (file)
@@ -105,6 +105,7 @@ struct super_block {
 #define DT_LNK         10
 #define DT_SOCK                12
 #define DT_WHT         14
+#define DT_MAX         16
 #endif
 
 /*
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index 2324b81c09cec79f56bb54231457948131d9b4a4..fff85c17e55f16fcbade5cd4db03bab41bd786b3 100644 (file)
@@ -130,7 +130,7 @@ static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
 
 #define x(_name, _bits)                                                        \
        if (fieldnr < a.v->nr_fields) {                                 \
-               ret = bch2_varint_decode(in, end, &v);                  \
+               ret = bch2_varint_decode_fast(in, end, &v);             \
                if (ret < 0)                                            \
                        return ret;                                     \
                in += ret;                                              \
@@ -166,7 +166,7 @@ static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst,
        nr_fields++;                                                    \
                                                                        \
        if (src._name) {                                                \
-               out += bch2_varint_encode(out, src._name);              \
+               out += bch2_varint_encode_fast(out, src._name);         \
                                                                        \
                last_nonzero_field = out;                               \
                last_nonzero_fieldnr = nr_fields;                       \
@@ -1232,3 +1232,22 @@ void bch2_fs_allocator_background_init(struct bch_fs *c)
 {
        spin_lock_init(&c->freelist_lock);
 }
+
+void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
+{
+       struct open_bucket *ob;
+
+       for (ob = c->open_buckets;
+            ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
+            ob++) {
+               spin_lock(&ob->lock);
+               if (ob->valid && !ob->on_partial_list) {
+                       pr_buf(out, "%zu ref %u type %s\n",
+                              ob - c->open_buckets,
+                              atomic_read(&ob->pin),
+                              bch2_data_types[ob->type]);
+               }
+               spin_unlock(&ob->lock);
+       }
+
+}
diff --git a/libbcachefs/alloc_background.h b/libbcachefs/alloc_background.h
index 9cadfdb5b83df30211082a06851131c25d5b197b..a4f6bf56b18f6eee5266852e994f28f6d5b5f738 100644 (file)
@@ -132,4 +132,6 @@ int bch2_dev_allocator_start(struct bch_dev *);
 int bch2_alloc_write(struct bch_fs *, unsigned);
 void bch2_fs_allocator_background_init(struct bch_fs *);
 
+void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *);
+
 #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index 94273d5161f25681b1938aeb073392b11a2b85c4..8a89ab0d8e85cf4ada944bf1404d81121c0d0a89 100644 (file)
@@ -1346,6 +1346,7 @@ LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES,     struct bch_sb, flags[2],  4, 64);
 LE64_BITMASK(BCH_SB_ERASURE_CODE,      struct bch_sb, flags[3],  0, 16);
 LE64_BITMASK(BCH_SB_METADATA_TARGET,   struct bch_sb, flags[3], 16, 28);
 LE64_BITMASK(BCH_SB_SHARD_INUMS,       struct bch_sb, flags[3], 28, 29);
+LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30);
 
 /*
  * Features:
diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c
index 73bfd01f2dcbb9e5edd0fcfc2d71b3b1d7f2f9e9..1aacd271021b475433ddc5b546dc93744395b1bf 100644 (file)
@@ -233,7 +233,7 @@ wait_on_io:
                if (bch2_verify_btree_ondisk)
                        bch2_btree_node_write(c, b, SIX_LOCK_intent);
                else
-                       __bch2_btree_node_write(c, b);
+                       __bch2_btree_node_write(c, b, false);
 
                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
@@ -691,7 +691,9 @@ static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c,
         * currently fails for iterators that aren't pointed at a valid btree
         * node
         */
-       if (iter && !bch2_trans_relock(iter->trans))
+       if (iter &&
+           (!bch2_trans_relock(iter->trans) ||
+            !bch2_btree_iter_relock(iter, _THIS_IP_)))
                return ERR_PTR(-EINTR);
 
        if (!six_relock_type(&b->c.lock, lock_type, seq))
@@ -851,7 +853,9 @@ lock_node:
                 * currently fails for iterators that aren't pointed at a valid
                 * btree node
                 */
-               if (iter && !bch2_trans_relock(iter->trans))
+               if (iter &&
+                   (!bch2_trans_relock(iter->trans) ||
+                    !bch2_btree_iter_relock(iter, _THIS_IP_)))
                        return ERR_PTR(-EINTR);
 
                if (!six_relock_type(&b->c.lock, lock_type, seq))
@@ -1002,7 +1006,7 @@ wait_on_io:
        six_lock_write(&b->c.lock, NULL, NULL);
 
        if (btree_node_dirty(b)) {
-               __bch2_btree_node_write(c, b);
+               __bch2_btree_node_write(c, b, false);
                six_unlock_write(&b->c.lock);
                six_unlock_intent(&b->c.lock);
                goto wait_on_io;
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index 12894f8959bfccbe4f7291e290ab703c8f304f02..957a6a9a1559d7017d8a0bf31400786641ca7acf 100644 (file)
@@ -1566,9 +1566,47 @@ void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
 static void btree_node_write_done(struct bch_fs *c, struct btree *b)
 {
        struct btree_write *w = btree_prev_write(b);
+       unsigned long old, new, v;
 
        bch2_btree_complete_write(c, b, w);
-       bch2_btree_node_io_unlock(b);
+
+       v = READ_ONCE(b->flags);
+       do {
+               old = new = v;
+
+               if (old & (1U << BTREE_NODE_need_write))
+                       goto do_write;
+
+               new &= ~(1U << BTREE_NODE_write_in_flight);
+       } while ((v = cmpxchg(&b->flags, old, new)) != old);
+
+       wake_up_bit(&b->flags, BTREE_NODE_write_in_flight);
+       return;
+
+do_write:
+       six_lock_read(&b->c.lock, NULL, NULL);
+       v = READ_ONCE(b->flags);
+       do {
+               old = new = v;
+
+               if ((old & (1U << BTREE_NODE_dirty)) &&
+                   (old & (1U << BTREE_NODE_need_write)) &&
+                   !(old & (1U << BTREE_NODE_never_write)) &&
+                   btree_node_may_write(b)) {
+                       new &= ~(1U << BTREE_NODE_dirty);
+                       new &= ~(1U << BTREE_NODE_need_write);
+                       new |=  (1U << BTREE_NODE_write_in_flight);
+                       new |=  (1U << BTREE_NODE_just_written);
+                       new ^=  (1U << BTREE_NODE_write_idx);
+               } else {
+                       new &= ~(1U << BTREE_NODE_write_in_flight);
+               }
+       } while ((v = cmpxchg(&b->flags, old, new)) != old);
+
+       if (new & (1U << BTREE_NODE_write_in_flight))
+               __bch2_btree_node_write(c, b, true);
+
+       six_unlock_read(&b->c.lock);
 }
 
 static void bch2_btree_node_write_error(struct bch_fs *c,
@@ -1733,7 +1771,7 @@ static void btree_write_submit(struct work_struct *work)
        bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &wbio->key);
 }
 
-void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
+void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, bool already_started)
 {
        struct btree_write_bio *wbio;
        struct bset_tree *t;
@@ -1750,7 +1788,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
        bool validate_before_checksum = false;
        void *data;
 
-       BUG_ON(btree_node_write_in_flight(b));
+       if (already_started)
+               goto do_write;
 
        if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
                return;
@@ -1774,14 +1813,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
                if (old & (1 << BTREE_NODE_never_write))
                        return;
 
-               if (old & (1 << BTREE_NODE_write_in_flight)) {
-                       /*
-                        * XXX waiting on btree writes with btree locks held -
-                        * this can deadlock, and we hit the write error path
-                        */
-                       bch2_btree_node_wait_on_write(b);
-                       continue;
-               }
+               BUG_ON(old & (1 << BTREE_NODE_write_in_flight));
 
                new &= ~(1 << BTREE_NODE_dirty);
                new &= ~(1 << BTREE_NODE_need_write);
@@ -1790,6 +1822,9 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b)
                new ^=  (1 << BTREE_NODE_write_idx);
        } while (cmpxchg_acquire(&b->flags, old, new) != old);
 
+       if (new & (1U << BTREE_NODE_need_write))
+               return;
+do_write:
        atomic_dec(&c->btree_cache.dirty);
 
        BUG_ON(btree_node_fake(b));
@@ -2044,7 +2079,7 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
        if (lock_type_held == SIX_LOCK_intent ||
            (lock_type_held == SIX_LOCK_read &&
             six_lock_tryupgrade(&b->c.lock))) {
-               __bch2_btree_node_write(c, b);
+               __bch2_btree_node_write(c, b, false);
 
                /* don't cycle lock unnecessarily: */
                if (btree_node_just_written(b) &&
@@ -2056,7 +2091,7 @@ void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
                if (lock_type_held == SIX_LOCK_read)
                        six_lock_downgrade(&b->c.lock);
        } else {
-               __bch2_btree_node_write(c, b);
+               __bch2_btree_node_write(c, b, false);
                if (lock_type_held == SIX_LOCK_write &&
                    btree_node_just_written(b))
                        bch2_btree_post_write_cleanup(c, b);
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index 89fd4aba521880624e39d73faf28aaff9a9241ae..3732d135de8dd44ca6134fd55a1dde7732632c55 100644 (file)
@@ -139,7 +139,7 @@ void bch2_btree_complete_write(struct bch_fs *, struct btree *,
                              struct btree_write *);
 void bch2_btree_write_error_work(struct work_struct *);
 
-void __bch2_btree_node_write(struct bch_fs *, struct btree *);
+void __bch2_btree_node_write(struct bch_fs *, struct btree *, bool);
 bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *);
 
 void bch2_btree_node_write(struct bch_fs *, struct btree *,
@@ -148,18 +148,11 @@ void bch2_btree_node_write(struct bch_fs *, struct btree *,
 static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b,
                                            enum six_lock_type lock_held)
 {
-       while (b->written &&
-              btree_node_need_write(b) &&
-              btree_node_may_write(b)) {
-               if (!btree_node_write_in_flight(b)) {
-                       bch2_btree_node_write(c, b, lock_held);
-                       break;
-               }
-
-               six_unlock_type(&b->c.lock, lock_held);
-               bch2_btree_node_wait_on_write(b);
-               btree_node_lock_type(c, b, lock_held);
-       }
+       if (b->written &&
+           btree_node_need_write(b) &&
+           btree_node_may_write(b) &&
+           !btree_node_write_in_flight(b))
+               bch2_btree_node_write(c, b, lock_held);
 }
 
 #define bch2_btree_node_write_cond(_c, _b, cond)                       \
diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c
index 0444dbd1ac5ea20fee0f0d561098306e3da203bf..24d7422c5c2294599044f1fb90732b75309ccb85 100644 (file)
@@ -347,6 +347,7 @@ bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
 #ifdef CONFIG_BCACHEFS_DEBUG
 static void bch2_btree_iter_verify_locks(struct btree_iter *iter)
 {
+       struct bch_fs *c = iter->trans->c;
        unsigned l;
 
        if (!(iter->trans->iters_linked & (1ULL << iter->idx))) {
@@ -354,7 +355,7 @@ static void bch2_btree_iter_verify_locks(struct btree_iter *iter)
                return;
        }
 
-       for (l = 0; is_btree_node(iter, l); l++) {
+       for (l = 0; btree_iter_node(iter, l); l++) {
                if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
                    !btree_node_locked(iter, l))
                        continue;
@@ -376,7 +377,7 @@ static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {}
 #endif
 
 __flatten
-static bool bch2_btree_iter_relock(struct btree_iter *iter, unsigned long trace_ip)
+bool bch2_btree_iter_relock(struct btree_iter *iter, unsigned long trace_ip)
 {
        return btree_iter_get_locks(iter, false, trace_ip);
 }
@@ -602,6 +603,8 @@ err:
 
 static void bch2_btree_iter_verify(struct btree_iter *iter)
 {
+       struct btree_trans *trans = iter->trans;
+       struct bch_fs *c = trans->c;
        enum btree_iter_type type = btree_iter_type(iter);
        unsigned i;
 
@@ -620,10 +623,16 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
               (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) &&
               !btree_type_has_snapshots(iter->btree_id));
 
-       bch2_btree_iter_verify_locks(iter);
+       for (i = 0; i < BTREE_MAX_DEPTH; i++) {
+               if (!iter->l[i].b) {
+                       BUG_ON(c->btree_roots[iter->btree_id].b->c.level > i);
+                       break;
+               }
 
-       for (i = 0; i < BTREE_MAX_DEPTH; i++)
                bch2_btree_iter_verify_level(iter, i);
+       }
+
+       bch2_btree_iter_verify_locks(iter);
 }
 
 static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
@@ -1345,30 +1354,30 @@ static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter,
 static int btree_iter_traverse_one(struct btree_iter *iter,
                                   unsigned long trace_ip)
 {
-       unsigned depth_want = iter->level;
+       unsigned l, depth_want = iter->level;
        int ret = 0;
 
-       /*
-        * if we need interior nodes locked, call btree_iter_relock() to make
-        * sure we walk back up enough that we lock them:
-        */
-       if (iter->uptodate == BTREE_ITER_NEED_RELOCK ||
-           iter->locks_want > 1)
-               bch2_btree_iter_relock(iter, _THIS_IP_);
-
        if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
                ret = bch2_btree_iter_traverse_cached(iter);
                goto out;
        }
 
-       if (iter->uptodate < BTREE_ITER_NEED_RELOCK)
-               goto out;
-
        if (unlikely(iter->level >= BTREE_MAX_DEPTH))
                goto out;
 
        iter->level = btree_iter_up_until_good_node(iter, 0);
 
+       /* If we need intent locks, take them too: */
+       for (l = iter->level + 1;
+            l < iter->locks_want && btree_iter_node(iter, l);
+            l++)
+               if (!bch2_btree_node_relock(iter, l))
+                       while (iter->level <= l) {
+                               btree_node_unlock(iter, iter->level);
+                               iter->l[iter->level].b = BTREE_ITER_NO_NODE_UP;
+                               iter->level++;
+                       }
+
        /*
         * Note: iter->nodes[iter->level] may be temporarily NULL here - that
         * would indicate to other code that we got to the end of the btree,
@@ -1389,6 +1398,7 @@ static int btree_iter_traverse_one(struct btree_iter *iter,
                                goto out;
                        }
 
+                       __bch2_btree_iter_unlock(iter);
                        iter->level = depth_want;
 
                        if (ret == -EIO) {
diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h
index 31175cf00c0a163e39739598e9acd7939e513857..58f15b716d49128a15c4a9c24c030edda709c1b7 100644 (file)
@@ -111,6 +111,8 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *,
                              struct btree_node_iter *, struct bkey_packed *,
                              unsigned, unsigned);
 
+bool bch2_btree_iter_relock(struct btree_iter *, unsigned long);
+
 bool bch2_trans_relock(struct btree_trans *);
 void bch2_trans_unlock(struct btree_trans *);
 
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index dfaf5e6df917c0ede6cc2a8cf345833384fce9e2..7f47ef3345aae4ea02ff92bd84c268d5433aeaf0 100644 (file)
@@ -270,7 +270,9 @@ int bch2_btree_iter_traverse_cached(struct btree_iter *iter)
 
        BUG_ON(iter->level);
 
-       if (btree_node_locked(iter, 0)) {
+       iter->l[1].b = NULL;
+
+       if (bch2_btree_node_relock(iter, 0)) {
                ck = (void *) iter->l[0].b;
                goto fill;
        }
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index 6b55a41084252daaf54444297ab059da77c4babe..0b4e4056e1d974069f5f3fc4f016a04ad5f02dfe 100644 (file)
@@ -948,13 +948,6 @@ bch2_btree_update_start(struct btree_iter *iter, unsigned level,
 
        closure_init_stack(&cl);
 retry:
-       /*
-        * This check isn't necessary for correctness - it's just to potentially
-        * prevent us from doing a lot of work that'll end up being wasted:
-        */
-       ret = bch2_journal_error(&c->journal);
-       if (ret)
-               return ERR_PTR(ret);
 
        /*
         * XXX: figure out how far we might need to split,
@@ -995,6 +988,22 @@ retry:
        bch2_keylist_init(&as->new_keys, as->_new_keys);
        bch2_keylist_init(&as->parent_keys, as->inline_keys);
 
+       mutex_lock(&c->btree_interior_update_lock);
+       list_add_tail(&as->list, &c->btree_interior_update_list);
+       mutex_unlock(&c->btree_interior_update_lock);
+
+       /*
+        * We don't want to allocate if we're in an error state, that can cause
+        * deadlock on emergency shutdown due to open buckets getting stuck in
+        * the btree_reserve_cache after allocator shutdown has cleared it out.
+        * This check needs to come after adding us to the btree_interior_update
+        * list but before calling bch2_btree_reserve_get, to synchronize with
+        * __bch2_fs_read_only().
+        */
+       ret = bch2_journal_error(&c->journal);
+       if (ret)
+               goto err;
+
        ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
                                      BTREE_UPDATE_JOURNAL_RES,
                                      journal_flags|JOURNAL_RES_GET_NONBLOCK);
@@ -1046,10 +1055,6 @@ retry:
                             atomic64_read(&c->journal.seq),
                             &as->journal, NULL);
 
-       mutex_lock(&c->btree_interior_update_lock);
-       list_add_tail(&as->list, &c->btree_interior_update_list);
-       mutex_unlock(&c->btree_interior_update_lock);
-
        return as;
 err:
        bch2_btree_update_free(as);
diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c
index d5883ab7de21d899df1a28ea591d4ecbd8df78e9..a95165b8eddf97fa352a1ae577b6337b21d6402e 100644 (file)
@@ -112,7 +112,10 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
 
        bch_scnmemcpy(out, d.v->d_name,
                      bch2_dirent_name_bytes(d));
-       pr_buf(out, " -> %llu type %s", d.v->d_inum, bch2_d_types[d.v->d_type]);
+       pr_buf(out, " -> %llu type %s", d.v->d_inum,
+              d.v->d_type < DT_MAX
+              ? bch2_d_types[d.v->d_type]
+              : "(bad d_type)");
 }
 
 static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c
index 2795b37b7dd6bdd4cad1e2f8aa5b5ab237df95ef..ae55453b0582400b65c65fab1cc949c24e45c3f2 100644 (file)
@@ -1870,8 +1870,6 @@ static long bch2_dio_write_loop(struct dio_write *dio)
                         * bio_iov_iter_get_pages was only able to get <
                         * blocksize worth of pages:
                         */
-                       bio_for_each_segment_all(bv, bio, iter)
-                               put_page(bv->bv_page);
                        ret = -EFAULT;
                        goto err;
                }
@@ -1939,6 +1937,7 @@ loop:
                if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF)))
                        bio_for_each_segment_all(bv, bio, iter)
                                put_page(bv->bv_page);
+               bio->bi_vcnt = 0;
 
                if (dio->op.error) {
                        set_bit(EI_INODE_ERROR, &inode->ei_flags);
@@ -1961,6 +1960,9 @@ err:
        if (dio->free_iov)
                kfree(dio->iter.iov);
 
+       if (likely(!bio_flagged(bio, BIO_NO_PAGE_REF)))
+               bio_for_each_segment_all(bv, bio, iter)
+                       put_page(bv->bv_page);
        bio_put(bio);
 
        /* inode->i_dio_count is our ref on inode and thus bch_fs */
diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c
index 7ea1a41ac637fa02a1160b10ee929280cd36d73f..bedfd34803ce816c80c2eee62c2fcd5696956422 100644 (file)
@@ -267,11 +267,11 @@ static struct inode_walker inode_walker_init(void)
        };
 }
 
-static int walk_inode(struct btree_trans *trans,
-                     struct inode_walker *w, u64 inum)
+static int __walk_inode(struct btree_trans *trans,
+                       struct inode_walker *w, u64 inum)
 {
        if (inum != w->cur_inum) {
-               int ret = lookup_inode(trans, inum, &w->inode, &w->snapshot);
+               int ret = __lookup_inode(trans, inum, &w->inode, &w->snapshot);
 
                if (ret && ret != -ENOENT)
                        return ret;
@@ -286,6 +286,12 @@ static int walk_inode(struct btree_trans *trans,
        return 0;
 }
 
+static int walk_inode(struct btree_trans *trans,
+                     struct inode_walker *w, u64 inum)
+{
+       return lockrestart_do(trans, __walk_inode(trans, w, inum));
+}
+
 static int hash_redo_key(struct btree_trans *trans,
                         const struct bch_hash_desc desc,
                         struct bch_hash_info *hash_info,
@@ -704,210 +710,215 @@ fsck_err:
        return bch2_trans_exit(&trans) ?: ret;
 }
 
-/*
- * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
- * validate d_type
- */
-noinline_for_stack
-static int check_dirents(struct bch_fs *c)
+static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
+                       struct bch_hash_info *hash_info,
+                       struct inode_walker *w, unsigned *nr_subdirs)
 {
-       struct inode_walker w = inode_walker_init();
-       struct bch_hash_info hash_info;
-       struct btree_trans trans;
-       struct btree_iter *iter;
+       struct bch_fs *c = trans->c;
        struct bkey_s_c k;
+       struct bkey_s_c_dirent d;
+       struct bch_inode_unpacked target;
+       u32 target_snapshot;
+       bool have_target;
+       bool backpointer_exists = true;
+       u64 d_inum;
        char buf[200];
-       unsigned nr_subdirs = 0;
-       int ret = 0;
+       int ret;
 
-       bch_verbose(c, "checking dirents");
+       k = bch2_btree_iter_peek(iter);
+       if (!k.k)
+               return 1;
 
-       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
+       ret = bkey_err(k);
+       if (ret)
+               return ret;
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents,
-                                  POS(BCACHEFS_ROOT_INO, 0),
-                                  BTREE_ITER_INTENT|
-                                  BTREE_ITER_PREFETCH);
-retry:
-       while ((k = bch2_btree_iter_peek(iter)).k &&
-              !(ret = bkey_err(k))) {
-               struct bkey_s_c_dirent d;
-               struct bch_inode_unpacked target;
-               u32 target_snapshot;
-               bool have_target;
-               bool backpointer_exists = true;
-               u64 d_inum;
+       if (w->have_inode &&
+           w->cur_inum != k.k->p.inode &&
+           fsck_err_on(w->inode.bi_nlink != *nr_subdirs, c,
+                       "directory %llu with wrong i_nlink: got %u, should be %u",
+                       w->inode.bi_inum, w->inode.bi_nlink, *nr_subdirs)) {
+               w->inode.bi_nlink = *nr_subdirs;
+               ret = write_inode(trans, &w->inode, w->snapshot);
+               return ret ?: -EINTR;
+       }
 
-               if (w.have_inode &&
-                   w.cur_inum != k.k->p.inode &&
-                   fsck_err_on(w.inode.bi_nlink != nr_subdirs, c,
-                               "directory %llu with wrong i_nlink: got %u, should be %u",
-                               w.inode.bi_inum, w.inode.bi_nlink, nr_subdirs)) {
-                       w.inode.bi_nlink = nr_subdirs;
-                       ret = write_inode(&trans, &w.inode, w.snapshot);
-                       if (ret)
-                               break;
-               }
+       ret = __walk_inode(trans, w, k.k->p.inode);
+       if (ret)
+               return ret;
 
-               ret = walk_inode(&trans, &w, k.k->p.inode);
-               if (ret)
-                       break;
+       if (w->first_this_inode)
+               *nr_subdirs = 0;
 
-               if (w.first_this_inode)
-                       nr_subdirs = 0;
+       if (fsck_err_on(!w->have_inode, c,
+                       "dirent in nonexisting directory:\n%s",
+                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)) ||
+           fsck_err_on(!S_ISDIR(w->inode.bi_mode), c,
+                       "dirent in non directory inode type %u:\n%s",
+                       mode_to_type(w->inode.bi_mode),
+                       (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)))
+               return __bch2_trans_do(trans, NULL, NULL, 0,
+                               bch2_btree_delete_at(trans, iter, 0));
 
-               if (fsck_err_on(!w.have_inode, c,
-                               "dirent in nonexisting directory:\n%s",
-                               (bch2_bkey_val_to_text(&PBUF(buf), c,
-                                                      k), buf)) ||
-                   fsck_err_on(!S_ISDIR(w.inode.bi_mode), c,
-                               "dirent in non directory inode type %u:\n%s",
-                               mode_to_type(w.inode.bi_mode),
-                               (bch2_bkey_val_to_text(&PBUF(buf), c,
-                                                      k), buf))) {
-                       ret = __bch2_trans_do(&trans, NULL, NULL, 0,
-                                       bch2_btree_delete_at(&trans, iter, 0));
-                       if (ret)
-                               goto err;
-                       goto next;
-               }
+       if (!w->have_inode)
+               return 0;
 
-               if (!w.have_inode)
-                       goto next;
+       if (w->first_this_inode)
+               *hash_info = bch2_hash_info_init(c, &w->inode);
 
-               if (w.first_this_inode)
-                       hash_info = bch2_hash_info_init(c, &w.inode);
+       ret = hash_check_key(trans, bch2_dirent_hash_desc,
+                            hash_info, iter, k);
+       if (ret < 0)
+               return ret;
+       if (ret) /* dirent has been deleted */
+               return 0;
 
-               ret = hash_check_key(&trans, bch2_dirent_hash_desc,
-                                    &hash_info, iter, k);
-               if (ret > 0) {
-                       ret = 0;
-                       goto next;
-               }
-               if (ret)
-                       goto fsck_err;
+       if (k.k->type != KEY_TYPE_dirent)
+               return 0;
+
+       d = bkey_s_c_to_dirent(k);
+       d_inum = le64_to_cpu(d.v->d_inum);
 
-               if (k.k->type != KEY_TYPE_dirent)
-                       goto next;
+       ret = __lookup_inode(trans, d_inum, &target, &target_snapshot);
+       if (ret && ret != -ENOENT)
+               return ret;
 
-               d = bkey_s_c_to_dirent(k);
-               d_inum = le64_to_cpu(d.v->d_inum);
+       have_target = !ret;
+       ret = 0;
 
-               ret = lookup_inode(&trans, d_inum, &target, &target_snapshot);
-               if (ret && ret != -ENOENT)
-                       break;
+       if (fsck_err_on(!have_target, c,
+                       "dirent points to missing inode:\n%s",
+                       (bch2_bkey_val_to_text(&PBUF(buf), c,
+                                              k), buf)))
+               return remove_dirent(trans, d.k->p);
 
-               have_target = !ret;
+       if (!have_target)
+               return 0;
+
+       if (!target.bi_dir &&
+           !target.bi_dir_offset) {
+               target.bi_dir           = k.k->p.inode;
+               target.bi_dir_offset    = k.k->p.offset;
+
+               ret = __write_inode(trans, &target, target_snapshot) ?:
+                       bch2_trans_commit(trans, NULL, NULL,
+                                         BTREE_INSERT_NOFAIL|
+                                         BTREE_INSERT_LAZY_RW|
+                                         BTREE_INSERT_NOUNLOCK);
+               if (ret)
+                       return ret;
+               return -EINTR;
+       }
+
+       if (!inode_backpointer_matches(d, &target)) {
+               ret = inode_backpointer_exists(trans, &target);
+               if (ret < 0)
+                       return ret;
+
+               backpointer_exists = ret;
                ret = 0;
 
-               if (fsck_err_on(!have_target, c,
-                               "dirent points to missing inode:\n%s",
-                               (bch2_bkey_val_to_text(&PBUF(buf), c,
-                                                      k), buf))) {
-                       ret = remove_dirent(&trans, d.k->p);
-                       if (ret)
-                               goto err;
-                       goto next;
+               if (fsck_err_on(S_ISDIR(target.bi_mode) &&
+                               backpointer_exists, c,
+                               "directory %llu with multiple links",
+                               target.bi_inum))
+                       return remove_dirent(trans, d.k->p);
+
+               if (fsck_err_on(backpointer_exists &&
+                               !target.bi_nlink, c,
+                               "inode %llu has multiple links but i_nlink 0",
+                               d_inum)) {
+                       target.bi_nlink++;
+                       target.bi_flags &= ~BCH_INODE_UNLINKED;
+
+                       ret = write_inode(trans, &target, target_snapshot);
+                       return ret ?: -EINTR;
                }
 
-               if (!have_target)
-                       goto next;
-
-               if (!target.bi_dir &&
-                   !target.bi_dir_offset) {
+               if (fsck_err_on(!backpointer_exists, c,
+                               "inode %llu has wrong backpointer:\n"
+                               "got       %llu:%llu\n"
+                               "should be %llu:%llu",
+                               d_inum,
+                               target.bi_dir,
+                               target.bi_dir_offset,
+                               k.k->p.inode,
+                               k.k->p.offset)) {
                        target.bi_dir           = k.k->p.inode;
                        target.bi_dir_offset    = k.k->p.offset;
 
-                       ret = write_inode(&trans, &target, target_snapshot);
-                       if (ret)
-                               goto err;
+                       ret = write_inode(trans, &target, target_snapshot);
+                       return ret ?: -EINTR;
                }
+       }
 
-               if (!inode_backpointer_matches(d, &target)) {
-                       ret = inode_backpointer_exists(&trans, &target);
-                       if (ret < 0)
-                               goto err;
-
-                       backpointer_exists = ret;
-                       ret = 0;
+       if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c,
+                       "incorrect d_type: should be %u:\n%s",
+                       mode_to_type(target.bi_mode),
+                       (bch2_bkey_val_to_text(&PBUF(buf), c,
+                                              k), buf))) {
+               struct bkey_i_dirent *n;
 
-                       if (fsck_err_on(S_ISDIR(target.bi_mode) &&
-                                       backpointer_exists, c,
-                                       "directory %llu with multiple links",
-                                       target.bi_inum)) {
-                               ret = remove_dirent(&trans, d.k->p);
-                               if (ret)
-                                       goto err;
-                               continue;
-                       }
+               n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
+               if (!n)
+                       return -ENOMEM;
 
-                       if (fsck_err_on(backpointer_exists &&
-                                       !target.bi_nlink, c,
-                                       "inode %llu has multiple links but i_nlink 0",
-                                       d_inum)) {
-                               target.bi_nlink++;
-                               target.bi_flags &= ~BCH_INODE_UNLINKED;
+               bkey_reassemble(&n->k_i, d.s_c);
+               n->v.d_type = mode_to_type(target.bi_mode);
 
-                               ret = write_inode(&trans, &target, target_snapshot);
-                               if (ret)
-                                       goto err;
-                       }
+               ret = __bch2_trans_do(trans, NULL, NULL,
+                                     BTREE_INSERT_NOFAIL|
+                                     BTREE_INSERT_LAZY_RW,
+                       bch2_btree_iter_traverse(iter) ?:
+                       bch2_trans_update(trans, iter, &n->k_i, 0));
+               kfree(n);
+               return ret ?: -EINTR;
+       }
 
-                       if (fsck_err_on(!backpointer_exists, c,
-                                       "inode %llu has wrong backpointer:\n"
-                                       "got       %llu:%llu\n"
-                                       "should be %llu:%llu",
-                                       d_inum,
-                                       target.bi_dir,
-                                       target.bi_dir_offset,
-                                       k.k->p.inode,
-                                       k.k->p.offset)) {
-                               target.bi_dir           = k.k->p.inode;
-                               target.bi_dir_offset    = k.k->p.offset;
-
-                               ret = write_inode(&trans, &target, target_snapshot);
-                               if (ret)
-                                       goto err;
-                       }
-               }
+       *nr_subdirs += d.v->d_type == DT_DIR;
+       return 0;
+fsck_err:
+       return ret;
+}
 
-               if (fsck_err_on(d.v->d_type != mode_to_type(target.bi_mode), c,
-                               "incorrect d_type: should be %u:\n%s",
-                               mode_to_type(target.bi_mode),
-                               (bch2_bkey_val_to_text(&PBUF(buf), c,
-                                                      k), buf))) {
-                       struct bkey_i_dirent *n;
+/*
+ * Walk dirents: verify that they all have a corresponding S_ISDIR inode,
+ * validate d_type
+ */
+noinline_for_stack
+static int check_dirents(struct bch_fs *c)
+{
+       struct inode_walker w = inode_walker_init();
+       struct bch_hash_info hash_info;
+       struct btree_trans trans;
+       struct btree_iter *iter;
+       unsigned nr_subdirs = 0;
+       int ret = 0;
 
-                       n = kmalloc(bkey_bytes(d.k), GFP_KERNEL);
-                       if (!n) {
-                               ret = -ENOMEM;
-                               goto err;
-                       }
+       bch_verbose(c, "checking dirents");
 
-                       bkey_reassemble(&n->k_i, d.s_c);
-                       n->v.d_type = mode_to_type(target.bi_mode);
+       bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
-                       ret = __bch2_trans_do(&trans, NULL, NULL,
-                                             BTREE_INSERT_NOFAIL|
-                                             BTREE_INSERT_LAZY_RW,
-                               bch2_btree_iter_traverse(iter) ?:
-                               bch2_trans_update(&trans, iter, &n->k_i, 0));
-                       kfree(n);
-                       if (ret)
-                               goto err;
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_dirents,
+                                  POS(BCACHEFS_ROOT_INO, 0),
+                                  BTREE_ITER_INTENT|
+                                  BTREE_ITER_PREFETCH);
 
+       while (1) {
+               ret = lockrestart_do(&trans,
+                               check_dirent(&trans, iter, &hash_info, &w, &nr_subdirs));
+               if (ret == 1) {
+                       /* at end */
+                       ret = 0;
+                       break;
                }
+               if (ret)
+                       break;
 
-               nr_subdirs += d.v->d_type == DT_DIR;
-next:
                bch2_btree_iter_advance(iter);
        }
-err:
-fsck_err:
-       if (ret == -EINTR)
-               goto retry;
-
        bch2_trans_iter_put(&trans, iter);
+
        return bch2_trans_exit(&trans) ?: ret;
 }
 
index 59edb4cea5f1a0c57a4b542c2a3606d8949cf55c..67983ff4fb2cabb18749cd149625417bd4b103a8 100644 (file)
@@ -137,7 +137,7 @@ static void bch2_inode_pack_v2(struct bkey_inode_buf *packed,
        nr_fields++;                                                    \
                                                                        \
        if (inode->_name) {                                             \
-               ret = bch2_varint_encode(out, inode->_name);            \
+               ret = bch2_varint_encode_fast(out, inode->_name);       \
                out += ret;                                             \
                                                                        \
                if (_bits > 64)                                         \
@@ -246,13 +246,13 @@ static int bch2_inode_unpack_v2(struct bkey_s_c_inode inode,
 
 #define x(_name, _bits)                                                        \
        if (fieldnr < INODE_NR_FIELDS(inode.v)) {                       \
-               ret = bch2_varint_decode(in, end, &v[0]);               \
+               ret = bch2_varint_decode_fast(in, end, &v[0]);          \
                if (ret < 0)                                            \
                        return ret;                                     \
                in += ret;                                              \
                                                                        \
                if (_bits > 64) {                                       \
-                       ret = bch2_varint_decode(in, end, &v[1]);       \
+                       ret = bch2_varint_decode_fast(in, end, &v[1]);  \
                        if (ret < 0)                                    \
                                return ret;                             \
                        in += ret;                                      \
@@ -300,8 +300,10 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *trans,
        struct bkey_s_c k;
        int ret;
 
-       iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inum),
-                                  BTREE_ITER_CACHED|flags);
+       if (trans->c->opts.inodes_use_key_cache)
+               flags |= BTREE_ITER_CACHED;
+
+       iter = bch2_trans_get_iter(trans, BTREE_ID_inodes, POS(0, inum), flags);
        k = bch2_btree_iter_peek_slot(iter);
        ret = bkey_err(k);
        if (ret)
@@ -577,8 +579,12 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
        struct bpos end = POS(inode_nr + 1, 0);
        struct bch_inode_unpacked inode_u;
        struct bkey_s_c k;
+       unsigned iter_flags = BTREE_ITER_INTENT;
        int ret;
 
+       if (cached && c->opts.inodes_use_key_cache)
+               iter_flags |= BTREE_ITER_CACHED;
+
        bch2_trans_init(&trans, c, 0, 1024);
 
        /*
@@ -600,11 +606,8 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached)
 retry:
        bch2_trans_begin(&trans);
 
-       iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes, POS(0, inode_nr),
-                                  (cached
-                                   ? BTREE_ITER_CACHED
-                                   : BTREE_ITER_SLOTS)|
-                                  BTREE_ITER_INTENT);
+       iter = bch2_trans_get_iter(&trans, BTREE_ID_inodes,
+                                  POS(0, inode_nr), iter_flags);
        k = bch2_btree_iter_peek_slot(iter);
 
        ret = bkey_err(k);
index fd3f7cddb9abb24cf8f1e15b9b8b369f83f8062f..5de296078219fc42e749193624d4c7fca4a3f25f 100644 (file)
@@ -63,7 +63,7 @@ const char * const bch2_member_states[] = {
 
 #undef x
 
-const char * const bch2_d_types[] = {
+const char * const bch2_d_types[DT_MAX] = {
        [DT_UNKNOWN]    = "unknown",
        [DT_FIFO]       = "fifo",
        [DT_CHR]        = "chr",
index c331535b0063d6b7755f555003f50fdb9d277d99..ed505857bc9e4887a0e5c6b33893430b48387870 100644 (file)
@@ -173,6 +173,11 @@ enum opt_type {
          OPT_BOOL(),                                                   \
          BCH_SB_SHARD_INUMS,           false,                          \
          NULL,         "Shard new inode numbers by CPU id")            \
+       x(inodes_use_key_cache, u8,                                     \
+         OPT_FORMAT|OPT_MOUNT,                                         \
+         OPT_BOOL(),                                                   \
+         BCH_SB_INODES_USE_KEY_CACHE,  true,                           \
+         NULL,         "Use the btree key cache for the inodes btree") \
        x(gc_reserve_percent,           u8,                             \
          OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME,                             \
          OPT_UINT(5, 21),                                              \
index 84a7acb04d01b1915f673164ef98dc93011de5de..9b1ffbf96e14784a63527ff6128ee925564eee8a 100644 (file)
@@ -171,6 +171,7 @@ read_attribute(btree_cache);
 read_attribute(btree_key_cache);
 read_attribute(btree_transactions);
 read_attribute(stripes_heap);
+read_attribute(open_buckets);
 
 read_attribute(internal_uuid);
 
@@ -409,6 +410,11 @@ SHOW(bch2_fs)
                return out.pos - buf;
        }
 
+       if (attr == &sysfs_open_buckets) {
+               bch2_open_buckets_to_text(&out, c);
+               return out.pos - buf;
+       }
+
        if (attr == &sysfs_compression_stats) {
                bch2_compression_stats_to_text(&out, c);
                return out.pos - buf;
@@ -567,6 +573,7 @@ struct attribute *bch2_fs_internal_files[] = {
        &sysfs_btree_key_cache,
        &sysfs_btree_transactions,
        &sysfs_stripes_heap,
+       &sysfs_open_buckets,
 
        &sysfs_read_realloc_races,
        &sysfs_extent_migrate_done,
index a3d252c741c88cc0b9632b13be38735924220db2..e6a041541792676d8936fdbc44dfac95314de076 100644 (file)
@@ -1,10 +1,18 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include <linux/bitops.h>
+#include <linux/string.h>
 #include <asm/unaligned.h>
 
 #include "varint.h"
 
+/**
+ * bch2_varint_encode - encode a variable length integer
+ * @out - destination to encode to
+ * @v  - unsigned integer to encode
+ *
+ * Returns the size in bytes of the encoded integer - at most 9 bytes
+ */
 int bch2_varint_encode(u8 *out, u64 v)
 {
        unsigned bits = fls64(v|1);
@@ -13,16 +21,79 @@ int bch2_varint_encode(u8 *out, u64 v)
        if (likely(bytes < 9)) {
                v <<= bytes;
                v |= ~(~0 << (bytes - 1));
+               v = cpu_to_le64(v);
+               memcpy(out, &v, bytes);
        } else {
                *out++ = 255;
                bytes = 9;
+               put_unaligned_le64(v, out);
        }
 
-       put_unaligned_le64(v, out);
        return bytes;
 }
 
+/**
+ * bch2_varint_decode - decode a variable length integer
+ * @in - varint to decode
+ * @end        - end of buffer to decode from
+ * @out        - on success, decoded integer
+ *
+ * Returns the size in bytes of the decoded integer - or -1 on failure (would
+ * have read past the end of the buffer)
+ */
 int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out)
+{
+       unsigned bytes = likely(in < end)
+               ? ffz(*in & 255) + 1
+               : 1;
+       u64 v;
+
+       if (unlikely(in + bytes > end))
+               return -1;
+
+       if (likely(bytes < 9)) {
+               v = 0;
+               memcpy(&v, in, bytes);
+               v = le64_to_cpu(v);
+               v >>= bytes;
+       } else {
+               v = get_unaligned_le64(++in);
+       }
+
+       *out = v;
+       return bytes;
+}
+
+/**
+ * bch2_varint_encode_fast - fast version of bch2_varint_encode
+ *
+ * This version assumes it's always safe to write 8 bytes to @out, even if the
+ * encoded integer would be smaller.
+ */
+int bch2_varint_encode_fast(u8 *out, u64 v)
+{
+       unsigned bits = fls64(v|1);
+       unsigned bytes = DIV_ROUND_UP(bits, 7);
+
+       if (likely(bytes < 9)) {
+               v <<= bytes;
+               v |= ~(~0 << (bytes - 1));
+       } else {
+               *out++ = 255;
+               bytes = 9;
+       }
+
+       put_unaligned_le64(v, out);
+       return bytes;
+}
+
+/**
+ * bch2_varint_decode_fast - fast version of bch2_varint_decode
+ *
+ * This version assumes that it is safe to read up to 8 bytes past @end (we
+ * still return an error if the varint extends past @end).
+ */
+int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out)
 {
        u64 v = get_unaligned_le64(in);
        unsigned bytes = ffz(v & 255) + 1;
index 8daf813576b7b7277052899acd1c0b3d43b0f8b7..92a182fb3d7aed9fdcda1600451ca996ca5620b7 100644 (file)
@@ -5,4 +5,7 @@
 int bch2_varint_encode(u8 *, u64);
 int bch2_varint_decode(const u8 *, const u8 *, u64 *);
 
+int bch2_varint_encode_fast(u8 *, u64);
+int bch2_varint_decode_fast(const u8 *, const u8 *, u64 *);
+
 #endif /* _BCACHEFS_VARINT_H */