git.sesse.net Git - bcachefs-tools-debian/blobdiff - libbcachefs/btree_gc.c
Add a subcommand for resizing the journal
[bcachefs-tools-debian] / libbcachefs / btree_gc.c
index 146f2428fe04ced98b545b545518d7c14903dc42..ba4acc112ed34ef22e6ad7526381966026b277fa 100644 (file)
@@ -8,6 +8,7 @@
 #include "alloc_background.h"
 #include "alloc_foreground.h"
 #include "bkey_methods.h"
+#include "bkey_on_stack.h"
 #include "btree_locking.h"
 #include "btree_update_interior.h"
 #include "btree_io.h"
 
 static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
 {
+       preempt_disable();      /* keep the gc_pos_lock write section below non-preemptible */
        write_seqcount_begin(&c->gc_pos_lock);
        c->gc_pos = new_pos;
        write_seqcount_end(&c->gc_pos_lock);
+       preempt_enable();       /* pairs with the preempt_disable() above */
 }
 
 static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
@@ -98,7 +101,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
        int ret = 0;
 
        if (initial) {
-               BUG_ON(journal_seq_verify(c) &&
+               BUG_ON(bch2_journal_seq_verify &&
                       k.k->version.lo > journal_cur_seq(&c->journal));
 
                /* XXX change to fsck check */
@@ -109,7 +112,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k,
                        atomic64_set(&c->key_version, k.k->version.lo);
 
                if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
-                   fsck_err_on(!bch2_bkey_replicas_marked(c, k, false), c,
+                   fsck_err_on(!bch2_bkey_replicas_marked(c, k), c,
                                "superblock not marked as containing replicas (type %u)",
                                k.k->type)) {
                        ret = bch2_mark_bkey_replicas(c, k);
@@ -186,7 +189,7 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
 
                bch2_btree_node_iter_advance(&iter, b);
 
-               if (b->level) {
+               if (b->c.level) {
                        ret = bch2_gc_check_topology(c, k,
                                        &next_node_start,
                                        b->data->max_key,
@@ -206,7 +209,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
        struct btree_iter *iter;
        struct btree *b;
        unsigned depth = metadata_only                  ? 1
-               : expensive_debug_checks(c)             ? 0
+               : bch2_expensive_debug_checks           ? 0
                : !btree_node_type_needs_gc(btree_id)   ? 1
                : 0;
        u8 max_stale = 0;
@@ -233,8 +236,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
                                                BTREE_INSERT_USE_RESERVE|
                                                BTREE_INSERT_NOWAIT|
                                                BTREE_INSERT_GC_LOCK_HELD);
-                       else if (!btree_gc_rewrite_disabled(c) &&
-                                (btree_gc_always_rewrite(c) || max_stale > 16))
+                       else if (!bch2_btree_gc_rewrite_disabled &&
+                                (bch2_btree_gc_always_rewrite || max_stale > 16))
                                bch2_btree_node_rewrite(c, iter,
                                                b->data->keys.seq,
                                                BTREE_INSERT_NOWAIT|
@@ -252,7 +255,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
        if (!btree_node_fake(b))
                ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
                                       &max_stale, initial);
-       gc_pos_set(c, gc_pos_btree_root(b->btree_id));
+       gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
        mutex_unlock(&c->btree_root_lock);
 
        return ret;
@@ -280,7 +283,7 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
                if (ret)
                        break;
 
-               if (b->level) {
+               if (b->c.level) {
                        struct btree *child;
                        BKEY_PADDED(k) tmp;
 
@@ -296,16 +299,16 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
                        if (ret)
                                break;
 
-                       if (b->level > target_depth) {
+                       if (b->c.level > target_depth) {
                                child = bch2_btree_node_get_noiter(c, &tmp.k,
-                                                       b->btree_id, b->level - 1);
+                                                       b->c.btree_id, b->c.level - 1);
                                ret = PTR_ERR_OR_ZERO(child);
                                if (ret)
                                        break;
 
                                ret = bch2_gc_btree_init_recurse(c, child,
                                                journal_keys, target_depth);
-                               six_unlock_read(&child->lock);
+                               six_unlock_read(&child->c.lock);
 
                                if (ret)
                                        break;
@@ -325,7 +328,7 @@ static int bch2_gc_btree_init(struct bch_fs *c,
 {
        struct btree *b;
        unsigned target_depth = metadata_only           ? 1
-               : expensive_debug_checks(c)             ? 0
+               : bch2_expensive_debug_checks           ? 0
                : !btree_node_type_needs_gc(btree_id)   ? 1
                : 0;
        u8 max_stale = 0;
@@ -336,7 +339,7 @@ static int bch2_gc_btree_init(struct bch_fs *c,
        if (btree_node_fake(b))
                return 0;
 
-       six_lock_read(&b->lock);
+       six_lock_read(&b->c.lock, NULL, NULL);
        if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c,
                        "btree root with incorrect min_key: %llu:%llu",
                        b->data->min_key.inode,
@@ -351,7 +354,7 @@ static int bch2_gc_btree_init(struct bch_fs *c,
                BUG();
        }
 
-       if (b->level >= target_depth)
+       if (b->c.level >= target_depth)
                ret = bch2_gc_btree_init_recurse(c, b,
                                        journal_keys, target_depth);
 
@@ -359,7 +362,7 @@ static int bch2_gc_btree_init(struct bch_fs *c,
                ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
                                       &max_stale, true);
 fsck_err:
-       six_unlock_read(&b->lock);
+       six_unlock_read(&b->c.lock);
 
        return ret;
 }
@@ -433,16 +436,16 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
 
                if (offset == BCH_SB_SECTOR)
                        mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR,
-                                             BCH_DATA_SB, flags);
+                                             BCH_DATA_sb, flags);
 
                mark_metadata_sectors(c, ca, offset,
                                      offset + (1 << layout->sb_max_size_bits),
-                                     BCH_DATA_SB, flags);
+                                     BCH_DATA_sb, flags);
        }
 
        for (i = 0; i < ca->journal.nr; i++) {
                b = ca->journal.buckets[i];
-               bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL,
+               bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal,
                                          ca->mi.bucket_size,
                                          gc_phase(GC_PHASE_SB), flags);
        }
@@ -464,6 +467,7 @@ static void bch2_mark_superblocks(struct bch_fs *c)
        mutex_unlock(&c->sb_lock);
 }
 
+#if 0
 /* Also see bch2_pending_btree_node_free_insert_done() */
 static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
 {
@@ -481,6 +485,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
 
        mutex_unlock(&c->btree_interior_update_lock);
 }
+#endif
 
 static void bch2_mark_allocator_buckets(struct bch_fs *c)
 {
@@ -565,6 +570,7 @@ static int bch2_gc_done(struct bch_fs *c,
                        fsck_err(c, _msg ": got %llu, should be %llu"   \
                                , ##__VA_ARGS__, dst->_f, src->_f);     \
                dst->_f = src->_f;                                      \
+               ret = 1;                                                \
        }
 #define copy_stripe_field(_f, _msg, ...)                               \
        if (dst->_f != src->_f) {                                       \
@@ -575,14 +581,18 @@ static int bch2_gc_done(struct bch_fs *c,
                                dst->_f, src->_f);                      \
                dst->_f = src->_f;                                      \
                dst->dirty = true;                                      \
+               ret = 1;                                                \
        }
 #define copy_bucket_field(_f)                                          \
        if (dst->b[b].mark._f != src->b[b].mark._f) {                   \
                if (verify)                                             \
-                       fsck_err(c, "dev %u bucket %zu has wrong " #_f  \
+                       fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f  \
                                ": got %u, should be %u", i, b,         \
+                               dst->b[b].mark.gen,                     \
+                               bch2_data_types[dst->b[b].mark.data_type],\
                                dst->b[b].mark._f, src->b[b].mark._f);  \
                dst->b[b]._mark._f = src->b[b].mark._f;                 \
+               ret = 1;                                                \
        }
 #define copy_dev_field(_f, _msg, ...)                                  \
        copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__)
@@ -613,8 +623,11 @@ static int bch2_gc_done(struct bch_fs *c,
                                copy_stripe_field(block_sectors[i],
                                                  "block_sectors[%u]", i);
 
-                       if (dst->alive)
+                       if (dst->alive) {
+                               spin_lock(&c->ec_stripes_heap_lock);
                                bch2_stripes_heap_insert(c, dst, dst_iter.pos);
+                               spin_unlock(&c->ec_stripes_heap_lock);
+                       }
 
                        genradix_iter_advance(&dst_iter, &c->stripes[0]);
                        genradix_iter_advance(&src_iter, &c->stripes[1]);
@@ -669,8 +682,8 @@ static int bch2_gc_done(struct bch_fs *c,
                        char buf[80];
 
                        if (metadata_only &&
-                           (e->data_type == BCH_DATA_USER ||
-                            e->data_type == BCH_DATA_CACHED))
+                           (e->data_type == BCH_DATA_user ||
+                            e->data_type == BCH_DATA_cached))
                                continue;
 
                        bch2_replicas_entry_to_text(&PBUF(buf), e);
@@ -755,8 +768,8 @@ static int bch2_gc_start(struct bch_fs *c,
                        d->gen_valid = s->gen_valid;
 
                        if (metadata_only &&
-                           (s->mark.data_type == BCH_DATA_USER ||
-                            s->mark.data_type == BCH_DATA_CACHED)) {
+                           (s->mark.data_type == BCH_DATA_user ||
+                            s->mark.data_type == BCH_DATA_cached)) {
                                d->_mark = s->mark;
                                d->_mark.owned_by_allocator = 0;
                        }
@@ -794,9 +807,14 @@ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys,
        unsigned i, iter = 0;
        int ret;
 
+       lockdep_assert_held(&c->state_lock);
        trace_gc_start(c);
 
        down_write(&c->gc_lock);
+
+       /* flush interior btree updates: */
+       closure_wait_event(&c->btree_interior_update_wait,
+                          !bch2_btree_interior_updates_nr_pending(c));
 again:
        ret = bch2_gc_start(c, metadata_only);
        if (ret)
@@ -808,14 +826,16 @@ again:
        if (ret)
                goto out;
 
+#if 0
        bch2_mark_pending_btree_node_frees(c);
+#endif
        bch2_mark_allocator_buckets(c);
 
        c->gc_count++;
 out:
        if (!ret &&
            (test_bit(BCH_FS_FIXED_GENS, &c->flags) ||
-            (!iter && test_restart_gc(c)))) {
+            (!iter && bch2_test_restart_gc))) {
                /*
                 * XXX: make sure gens we fixed got saved
                 */
@@ -874,6 +894,128 @@ out:
        return ret;
 }
 
+/*
+ * Compare the pointers of @k against the generation numbers of the buckets
+ * they point into.
+ *
+ * Returns true as soon as one pointer is found for which
+ * gen_after(bucket mark.gen, ptr gen) exceeds 16; no gc_gen is modified in
+ * that case.  Otherwise, every bucket whose gc_gen is after the pointer's gen
+ * has gc_gen set to the pointer's gen, and false is returned.
+ *
+ * Both passes run with c->mark_lock held for read.
+ */
+static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k)
+{
+       struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+       const struct bch_extent_ptr *ptr;
+       bool too_stale = false;
+
+       percpu_down_read(&c->mark_lock);
+
+       /* Pass 1: is any pointer too far behind its bucket? */
+       bkey_for_each_ptr(ptrs, ptr) {
+               struct bch_dev *dev = bch_dev_bkey_exists(c, ptr->dev);
+               struct bucket *bucket = PTR_BUCKET(dev, ptr, false);
+
+               if (gen_after(bucket->mark.gen, ptr->gen) > 16) {
+                       too_stale = true;
+                       goto unlock;
+               }
+       }
+
+       /* Pass 2: pull each bucket's gc_gen back to the pointer's gen */
+       bkey_for_each_ptr(ptrs, ptr) {
+               struct bch_dev *dev = bch_dev_bkey_exists(c, ptr->dev);
+               struct bucket *bucket = PTR_BUCKET(dev, ptr, false);
+
+               if (gen_after(bucket->gc_gen, ptr->gen))
+                       bucket->gc_gen = ptr->gen;
+       }
+unlock:
+       percpu_up_read(&c->mark_lock);
+
+       return too_stale;
+}
+
+/*
+ * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree
+ * node pointers currently never have cached pointers that can become stale:
+ *
+ * Walks the keys of @btree_id starting at POS_MIN.  Each key for which
+ * gc_btree_gens_key() returns true is copied onto the stack, passed through
+ * bch2_extent_normalize(), queued with bch2_trans_update() at the copy's start
+ * position and committed.
+ *
+ * Returns 0, or the nonzero value from bkey_err() or bch2_trans_commit() that
+ * ended the walk.
+ *
+ * NOTE(review): if a commit returns -EINTR and the following peek yields no
+ * key, -EINTR is what gets returned - confirm that cannot happen.
+ */
+static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id)
+{
+       struct btree_trans trans;
+       struct btree_iter *iter;
+       struct bkey_s_c k;
+       struct bkey_on_stack sk;        /* scratch copy of the key being rewritten */
+       int ret = 0;
+
+       bkey_on_stack_init(&sk);
+       bch2_trans_init(&trans, c, 0, 0);
+
+       iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN,
+                                  BTREE_ITER_PREFETCH);
+
+       while ((k = bch2_btree_iter_peek(iter)).k &&
+              !(ret = bkey_err(k))) {
+               if (gc_btree_gens_key(c, k)) {
+                       bkey_on_stack_reassemble(&sk, c, k);
+                       bch2_extent_normalize(c, bkey_i_to_s(sk.k));    /* presumably drops the stale pointers - confirm */
+
+                       bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k));
+
+                       bch2_trans_update(&trans, iter, sk.k, 0);
+
+                       ret = bch2_trans_commit(&trans, NULL, NULL,
+                                               BTREE_INSERT_NOFAIL);
+                       if (ret == -EINTR)
+                               continue;       /* peek again without advancing the iterator */
+                       if (ret) {
+                               break;
+                       }
+               }
+
+               bch2_btree_iter_next(iter);
+       }
+
+       bch2_trans_exit(&trans);
+       bkey_on_stack_exit(&sk, c);
+
+       return ret;
+}
+
+/*
+ * Recalculate oldest_gen for the buckets of every member device.
+ *
+ * Seeds each bucket's gc_gen with its current mark.gen, runs
+ * bch2_gc_btree_gens() on every btree id for which
+ * btree_node_type_needs_gc() is true, then copies each bucket's gc_gen into
+ * oldest_gen and increments c->gc_count.
+ *
+ * Returns 0 on success, or the error from bch2_gc_btree_gens(); on error
+ * neither oldest_gen nor gc_count is updated.
+ */
+int bch2_gc_gens(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       struct bucket_array *buckets;
+       struct bucket *g;
+       unsigned i;
+       /* Must start at 0: nothing below assigns it if no btree needs gc */
+       int ret = 0;
+
+       /*
+        * Ideally we would be using state_lock and not gc_lock here, but that
+        * introduces a deadlock in the RO path - we currently take the state
+        * lock at the start of going RO, thus the gc thread may get stuck:
+        */
+       down_read(&c->gc_lock);
+
+       for_each_member_device(ca, c, i) {
+               down_read(&ca->bucket_lock);
+               buckets = bucket_array(ca);
+
+               for_each_bucket(g, buckets)
+                       g->gc_gen = g->mark.gen;
+               up_read(&ca->bucket_lock);
+       }
+
+       for (i = 0; i < BTREE_ID_NR; i++)
+               if (btree_node_type_needs_gc(i)) {
+                       ret = bch2_gc_btree_gens(c, i);
+                       if (ret) {
+                               bch_err(c, "error recalculating oldest_gen: %i", ret);
+                               goto err;
+                       }
+               }
+
+       for_each_member_device(ca, c, i) {
+               down_read(&ca->bucket_lock);
+               buckets = bucket_array(ca);
+
+               for_each_bucket(g, buckets)
+                       g->oldest_gen = g->gc_gen;
+               up_read(&ca->bucket_lock);
+       }
+
+       c->gc_count++;
+err:
+       up_read(&c->gc_lock);
+       return ret;
+}
+
 /* Btree coalescing */
 
 static void recalc_packed_keys(struct btree *b)
@@ -997,9 +1139,9 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
 
                        set_btree_bset_end(n1, n1->set);
 
-                       six_unlock_write(&n2->lock);
+                       six_unlock_write(&n2->c.lock);
                        bch2_btree_node_free_never_inserted(c, n2);
-                       six_unlock_intent(&n2->lock);
+                       six_unlock_intent(&n2->c.lock);
 
                        memmove(new_nodes + i - 1,
                                new_nodes + i,
@@ -1033,7 +1175,9 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
                btree_node_reset_sib_u64s(n);
 
                bch2_btree_build_aux_trees(n);
-               six_unlock_write(&n->lock);
+
+               bch2_btree_update_add_new_node(as, n);
+               six_unlock_write(&n->c.lock);
 
                bch2_btree_node_write(c, n, SIX_LOCK_intent);
        }
@@ -1076,12 +1220,12 @@ next:
 
        BUG_ON(!bch2_keylist_empty(&keylist));
 
-       BUG_ON(iter->l[old_nodes[0]->level].b != old_nodes[0]);
+       BUG_ON(iter->l[old_nodes[0]->c.level].b != old_nodes[0]);
 
        bch2_btree_iter_node_replace(iter, new_nodes[0]);
 
        for (i = 0; i < nr_new_nodes; i++)
-               bch2_open_buckets_put(c, &new_nodes[i]->ob);
+               bch2_btree_update_get_open_buckets(as, new_nodes[i]);
 
        /* Free the old nodes and update our sliding window */
        for (i = 0; i < nr_old_nodes; i++) {
@@ -1101,7 +1245,7 @@ next:
        }
 
        for (i = 0; i < nr_new_nodes; i++)
-               six_unlock_intent(&new_nodes[i]->lock);
+               six_unlock_intent(&new_nodes[i]->c.lock);
 
        bch2_btree_update_done(as);
        bch2_keylist_free(&keylist, NULL);
@@ -1142,11 +1286,11 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
 
                for (i = 1; i < GC_MERGE_NODES; i++) {
                        if (!merge[i] ||
-                           !six_relock_intent(&merge[i]->lock, lock_seq[i]))
+                           !six_relock_intent(&merge[i]->c.lock, lock_seq[i]))
                                break;
 
-                       if (merge[i]->level != merge[0]->level) {
-                               six_unlock_intent(&merge[i]->lock);
+                       if (merge[i]->c.level != merge[0]->c.level) {
+                               six_unlock_intent(&merge[i]->c.lock);
                                break;
                        }
                }
@@ -1155,11 +1299,11 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
                bch2_coalesce_nodes(c, iter, merge);
 
                for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) {
-                       lock_seq[i] = merge[i]->lock.state.seq;
-                       six_unlock_intent(&merge[i]->lock);
+                       lock_seq[i] = merge[i]->c.lock.state.seq;
+                       six_unlock_intent(&merge[i]->c.lock);
                }
 
-               lock_seq[0] = merge[0]->lock.state.seq;
+               lock_seq[0] = merge[0]->c.lock.state.seq;
 
                if (kthread && kthread_should_stop()) {
                        bch2_trans_exit(&trans);
@@ -1247,8 +1391,15 @@ static int bch2_gc_thread(void *arg)
                last = atomic_long_read(&clock->now);
                last_kick = atomic_read(&c->kick_gc);
 
+               /*
+                * Full gc is currently incompatible with btree key cache:
+                */
+#if 0
                ret = bch2_gc(c, NULL, false, false);
-               if (ret)
+#else
+               ret = bch2_gc_gens(c);
+#endif
+               if (ret < 0)
                        bch_err(c, "btree gc failed: %i", ret);
 
                debug_check_no_locks_held();