Update bcachefs sources to 4837f82ee1 bcachefs: Use cached iterators for alloc btree
diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c
index 7c89a6dd7f5a1302f1e95b6c3cbb31803b75a3e9..e8abc1937b5552274c846c56d86f663d06b3fbcf 100644
@@ -47,65 +47,42 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
        __gc_pos_set(c, new_pos);
 }
 
-/* range_checks - for validating min/max pos of each btree node: */
-
-struct range_checks {
-       struct range_level {
-               struct bpos     min;
-               struct bpos     max;
-       }                       l[BTREE_MAX_DEPTH];
-       unsigned                depth;
-};
-
-static void btree_node_range_checks_init(struct range_checks *r, unsigned depth)
-{
-       unsigned i;
-
-       for (i = 0; i < BTREE_MAX_DEPTH; i++)
-               r->l[i].min = r->l[i].max = POS_MIN;
-       r->depth = depth;
-}
-
-static void btree_node_range_checks(struct bch_fs *c, struct btree *b,
-                                   struct range_checks *r)
+static int bch2_gc_check_topology(struct bch_fs *c,
+                                 struct bkey_s_c k,
+                                 struct bpos *expected_start,
+                                 struct bpos expected_end,
+                                 bool is_last)
 {
-       struct range_level *l = &r->l[b->level];
-
-       struct bpos expected_min = bkey_cmp(l->min, l->max)
-               ? btree_type_successor(b->btree_id, l->max)
-               : l->max;
-
-       bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, expected_min), c,
-               "btree node has incorrect min key: %llu:%llu != %llu:%llu",
-               b->data->min_key.inode,
-               b->data->min_key.offset,
-               expected_min.inode,
-               expected_min.offset);
-
-       l->max = b->data->max_key;
+       int ret = 0;
 
-       if (b->level > r->depth) {
-               l = &r->l[b->level - 1];
+       if (k.k->type == KEY_TYPE_btree_ptr_v2) {
+               struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
 
-               bch2_fs_inconsistent_on(bkey_cmp(b->data->min_key, l->min), c,
-                       "btree node min doesn't match min of child nodes: %llu:%llu != %llu:%llu",
-                       b->data->min_key.inode,
-                       b->data->min_key.offset,
-                       l->min.inode,
-                       l->min.offset);
+               if (fsck_err_on(bkey_cmp(*expected_start, bp.v->min_key), c,
+                               "btree node with incorrect min_key: got %llu:%llu, should be %llu:%llu",
+                               bp.v->min_key.inode,
+                               bp.v->min_key.offset,
+                               expected_start->inode,
+                               expected_start->offset)) {
+                       BUG();
+               }
+       }
 
-               bch2_fs_inconsistent_on(bkey_cmp(b->data->max_key, l->max), c,
-                       "btree node max doesn't match max of child nodes: %llu:%llu != %llu:%llu",
-                       b->data->max_key.inode,
-                       b->data->max_key.offset,
-                       l->max.inode,
-                       l->max.offset);
-
-               if (bkey_cmp(b->data->max_key, POS_MAX))
-                       l->min = l->max =
-                               btree_type_successor(b->btree_id,
-                                                    b->data->max_key);
+       *expected_start = bkey_cmp(k.k->p, POS_MAX)
+               ? bkey_successor(k.k->p)
+               : k.k->p;
+
+       if (fsck_err_on(is_last &&
+                       bkey_cmp(k.k->p, expected_end), c,
+                       "btree node with incorrect max_key: got %llu:%llu, should be %llu:%llu",
+                       k.k->p.inode,
+                       k.k->p.offset,
+                       expected_end.inode,
+                       expected_end.offset)) {
+               BUG();
        }
+fsck_err:
+       return ret;
 }
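
The new bch2_gc_check_topology() replaces the range_checks machinery above with a simpler invariant: walking a parent's child pointers in key order, each child's min_key must be exactly the successor of the previous child's max_key, and the last child must end at the parent's max_key. A minimal standalone model of that invariant, with toy types and a hypothetical check_topology() (not the kernel code; the real check also guards against taking the successor of POS_MAX):

#include <stdio.h>
#include <stdbool.h>

/* Toy stand-in for struct bpos: orders like (inode, offset). */
struct pos { unsigned long long inode, offset; };

static int pos_cmp(struct pos a, struct pos b)
{
	if (a.inode != b.inode)
		return a.inode < b.inode ? -1 : 1;
	if (a.offset != b.offset)
		return a.offset < b.offset ? -1 : 1;
	return 0;
}

/* Successor in the keyspace: offset + 1, carrying into inode. */
static struct pos pos_successor(struct pos p)
{
	if (++p.offset == 0)
		p.inode++;
	return p;
}

struct child { struct pos min_key, max_key; };

/*
 * Children must tile [min, max]: each child starts exactly where the
 * previous one ended, and the last child ends at max.
 */
static bool check_topology(struct pos min, struct pos max,
			   const struct child *children, int n)
{
	struct pos expected_start = min;

	for (int i = 0; i < n; i++) {
		if (pos_cmp(children[i].min_key, expected_start))
			return false;	/* gap or overlap */
		expected_start = pos_successor(children[i].max_key);
	}

	return n && !pos_cmp(children[n - 1].max_key, max);
}

int main(void)
{
	struct child kids[] = {
		{ { 0, 0   }, { 1, 99 } },
		{ { 1, 100 }, { 5, 10 } },
	};

	/* prints 1: the two children exactly cover 0:0 through 5:10 */
	printf("%d\n", check_topology((struct pos){ 0, 0 },
				      (struct pos){ 5, 10 }, kids, 2));
	return 0;
}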
 
 /* marking of btree keys/nodes: */
@@ -187,6 +164,7 @@ fsck_err:
 static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
                              bool initial)
 {
+       struct bpos next_node_start = b->data->min_key;
        struct btree_node_iter iter;
        struct bkey unpacked;
        struct bkey_s_c k;
@@ -197,13 +175,25 @@ static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale,
        if (!btree_node_type_needs_gc(btree_node_type(b)))
                return 0;
 
-       for_each_btree_node_key_unpack(b, k, &iter,
-                                      &unpacked) {
+       bch2_btree_node_iter_init_from_start(&iter, b);
+
+       while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
                bch2_bkey_debugcheck(c, b, k);
 
                ret = bch2_gc_mark_key(c, k, max_stale, initial);
                if (ret)
                        break;
+
+               bch2_btree_node_iter_advance(&iter, b);
+
+               if (b->c.level) {
+                       ret = bch2_gc_check_topology(c, k,
+                                       &next_node_start,
+                                       b->data->max_key,
+                                       bch2_btree_node_iter_end(&iter));
+                       if (ret)
+                               break;
+               }
        }
 
        return ret;
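
Why the loop above was open-coded rather than kept as for_each_btree_node_key_unpack(): bch2_gc_check_topology() needs to know whether the key just marked was the node's last, and that is only knowable after advancing the iterator. The peek / use / advance / test-end shape, as a generic sketch with an array standing in for the node iterator:

#include <stdio.h>

int main(void)
{
	int keys[] = { 10, 20, 30 };
	int n = 3, i = 0;

	while (i < n) {
		int k = keys[i];	/* peek */

		i++;			/* advance before asking "at end?"... */

		/*
		 * ...so that end-of-iterator now answers "was k the last
		 * key?" -- the is_last argument the topology check needs.
		 */
		printf("key %d%s\n", k, i == n ? " (last)" : "");
	}
	return 0;
}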
@@ -215,7 +205,6 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
        struct btree_trans trans;
        struct btree_iter *iter;
        struct btree *b;
-       struct range_checks r;
        unsigned depth = metadata_only                  ? 1
                : expensive_debug_checks(c)             ? 0
                : !btree_node_type_needs_gc(btree_id)   ? 1
@@ -227,12 +216,8 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
 
        gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
 
-       btree_node_range_checks_init(&r, depth);
-
        __for_each_btree_node(&trans, iter, btree_id, POS_MIN,
                              0, depth, BTREE_ITER_PREFETCH, b) {
-               btree_node_range_checks(c, b, &r);
-
                bch2_verify_btree_nr_keys(b);
 
                gc_pos_set(c, gc_pos_btree_node(b));
@@ -267,18 +252,19 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id,
        if (!btree_node_fake(b))
                ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
                                       &max_stale, initial);
-       gc_pos_set(c, gc_pos_btree_root(b->btree_id));
+       gc_pos_set(c, gc_pos_btree_root(b->c.btree_id));
        mutex_unlock(&c->btree_root_lock);
 
        return ret;
 }
 
 static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
-                                        struct journal_keys *journal_keys,
-                                        unsigned target_depth)
+                                     struct journal_keys *journal_keys,
+                                     unsigned target_depth)
 {
        struct btree_and_journal_iter iter;
        struct bkey_s_c k;
+       struct bpos next_node_start = b->data->min_key;
        u8 max_stale = 0;
        int ret = 0;
 
@@ -287,28 +273,46 @@ static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b,
        while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
                bch2_bkey_debugcheck(c, b, k);
 
+               BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0);
+               BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0);
+
                ret = bch2_gc_mark_key(c, k, &max_stale, true);
                if (ret)
                        break;
 
-               if (b->level > target_depth) {
+               if (b->c.level) {
                        struct btree *child;
                        BKEY_PADDED(k) tmp;
 
                        bkey_reassemble(&tmp.k, k);
+                       k = bkey_i_to_s_c(&tmp.k);
 
-                       child = bch2_btree_node_get_noiter(c, &tmp.k,
-                                               b->btree_id, b->level - 1);
-                       ret = PTR_ERR_OR_ZERO(child);
+                       bch2_btree_and_journal_iter_advance(&iter);
+
+                       ret = bch2_gc_check_topology(c, k,
+                                       &next_node_start,
+                                       b->data->max_key,
+                                       !bch2_btree_and_journal_iter_peek(&iter).k);
                        if (ret)
                                break;
 
-                       bch2_gc_btree_init_recurse(c, child,
-                                       journal_keys, target_depth);
-                       six_unlock_read(&child->lock);
-               }
+                       if (b->c.level > target_depth) {
+                               child = bch2_btree_node_get_noiter(c, &tmp.k,
+                                                       b->c.btree_id, b->c.level - 1);
+                               ret = PTR_ERR_OR_ZERO(child);
+                               if (ret)
+                                       break;
+
+                               ret = bch2_gc_btree_init_recurse(c, child,
+                                               journal_keys, target_depth);
+                               six_unlock_read(&child->c.lock);
 
-               bch2_btree_and_journal_iter_advance(&iter);
+                               if (ret)
+                                       break;
+                       }
+               } else {
+                       bch2_btree_and_journal_iter_advance(&iter);
+               }
        }
 
        return ret;
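
One subtlety in this hunk: the peeked key points into iterator-owned storage, so it is reassembled into tmp.k (and k repointed at the copy) before bch2_btree_and_journal_iter_advance() runs. A generic illustration of the copy-before-advance rule, with a hypothetical iterator whose peek buffer goes stale on advance:

#include <stdio.h>
#include <string.h>

struct toy_iter {
	const char *const *keys;
	int n, pos;
	char scratch[16];	/* peeked key lives here */
};

static const char *toy_peek(struct toy_iter *it)
{
	if (it->pos >= it->n)
		return NULL;
	strcpy(it->scratch, it->keys[it->pos]);
	return it->scratch;	/* valid only until the next advance */
}

static void toy_advance(struct toy_iter *it)
{
	it->pos++;
	it->scratch[0] = '\0';	/* previously peeked storage is now stale */
}

int main(void)
{
	const char *keys[] = { "alpha", "beta" };
	struct toy_iter it = { keys, 2, 0, "" };
	char tmp[16];

	strcpy(tmp, toy_peek(&it));	/* copy ("reassemble") first... */
	toy_advance(&it);		/* ...then advancing is safe */
	printf("%s\n", tmp);		/* alpha */
	return 0;
}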
@@ -332,15 +336,30 @@ static int bch2_gc_btree_init(struct bch_fs *c,
        if (btree_node_fake(b))
                return 0;
 
-       six_lock_read(&b->lock);
-       if (b->level >= target_depth)
+       six_lock_read(&b->c.lock, NULL, NULL);
+       if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c,
+                       "btree root with incorrect min_key: %llu:%llu",
+                       b->data->min_key.inode,
+                       b->data->min_key.offset)) {
+               BUG();
+       }
+
+       if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c,
+                       "btree root with incorrect min_key: %llu:%llu",
+                       b->data->max_key.inode,
+                       b->data->max_key.offset)) {
+               BUG();
+       }
+
+       if (b->c.level >= target_depth)
                ret = bch2_gc_btree_init_recurse(c, b,
                                        journal_keys, target_depth);
 
        if (!ret)
                ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key),
                                       &max_stale, true);
-       six_unlock_read(&b->lock);
+fsck_err:
+       six_unlock_read(&b->c.lock);
 
        return ret;
 }
@@ -445,6 +464,7 @@ static void bch2_mark_superblocks(struct bch_fs *c)
        mutex_unlock(&c->sb_lock);
 }
 
+#if 0
 /* Also see bch2_pending_btree_node_free_insert_done() */
 static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
 {
@@ -462,6 +482,7 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
 
        mutex_unlock(&c->btree_interior_update_lock);
 }
+#endif
 
 static void bch2_mark_allocator_buckets(struct bch_fs *c)
 {
@@ -560,8 +581,10 @@ static int bch2_gc_done(struct bch_fs *c,
 #define copy_bucket_field(_f)                                          \
        if (dst->b[b].mark._f != src->b[b].mark._f) {                   \
                if (verify)                                             \
-                       fsck_err(c, "dev %u bucket %zu has wrong " #_f  \
+                       fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f  \
                                ": got %u, should be %u", i, b,         \
+                               dst->b[b].mark.gen,                     \
+                               bch2_data_types[dst->b[b].mark.data_type],\
                                dst->b[b].mark._f, src->b[b].mark._f);  \
                dst->b[b]._mark._f = src->b[b].mark._f;                 \
        }
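
copy_bucket_field() is the usual compare-report-repair macro idiom: one template instantiated per field name, with #_f stringizing the field into the fsck message (the hunk above just enriches that message with gen and data type). A self-contained sketch of the idiom using made-up fields:

#include <stdio.h>

struct mark { unsigned gen, dirty_sectors, cached_sectors; };

/*
 * Compare one field between the in-memory copy and the value gc
 * recomputed; report and repair on mismatch. #_f names the field
 * in the message, so each instantiation reports itself.
 */
#define copy_field(dst, src, _f)					\
do {									\
	if ((dst)->_f != (src)->_f) {					\
		fprintf(stderr, "wrong " #_f ": got %u, should be %u\n",\
			(dst)->_f, (src)->_f);				\
		(dst)->_f = (src)->_f;					\
	}								\
} while (0)

int main(void)
{
	struct mark dst = { 1, 100, 0 }, src = { 1, 120, 8 };

	copy_field(&dst, &src, gen);
	copy_field(&dst, &src, dirty_sectors);	/* reports + repairs */
	copy_field(&dst, &src, cached_sectors);	/* reports + repairs */
	return 0;
}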
@@ -680,8 +703,10 @@ static int bch2_gc_start(struct bch_fs *c,
 
        c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
                                         sizeof(u64), GFP_KERNEL);
-       if (!c->usage_gc)
+       if (!c->usage_gc) {
+               bch_err(c, "error allocating c->usage_gc");
                return -ENOMEM;
+       }
 
        for_each_member_device(ca, c, i) {
                BUG_ON(ca->buckets[1]);
@@ -692,19 +717,23 @@ static int bch2_gc_start(struct bch_fs *c,
                                GFP_KERNEL|__GFP_ZERO);
                if (!ca->buckets[1]) {
                        percpu_ref_put(&ca->ref);
+                       bch_err(c, "error allocating ca->buckets[gc]");
                        return -ENOMEM;
                }
 
                ca->usage[1] = alloc_percpu(struct bch_dev_usage);
                if (!ca->usage[1]) {
+                       bch_err(c, "error allocating ca->usage[gc]");
                        percpu_ref_put(&ca->ref);
                        return -ENOMEM;
                }
        }
 
        ret = bch2_ec_mem_alloc(c, true);
-       if (ret)
+       if (ret) {
+               bch_err(c, "error allocating ec gc mem");
                return ret;
+       }
 
        percpu_down_write(&c->mark_lock);
 
@@ -769,9 +798,14 @@ int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys,
        unsigned i, iter = 0;
        int ret;
 
+       lockdep_assert_held(&c->state_lock);
        trace_gc_start(c);
 
        down_write(&c->gc_lock);
+
+       /* flush interior btree updates: */
+       closure_wait_event(&c->btree_interior_update_wait,
+                          !bch2_btree_interior_updates_nr_pending(c));
 again:
        ret = bch2_gc_start(c, metadata_only);
        if (ret)
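
The added closure_wait_event() makes gc block until no interior btree updates are in flight, so the mark-and-sweep pass sees a quiescent tree topology. The generic "wait until the in-flight count drops to zero" shape, sketched here with pthreads rather than the kernel's closure primitives:

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int nr_pending;

/* Caller side: block until every in-flight update has completed. */
static void wait_for_flush(void)
{
	pthread_mutex_lock(&lock);
	while (nr_pending)
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}

/* Update side: completing the last pending update wakes the waiter. */
static void update_done(void)
{
	pthread_mutex_lock(&lock);
	if (!--nr_pending)
		pthread_cond_broadcast(&cond);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	nr_pending = 1;
	update_done();		/* last update completes... */
	wait_for_flush();	/* ...so this returns immediately */
	return 0;
}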
@@ -783,7 +817,9 @@ again:
        if (ret)
                goto out;
 
+#if 0
        bch2_mark_pending_btree_node_frees(c);
+#endif
        bch2_mark_allocator_buckets(c);
 
        c->gc_count++;
@@ -849,6 +885,76 @@ out:
        return ret;
 }
 
+/*
+ * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree
+ * node pointers currently never have cached pointers that can become stale:
+ */
+static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id id)
+{
+       struct btree_trans trans;
+       struct btree_iter *iter;
+       struct bkey_s_c k;
+       int ret;
+
+       bch2_trans_init(&trans, c, 0, 0);
+
+       for_each_btree_key(&trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH, k, ret) {
+               struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+               const struct bch_extent_ptr *ptr;
+
+               bkey_for_each_ptr(ptrs, ptr) {
+                       struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+                       struct bucket *g = PTR_BUCKET(ca, ptr, false);
+
+                       if (gen_after(g->gc_gen, ptr->gen))
+                               g->gc_gen = ptr->gen;
+
+                       if (gen_after(g->mark.gen, ptr->gen) > 32) {
+                               /* rewrite btree node */
+
+                       }
+               }
+       }
+
+       bch2_trans_exit(&trans);
+       return ret;
+}
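
Bucket generation numbers are 8-bit counters compared modulo 256, so "newer than" has to tolerate wraparound; the walk above lowers g->gc_gen toward the oldest pointer gen still referencing the bucket, and a pointer more than 32 gens behind marks the node as a rewrite candidate. A standalone demonstration of the arithmetic (this gen_after() is reconstructed from how it is used here, not copied from the tree):

#include <stdio.h>

typedef unsigned char u8;

/*
 * Assumed shape: returns how far a is ahead of b in the wrapping
 * 8-bit gen space, or 0 if a is not ahead at all.
 */
static u8 gen_after(u8 a, u8 b)
{
	u8 r = a - b;	/* wraps mod 256 */

	return r > 128 ? 0 : r;
}

int main(void)
{
	printf("%u\n", gen_after(5, 3));	/* 2: ahead by two      */
	printf("%u\n", gen_after(3, 5));	/* 0: not ahead         */
	printf("%u\n", gen_after(2, 250));	/* 8: ahead across wrap */
	return 0;
}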
+
+int bch2_gc_gens(struct bch_fs *c)
+{
+       struct bch_dev *ca;
+       unsigned i;
+       int ret = 0;
+
+       down_read(&c->state_lock);
+
+       for_each_member_device(ca, c, i) {
+               struct bucket_array *buckets = bucket_array(ca);
+               struct bucket *g;
+
+               for_each_bucket(g, buckets)
+                       g->gc_gen = g->mark.gen;
+       }
+
+       for (i = 0; i < BTREE_ID_NR; i++)
+               if (btree_node_type_needs_gc(i)) {
+                       ret = bch2_gc_btree_gens(c, i);
+                       if (ret)
+                               goto err;
+               }
+
+       for_each_member_device(ca, c, i) {
+               struct bucket_array *buckets = bucket_array(ca);
+               struct bucket *g;
+
+               for_each_bucket(g, buckets)
+                       g->oldest_gen = g->gc_gen;
+       }
+err:
+       up_read(&c->state_lock);
+       return ret;
+}
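
bch2_gc_gens() is thus a three-step scheme: seed every bucket's gc_gen with its current gen, lower gc_gen to the oldest gen of any live pointer while walking the btrees, then publish the result as oldest_gen. A toy end-to-end run of the same logic (hypothetical types):

#include <stdio.h>

typedef unsigned char u8;

static u8 gen_after(u8 a, u8 b) { u8 r = a - b; return r > 128 ? 0 : r; }

struct bucket { u8 gen, gc_gen, oldest_gen; };

int main(void)
{
	struct bucket b[2] = { { .gen = 10 }, { .gen = 7 } };
	/* live pointers found in the keyspace: (bucket, gen) pairs */
	struct { int bucket; u8 gen; } ptrs[] = { { 0, 8 }, { 0, 9 }, { 1, 7 } };
	int i;

	/* step 1: seed with each bucket's current gen */
	for (i = 0; i < 2; i++)
		b[i].gc_gen = b[i].gen;

	/* step 2: lower gc_gen to the oldest live pointer gen */
	for (i = 0; i < 3; i++) {
		struct bucket *g = &b[ptrs[i].bucket];

		if (gen_after(g->gc_gen, ptrs[i].gen))
			g->gc_gen = ptrs[i].gen;
	}

	/* step 3: publish */
	for (i = 0; i < 2; i++) {
		b[i].oldest_gen = b[i].gc_gen;
		printf("bucket %d: oldest_gen %u\n", i, b[i].oldest_gen);
	}
	return 0;	/* bucket 0: 8, bucket 1: 7 */
}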
+
 /* Btree coalescing */
 
 static void recalc_packed_keys(struct btree *b)
@@ -914,7 +1020,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
                return;
        }
 
-       as = bch2_btree_update_start(c, iter->btree_id,
+       as = bch2_btree_update_start(iter->trans, iter->btree_id,
                        btree_update_reserve_required(c, parent) + nr_old_nodes,
                        BTREE_INSERT_NOFAIL|
                        BTREE_INSERT_USE_RESERVE,
@@ -972,9 +1078,9 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
 
                        set_btree_bset_end(n1, n1->set);
 
-                       six_unlock_write(&n2->lock);
+                       six_unlock_write(&n2->c.lock);
                        bch2_btree_node_free_never_inserted(c, n2);
-                       six_unlock_intent(&n2->lock);
+                       six_unlock_intent(&n2->c.lock);
 
                        memmove(new_nodes + i - 1,
                                new_nodes + i,
@@ -985,9 +1091,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
                        n1->key.k.p = n1->data->max_key =
                                bkey_unpack_pos(n1, last);
 
-                       n2->data->min_key =
-                               btree_type_successor(iter->btree_id,
-                                                    n1->data->max_key);
+                       n2->data->min_key = bkey_successor(n1->data->max_key);
 
                        memcpy_u64s(vstruct_last(s1),
                                    s2->start, u64s);
@@ -1010,7 +1114,9 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter,
                btree_node_reset_sib_u64s(n);
 
                bch2_btree_build_aux_trees(n);
-               six_unlock_write(&n->lock);
+
+               bch2_btree_update_add_new_node(as, n);
+               six_unlock_write(&n->c.lock);
 
                bch2_btree_node_write(c, n, SIX_LOCK_intent);
        }
@@ -1053,12 +1159,12 @@ next:
 
        BUG_ON(!bch2_keylist_empty(&keylist));
 
-       BUG_ON(iter->l[old_nodes[0]->level].b != old_nodes[0]);
+       BUG_ON(iter->l[old_nodes[0]->c.level].b != old_nodes[0]);
 
        bch2_btree_iter_node_replace(iter, new_nodes[0]);
 
        for (i = 0; i < nr_new_nodes; i++)
-               bch2_open_buckets_put(c, &new_nodes[i]->ob);
+               bch2_btree_update_get_open_buckets(as, new_nodes[i]);
 
        /* Free the old nodes and update our sliding window */
        for (i = 0; i < nr_old_nodes; i++) {
@@ -1078,7 +1184,7 @@ next:
        }
 
        for (i = 0; i < nr_new_nodes; i++)
-               six_unlock_intent(&new_nodes[i]->lock);
+               six_unlock_intent(&new_nodes[i]->c.lock);
 
        bch2_btree_update_done(as);
        bch2_keylist_free(&keylist, NULL);
@@ -1119,11 +1225,11 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
 
                for (i = 1; i < GC_MERGE_NODES; i++) {
                        if (!merge[i] ||
-                           !six_relock_intent(&merge[i]->lock, lock_seq[i]))
+                           !six_relock_intent(&merge[i]->c.lock, lock_seq[i]))
                                break;
 
-                       if (merge[i]->level != merge[0]->level) {
-                               six_unlock_intent(&merge[i]->lock);
+                       if (merge[i]->c.level != merge[0]->c.level) {
+                               six_unlock_intent(&merge[i]->c.lock);
                                break;
                        }
                }
@@ -1132,11 +1238,11 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id)
                bch2_coalesce_nodes(c, iter, merge);
 
                for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) {
-                       lock_seq[i] = merge[i]->lock.state.seq;
-                       six_unlock_intent(&merge[i]->lock);
+                       lock_seq[i] = merge[i]->c.lock.state.seq;
+                       six_unlock_intent(&merge[i]->c.lock);
                }
 
-               lock_seq[0] = merge[0]->lock.state.seq;
+               lock_seq[0] = merge[0]->c.lock.state.seq;
 
                if (kthread && kthread_should_stop()) {
                        bch2_trans_exit(&trans);
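
The lock_seq[] dance lets the sliding window of mergeable nodes drop its intent locks each iteration and revalidate them cheaply: six_relock_intent() succeeds only if the lock's sequence number is unchanged, meaning nobody touched the node in between. A toy single-threaded model of relock-by-sequence (not the six-lock implementation):

#include <stdio.h>
#include <stdbool.h>

/* Toy lock: seq bumps on every modification of the protected node. */
struct toy_lock { unsigned seq; bool held; };

static unsigned toy_unlock(struct toy_lock *l)
{
	l->held = false;
	return l->seq;		/* remember this to relock later */
}

static bool toy_relock(struct toy_lock *l, unsigned seq)
{
	if (l->seq != seq)
		return false;	/* node changed under us: drop the cache */
	l->held = true;
	return true;
}

static void toy_modify(struct toy_lock *l)
{
	l->seq++;		/* any write invalidates outstanding seqs */
}

int main(void)
{
	struct toy_lock l = { 0, false };
	unsigned seq;

	seq = toy_unlock(&l);
	printf("clean relock: %d\n", toy_relock(&l, seq));	  /* 1 */

	seq = toy_unlock(&l);
	toy_modify(&l);
	printf("relock after write: %d\n", toy_relock(&l, seq)); /* 0 */
	return 0;
}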
@@ -1224,7 +1330,14 @@ static int bch2_gc_thread(void *arg)
                last = atomic_long_read(&clock->now);
                last_kick = atomic_read(&c->kick_gc);
 
+               /*
+                * Full gc is currently incompatible with btree key cache:
+                */
+#if 0
                ret = bch2_gc(c, NULL, false, false);
+#else
+               ret = bch2_gc_gens(c);
+#endif
                if (ret)
                        bch_err(c, "btree gc failed: %i", ret);