git.sesse.net Git - bcachefs-tools-debian/blobdiff - libbcachefs/btree_gc.c
Update bcachefs sources to 17a344f265 bcachefs: Improve fsck for subvols/snapshots
[bcachefs-tools-debian] / libbcachefs / btree_gc.c
index 5c54a0ca681cadca29d256fd0d5d97de4cd2717d..e260689ba830cc97f2c169754366baef967a4f43 100644 (file)
@@ -80,7 +80,7 @@ static int bch2_gc_check_topology(struct bch_fs *c,
                        bch2_topology_error(c);
 
                        if (bkey_deleted(&prev->k->k)) {
-                               pr_buf(&buf1, "start of node: ");
+                               prt_printf(&buf1, "start of node: ");
                                bch2_bpos_to_text(&buf1, node_start);
                        } else {
                                bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k));
@@ -214,7 +214,7 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
        }
 
        bch2_btree_node_drop_keys_outside_node(b);
-
+       bkey_copy(&b->key, &new->k_i);
        return 0;
 }
 
@@ -264,7 +264,7 @@ static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
        int ret = 0;
 
        if (!prev) {
-               pr_buf(&buf1, "start of node: ");
+               prt_printf(&buf1, "start of node: ");
                bch2_bpos_to_text(&buf1, b->data->min_key);
        } else {
                bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key));
@@ -359,7 +359,7 @@ static int bch2_btree_repair_topology_recurse(struct bch_fs *c, struct btree *b)
        struct bkey_buf prev_k, cur_k;
        struct btree *prev = NULL, *cur = NULL;
        bool have_child, dropped_children = false;
-       struct printbuf buf;
+       struct printbuf buf = PRINTBUF;
        int ret = 0;
 
        if (!b->c.level)
@@ -387,7 +387,7 @@ again:
                bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
 
                if (mustfix_fsck_err_on(ret == -EIO, c,
-                               "Unreadable btree node at btree %s level %u:\n"
+                               "Topology repair: unreadable btree node at btree %s level %u:\n"
                                "  %s",
                                bch2_btree_ids[b->c.btree_id],
                                b->c.level - 1,
@@ -395,6 +395,7 @@ again:
                        bch2_btree_node_evict(c, cur_k.k);
                        ret = bch2_journal_key_delete(c, b->c.btree_id,
                                                      b->c.level, cur_k.k->k.p);
+                       cur = NULL;
                        if (ret)
                                break;
                        continue;
@@ -413,6 +414,7 @@ again:
                        bch2_btree_node_evict(c, cur_k.k);
                        ret = bch2_journal_key_delete(c, b->c.btree_id,
                                                      b->c.level, cur_k.k->k.p);
+                       cur = NULL;
                        if (ret)
                                break;
                        continue;
@@ -562,7 +564,8 @@ static int bch2_check_fix_ptrs(struct bch_fs *c, enum btree_id btree_id,
                struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
                enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr);
 
-               if (fsck_err_on(!g->gen_valid, c,
+               if (c->opts.reconstruct_alloc ||
+                   fsck_err_on(!g->gen_valid, c,
                                "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
                                "while marking %s",
                                p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
@@ -746,13 +749,15 @@ found:
                if (level)
                        bch2_btree_node_update_key_early(c, btree_id, level - 1, *k, new);
 
-               printbuf_reset(&buf);
-               bch2_bkey_val_to_text(&buf, c, *k);
-               bch_info(c, "updated %s", buf.buf);
+               if (c->opts.verbose) {
+                       printbuf_reset(&buf);
+                       bch2_bkey_val_to_text(&buf, c, *k);
+                       bch_info(c, "updated %s", buf.buf);
 
-               printbuf_reset(&buf);
-               bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
-               bch_info(c, "new key %s", buf.buf);
+                       printbuf_reset(&buf);
+                       bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new));
+                       bch_info(c, "new key %s", buf.buf);
+               }
 
                *k = bkey_i_to_s_c(new);
        }
@@ -794,7 +799,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
                        atomic64_set(&c->key_version, k->k->version.lo);
        }
 
-       ret = __bch2_trans_do(trans, NULL, NULL, 0,
+       ret = commit_do(trans, NULL, NULL, 0,
                        bch2_mark_key(trans, old, *k, flags));
 fsck_err:
 err:
@@ -849,10 +854,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
        struct bch_fs *c = trans->c;
        struct btree_iter iter;
        struct btree *b;
-       unsigned depth = metadata_only                  ? 1
-               : bch2_expensive_debug_checks           ? 0
-               : !btree_node_type_needs_gc(btree_id)   ? 1
-               : 0;
+       unsigned depth = metadata_only ? 1 : 0;
        int ret = 0;
 
        gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0));
@@ -995,10 +997,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
 {
        struct bch_fs *c = trans->c;
        struct btree *b;
-       unsigned target_depth = metadata_only           ? 1
-               : bch2_expensive_debug_checks           ? 0
-               : !btree_node_type_needs_gc(btree_id)   ? 1
-               : 0;
+       unsigned target_depth = metadata_only ? 1 : 0;
        struct printbuf buf = PRINTBUF;
        int ret = 0;
 
@@ -1181,29 +1180,28 @@ static int bch2_gc_done(struct bch_fs *c,
 {
        struct bch_dev *ca = NULL;
        struct printbuf buf = PRINTBUF;
-       bool verify = !metadata_only && (!initial ||
-                      (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
+       bool verify = !metadata_only &&
+               !c->opts.reconstruct_alloc &&
+               (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
        unsigned i, dev;
        int ret = 0;
 
        percpu_down_write(&c->mark_lock);
 
 #define copy_field(_f, _msg, ...)                                      \
-       if (dst->_f != src->_f) {                                       \
-               if (verify)                                             \
-                       fsck_err(c, _msg ": got %llu, should be %llu"   \
-                               , ##__VA_ARGS__, dst->_f, src->_f);     \
-               dst->_f = src->_f;                                      \
-       }
+       if (dst->_f != src->_f &&                                       \
+           (!verify ||                                                 \
+            fsck_err(c, _msg ": got %llu, should be %llu"              \
+                     , ##__VA_ARGS__, dst->_f, src->_f)))              \
+               dst->_f = src->_f
 #define copy_stripe_field(_f, _msg, ...)                               \
-       if (dst->_f != src->_f) {                                       \
-               if (verify)                                             \
-                       fsck_err(c, "stripe %zu has wrong "_msg         \
-                               ": got %u, should be %u",               \
-                               iter.pos, ##__VA_ARGS__,                \
-                               dst->_f, src->_f);                      \
-               dst->_f = src->_f;                                      \
-       }
+       if (dst->_f != src->_f &&                                       \
+           (!verify ||                                                 \
+            fsck_err(c, "stripe %zu has wrong "_msg                    \
+                     ": got %u, should be %u",                         \
+                     iter.pos, ##__VA_ARGS__,                          \
+                     dst->_f, src->_f)))                               \
+               dst->_f = src->_f
 #define copy_dev_field(_f, _msg, ...)                                  \
        copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
 #define copy_fs_field(_f, _msg, ...)                                   \
@@ -1219,7 +1217,6 @@ static int bch2_gc_done(struct bch_fs *c,
                                             dev_usage_u64s());
 
                copy_dev_field(buckets_ec,              "buckets_ec");
-               copy_dev_field(buckets_unavailable,     "buckets_unavailable");
 
                for (i = 0; i < BCH_DATA_NR; i++) {
                        copy_dev_field(d[i].buckets,    "%s buckets", bch2_data_types[i]);
@@ -1304,21 +1301,38 @@ static int bch2_gc_start(struct bch_fs *c,
                        percpu_ref_put(&ca->ref);
                        return -ENOMEM;
                }
+
+               this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets,
+                              ca->mi.nbuckets - ca->mi.first_bucket);
        }
 
        return 0;
 }
 
+/* returns true if not equal */
+static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l,
+                                    struct bch_alloc_v4 r)
+{
+       return  l.gen != r.gen                          ||
+               l.oldest_gen != r.oldest_gen            ||
+               l.data_type != r.data_type              ||
+               l.dirty_sectors != r.dirty_sectors      ||
+               l.cached_sectors != r.cached_sectors    ||
+               l.stripe_redundancy != r.stripe_redundancy ||
+               l.stripe != r.stripe;
+}
+
 static int bch2_alloc_write_key(struct btree_trans *trans,
                                struct btree_iter *iter,
                                bool metadata_only)
 {
        struct bch_fs *c = trans->c;
        struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
-       struct bucket gc;
+       struct bucket gc, *b;
        struct bkey_s_c k;
-       struct bkey_alloc_unpacked old_u, new_u;
-       struct bkey_alloc_buf *a;
+       struct bkey_i_alloc_v4 *a;
+       struct bch_alloc_v4 old, new;
+       enum bch_data_type type;
        int ret;
 
        k = bch2_btree_iter_peek_slot(iter);
@@ -1326,10 +1340,33 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
        if (ret)
                return ret;
 
-       old_u = new_u = bch2_alloc_unpack(k);
+       bch2_alloc_to_v4(k, &old);
+       new = old;
 
        percpu_down_read(&c->mark_lock);
-       gc = *gc_bucket(ca, iter->pos.offset);
+       b = gc_bucket(ca, iter->pos.offset);
+
+       /*
+        * b->data_type doesn't yet include need_discard & need_gc_gen states -
+        * fix that here:
+        */
+       type = __alloc_data_type(b->dirty_sectors,
+                                b->cached_sectors,
+                                b->stripe,
+                                old,
+                                b->data_type);
+       if (b->data_type != type) {
+               struct bch_dev_usage *u;
+
+               preempt_disable();
+               u = this_cpu_ptr(ca->usage_gc);
+               u->d[b->data_type].buckets--;
+               b->data_type = type;
+               u->d[b->data_type].buckets++;
+               preempt_enable();
+       }
+
+       gc = *b;
        percpu_up_read(&c->mark_lock);
 
        if (metadata_only &&
@@ -1338,36 +1375,46 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
            gc.data_type != BCH_DATA_btree)
                return 0;
 
-       if (gen_after(old_u.gen, gc.gen))
+       if (gen_after(old.gen, gc.gen))
                return 0;
 
 #define copy_bucket_field(_f)                                          \
-       if (fsck_err_on(new_u._f != gc._f, c,                           \
+       if (c->opts.reconstruct_alloc ||                                \
+           fsck_err_on(new._f != gc._f, c,                             \
                        "bucket %llu:%llu gen %u data type %s has wrong " #_f   \
                        ": got %u, should be %u",                       \
                        iter->pos.inode, iter->pos.offset,              \
                        gc.gen,                                         \
                        bch2_data_types[gc.data_type],                  \
-                       new_u._f, gc._f))                               \
-               new_u._f = gc._f;                                       \
+                       new._f, gc._f))                                 \
+               new._f = gc._f;                                         \
 
        copy_bucket_field(gen);
        copy_bucket_field(data_type);
-       copy_bucket_field(stripe);
        copy_bucket_field(dirty_sectors);
        copy_bucket_field(cached_sectors);
        copy_bucket_field(stripe_redundancy);
        copy_bucket_field(stripe);
 #undef copy_bucket_field
 
-       if (!bkey_alloc_unpacked_cmp(old_u, new_u))
+       if (!bch2_alloc_v4_cmp(old, new))
                return 0;
 
-       a = bch2_alloc_pack(trans, new_u);
-       if (IS_ERR(a))
-               return PTR_ERR(a);
+       a = bch2_alloc_to_v4_mut(trans, k);
+       ret = PTR_ERR_OR_ZERO(a);
+       if (ret)
+               return ret;
+
+       a->v = new;
+
+       /*
+        * The trigger normally makes sure this is set, but we're not running
+        * triggers:
+        */
+       if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ])
+               a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
 
-       ret = bch2_trans_update(trans, iter, &a->k, 0);
+       ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN);
 fsck_err:
        return ret;
 }
@@ -1391,7 +1438,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
                        if (bkey_cmp(iter.pos, POS(ca->dev_idx, ca->mi.nbuckets)) >= 0)
                                break;
 
-                       ret = __bch2_trans_do(&trans, NULL, NULL,
+                       ret = commit_do(&trans, NULL, NULL,
                                              BTREE_INSERT_LAZY_RW,
                                        bch2_alloc_write_key(&trans, &iter,
                                                             metadata_only));
@@ -1418,7 +1465,7 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
        struct btree_iter iter;
        struct bkey_s_c k;
        struct bucket *g;
-       struct bkey_alloc_unpacked u;
+       struct bch_alloc_v4 a;
        unsigned i;
        int ret;
 
@@ -1443,20 +1490,21 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
                           BTREE_ITER_PREFETCH, k, ret) {
                ca = bch_dev_bkey_exists(c, k.k->p.inode);
                g = gc_bucket(ca, k.k->p.offset);
-               u = bch2_alloc_unpack(k);
+
+               bch2_alloc_to_v4(k, &a);
 
                g->gen_valid    = 1;
-               g->gen          = u.gen;
+               g->gen          = a.gen;
 
                if (metadata_only &&
-                   (u.data_type == BCH_DATA_user ||
-                    u.data_type == BCH_DATA_cached ||
-                    u.data_type == BCH_DATA_parity)) {
-                       g->data_type            = u.data_type;
-                       g->dirty_sectors        = u.dirty_sectors;
-                       g->cached_sectors       = u.cached_sectors;
-                       g->stripe               = u.stripe;
-                       g->stripe_redundancy    = u.stripe_redundancy;
+                   (a.data_type == BCH_DATA_user ||
+                    a.data_type == BCH_DATA_cached ||
+                    a.data_type == BCH_DATA_parity)) {
+                       g->data_type            = a.data_type;
+                       g->dirty_sectors        = a.dirty_sectors;
+                       g->cached_sectors       = a.cached_sectors;
+                       g->stripe               = a.stripe;
+                       g->stripe_redundancy    = a.stripe_redundancy;
                }
        }
        bch2_trans_iter_exit(&trans, &iter);
@@ -1484,6 +1532,7 @@ static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only)
                             g->data_type == BCH_DATA_cached ||
                             g->data_type == BCH_DATA_parity))
                                continue;
+                       g->data_type = 0;
                        g->dirty_sectors = 0;
                        g->cached_sectors = 0;
                }
@@ -1543,7 +1592,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
                        else
                                *bkey_refcount(new) = cpu_to_le64(r->refcount);
 
-                       ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+                       ret = commit_do(&trans, NULL, NULL, 0,
                                __bch2_btree_insert(&trans, BTREE_ID_reflink, new));
                        kfree(new);
 
@@ -1656,7 +1705,7 @@ inconsistent:
                        for (i = 0; i < new->v.nr_blocks; i++)
                                stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
 
-                       ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+                       ret = commit_do(&trans, NULL, NULL, 0,
                                __bch2_btree_insert(&trans, BTREE_ID_reflink, &new->k_i));
                        kfree(new);
                }
@@ -1695,18 +1744,14 @@ static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
  */
 int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only)
 {
-       u64 start_time = local_clock();
        unsigned iter = 0;
        int ret;
 
        lockdep_assert_held(&c->state_lock);
-       trace_gc_start(c);
 
        down_write(&c->gc_lock);
 
-       /* flush interior btree updates: */
-       closure_wait_event(&c->btree_interior_update_wait,
-                          !bch2_btree_interior_updates_nr_pending(c));
+       bch2_btree_interior_updates_flush(c);
 
        ret   = bch2_gc_start(c, metadata_only) ?:
                bch2_gc_alloc_start(c, metadata_only) ?:
@@ -1721,11 +1766,11 @@ again:
        if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) &&
            !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) &&
            c->opts.fix_errors != FSCK_OPT_NO) {
-               bch_info(c, "starting topology repair pass");
+               bch_info(c, "Starting topology repair pass");
                ret = bch2_repair_topology(c);
                if (ret)
                        goto out;
-               bch_info(c, "topology repair pass done");
+               bch_info(c, "Topology repair pass done");
 
                set_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags);
        }
@@ -1736,6 +1781,7 @@ again:
            !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags) &&
            !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) {
                set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+               SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, true);
                ret = 0;
        }
 
@@ -1794,9 +1840,6 @@ out:
 
        up_write(&c->gc_lock);
 
-       trace_gc_end(c);
-       bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
-
        /*
         * At startup, allocations can happen directly instead of via the
         * allocator thread - issue wakeup in case they blocked on gc_lock:
@@ -1890,7 +1933,8 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i
 {
        struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode);
        struct bkey_s_c k;
-       struct bkey_alloc_unpacked u;
+       struct bch_alloc_v4 a;
+       struct bkey_i_alloc_v4 *a_mut;
        int ret;
 
        k = bch2_btree_iter_peek_slot(iter);
@@ -1898,14 +1942,20 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i
        if (ret)
                return ret;
 
-       u = bch2_alloc_unpack(k);
+       bch2_alloc_to_v4(k, &a);
 
-       if (u.oldest_gen == ca->oldest_gen[iter->pos.offset])
+       if (a.oldest_gen == ca->oldest_gen[iter->pos.offset])
                return 0;
 
-       u.oldest_gen = ca->oldest_gen[iter->pos.offset];
+       a_mut = bch2_alloc_to_v4_mut(trans, k);
+       ret = PTR_ERR_OR_ZERO(a_mut);
+       if (ret)
+               return ret;
+
+       a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset];
+       a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type);
 
-       return bch2_alloc_write(trans, iter, &u, 0);
+       return bch2_trans_update(trans, iter, &a_mut->k_i, 0);
 }
 
 int bch2_gc_gens(struct bch_fs *c)
@@ -1926,6 +1976,7 @@ int bch2_gc_gens(struct bch_fs *c)
        if (!mutex_trylock(&c->gc_gens_lock))
                return 0;
 
+       trace_gc_gens_start(c);
        down_read(&c->gc_lock);
        bch2_trans_init(&trans, c, 0, 0);
 
@@ -1961,7 +2012,7 @@ int bch2_gc_gens(struct bch_fs *c)
 
        for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
                           BTREE_ITER_PREFETCH, k, ret) {
-               ret = __bch2_trans_do(&trans, NULL, NULL,
+               ret = commit_do(&trans, NULL, NULL,
                                      BTREE_INSERT_NOFAIL,
                                bch2_alloc_write_oldest_gen(&trans, &iter));
                if (ret) {
@@ -1977,6 +2028,7 @@ int bch2_gc_gens(struct bch_fs *c)
        c->gc_count++;
 
        bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
+       trace_gc_gens_end(c);
 err:
        for_each_member_device(ca, c, i) {
                kvfree(ca->oldest_gen);