From 63c8f14756921c1d1d6a99082a679b92aef288c1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 28 Aug 2023 15:20:58 -0400 Subject: [PATCH] Update bcachefs sources to e7f6215768 bcachefs: Fix snapshot_skiplist_good() --- .bcachefs_revision | 2 +- libbcachefs/alloc_background.c | 96 ++-- libbcachefs/bcachefs_format.h | 1 + libbcachefs/btree_trans_commit.c | 40 +- libbcachefs/btree_update.c | 53 +- libbcachefs/btree_write_buffer.c | 9 +- libbcachefs/fs.c | 8 +- libbcachefs/fsck.c | 36 +- libbcachefs/move.c | 44 +- libbcachefs/move.h | 1 - libbcachefs/snapshot.c | 872 +++++++++++++++++++++++++++++-- libbcachefs/snapshot.h | 22 + libbcachefs/subvolume.c | 452 ---------------- libbcachefs/super.c | 2 + libbcachefs/sysfs.c | 5 - 15 files changed, 971 insertions(+), 672 deletions(-) diff --git a/.bcachefs_revision b/.bcachefs_revision index e91ab63..57c74ea 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -bed61fae3bd3429686d33b87c42ed4f98b14a648 +e7f62157681d96386dc500609149b9685358a2b0 diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 7bf2a50..069d98a 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -241,7 +241,6 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, struct printbuf *err) { struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); - int rw = flags & WRITE; if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) { prt_printf(err, "bad val size (%u > %lu)", @@ -255,71 +254,50 @@ int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, return -BCH_ERR_invalid_bkey; } - if (rw == WRITE && - !(flags & BKEY_INVALID_JOURNAL) && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_btree_backpointers) { - unsigned i, bp_len = 0; - - for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) - bp_len += alloc_v4_backpointers_c(a.v)[i].bucket_len; + if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) { + prt_printf(err, "invalid data type (got %u should be %u)", + a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); + return -BCH_ERR_invalid_bkey; + } - if (bp_len > a.v->dirty_sectors) { - prt_printf(err, "too many backpointers"); + switch (a.v->data_type) { + case BCH_DATA_free: + case BCH_DATA_need_gc_gens: + case BCH_DATA_need_discard: + if (a.v->dirty_sectors || + a.v->cached_sectors || + a.v->stripe) { + prt_printf(err, "empty data type free but have data"); return -BCH_ERR_invalid_bkey; } - } - - if (rw == WRITE) { - if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) { - prt_printf(err, "invalid data type (got %u should be %u)", - a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); + break; + case BCH_DATA_sb: + case BCH_DATA_journal: + case BCH_DATA_btree: + case BCH_DATA_user: + case BCH_DATA_parity: + if (!a.v->dirty_sectors) { + prt_printf(err, "data_type %s but dirty_sectors==0", + bch2_data_types[a.v->data_type]); + return -BCH_ERR_invalid_bkey; + } + break; + case BCH_DATA_cached: + if (!a.v->cached_sectors || + a.v->dirty_sectors || + a.v->stripe) { + prt_printf(err, "data type inconsistency"); return -BCH_ERR_invalid_bkey; } - switch (a.v->data_type) { - case BCH_DATA_free: - case BCH_DATA_need_gc_gens: - case BCH_DATA_need_discard: - if (a.v->dirty_sectors || - a.v->cached_sectors || - a.v->stripe) { - prt_printf(err, "empty data type free but have data"); - return -BCH_ERR_invalid_bkey; - } - break; - case BCH_DATA_sb: - case BCH_DATA_journal: - case BCH_DATA_btree: - case BCH_DATA_user: - case BCH_DATA_parity: - if (!a.v->dirty_sectors) { - 
prt_printf(err, "data_type %s but dirty_sectors==0", - bch2_data_types[a.v->data_type]); - return -BCH_ERR_invalid_bkey; - } - break; - case BCH_DATA_cached: - if (!a.v->cached_sectors || - a.v->dirty_sectors || - a.v->stripe) { - prt_printf(err, "data type inconsistency"); - return -BCH_ERR_invalid_bkey; - } - - if (!a.v->io_time[READ] && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs) { - prt_printf(err, "cached bucket with read_time == 0"); - return -BCH_ERR_invalid_bkey; - } - break; - case BCH_DATA_stripe: - if (!a.v->stripe) { - prt_printf(err, "data_type %s but stripe==0", - bch2_data_types[a.v->data_type]); - return -BCH_ERR_invalid_bkey; - } - break; + if (!a.v->io_time[READ] && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs) { + prt_printf(err, "cached bucket with read_time == 0"); + return -BCH_ERR_invalid_bkey; } + break; + case BCH_DATA_stripe: + break; } return 0; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 20e96da..f17238b 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -1150,6 +1150,7 @@ struct bch_snapshot { __le32 parent; __le32 children[2]; __le32 subvol; + /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */ __le32 tree; __le32 depth; __le32 skip[3]; diff --git a/libbcachefs/btree_trans_commit.c b/libbcachefs/btree_trans_commit.c index 83cc7f6..eafb038 100644 --- a/libbcachefs/btree_trans_commit.c +++ b/libbcachefs/btree_trans_commit.c @@ -97,6 +97,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); EBUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(trans->c, b)); + EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos)); k = bch2_btree_node_iter_peek_all(node_iter, b); if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) @@ -800,7 +801,6 @@ static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, un bch2_inconsistent_error(c); bch2_dump_trans_updates(trans); - printbuf_exit(err); return -EINVAL; } @@ -817,25 +817,6 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags struct btree_insert_entry *i; int ret = 0, u64s_delta = 0; -#ifdef CONFIG_BCACHEFS_DEBUG - trans_for_each_update(trans, i) { - struct printbuf buf = PRINTBUF; - enum bkey_invalid_flags invalid_flags = 0; - - if (!(flags & BTREE_INSERT_JOURNAL_REPLAY)) - invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; - - if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->bkey_type, invalid_flags, &buf))) - ret = bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); - btree_insert_entry_checks(trans, i); - printbuf_exit(&buf); - - if (ret) - return ret; - } -#endif - trans_for_each_update(trans, i) { if (i->cached) continue; @@ -1048,6 +1029,25 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (ret) goto out_reset; +#ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) { + struct printbuf buf = PRINTBUF; + enum bkey_invalid_flags invalid_flags = 0; + + if (!(flags & BTREE_INSERT_JOURNAL_REPLAY)) + invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; + + if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), + i->bkey_type, invalid_flags, &buf))) + ret = bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); + btree_insert_entry_checks(trans, i); + printbuf_exit(&buf); + + if (ret) + return ret; + } +#endif + if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { ret = do_bch2_trans_commit_to_journal_replay(trans); goto out_reset; 
diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c index a7fa207..880ce74 100644 --- a/libbcachefs/btree_update.c +++ b/libbcachefs/btree_update.c @@ -28,51 +28,6 @@ bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, struct bkey_i *, enum btree_update_flags, unsigned long ip); -static noinline int __check_pos_snapshot_overwritten(struct btree_trans *trans, - enum btree_id id, - struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - bch2_trans_iter_init(trans, &iter, id, pos, - BTREE_ITER_NOT_EXTENTS| - BTREE_ITER_ALL_SNAPSHOTS); - while (1) { - k = bch2_btree_iter_prev(&iter); - ret = bkey_err(k); - if (ret) - break; - - if (!k.k) - break; - - if (!bkey_eq(pos, k.k->p)) - break; - - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { - ret = 1; - break; - } - } - bch2_trans_iter_exit(trans, &iter); - - return ret; -} - -static inline int check_pos_snapshot_overwritten(struct btree_trans *trans, - enum btree_id id, - struct bpos pos) -{ - if (!btree_type_has_snapshots(id) || - bch2_snapshot_is_leaf(trans->c, pos.snapshot)) - return 0; - - return __check_pos_snapshot_overwritten(trans, id, pos); -} - static noinline int extent_front_merge(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -91,8 +46,8 @@ static noinline int extent_front_merge(struct btree_trans *trans, if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) return 0; - ret = check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p) ?: - check_pos_snapshot_overwritten(trans, iter->btree_id, (*insert)->k.p); + ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p) ?: + bch2_key_has_snapshot_overwrites(trans, iter->btree_id, (*insert)->k.p); if (ret < 0) return ret; if (ret) @@ -114,8 +69,8 @@ static noinline int extent_back_merge(struct btree_trans *trans, struct bch_fs *c = trans->c; int ret; - ret = check_pos_snapshot_overwritten(trans, iter->btree_id, insert->k.p) ?: - check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p); + ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?: + bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p); if (ret < 0) return ret; if (ret) diff --git a/libbcachefs/btree_write_buffer.c b/libbcachefs/btree_write_buffer.c index 5f96db5..6d2d43b 100644 --- a/libbcachefs/btree_write_buffer.c +++ b/libbcachefs/btree_write_buffer.c @@ -75,7 +75,8 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans, } return 0; trans_commit: - return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k, 0) ?: + return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_trans_commit(trans, NULL, NULL, commit_flags| BTREE_INSERT_NOCHECK_RW| @@ -124,7 +125,8 @@ btree_write_buffered_insert(struct btree_trans *trans, BTREE_ITER_CACHED|BTREE_ITER_INTENT); ret = bch2_btree_iter_traverse(&iter) ?: - bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k, 0); + bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -193,7 +195,8 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f if (!iter.path || iter.path->btree_id != i->btree) { bch2_trans_iter_exit(trans, &iter); - bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, BTREE_ITER_INTENT); + bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, + 
BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS); } bch2_btree_iter_set_pos(&iter, i->k.k.p); diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 8958957..80dcda4 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -1906,7 +1906,10 @@ out: return dget(sb->s_root); err_put_super: + sb->s_fs_info = NULL; + c->vfs_sb = NULL; deactivate_locked_super(sb); + bch2_fs_stop(c); return ERR_PTR(bch2_err_class(ret)); } @@ -1914,8 +1917,11 @@ static void bch2_kill_sb(struct super_block *sb) { struct bch_fs *c = sb->s_fs_info; + if (c) + c->vfs_sb = NULL; generic_shutdown_super(sb); - bch2_fs_free(c); + if (c) + bch2_fs_free(c); } static struct file_system_type bcache_fs_type = { diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 960280b..57b3dfa 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -853,14 +853,6 @@ static int check_inode(struct btree_trans *trans, if (ret) goto err; - /* - * if snapshot id isn't a leaf node, skip it - deletion in - * particular is not atomic, so on the internal snapshot nodes - * we can see inodes marked for deletion after a clean shutdown - */ - if (bch2_snapshot_is_internal_node(c, k.k->p.snapshot)) - return 0; - if (!bkey_is_inode(k.k)) return 0; @@ -882,6 +874,27 @@ static int check_inode(struct btree_trans *trans, return -EINVAL; } + if ((u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED)) && + bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) { + struct bpos new_min_pos; + + ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos); + if (ret) + goto err; + + u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED; + + ret = __write_inode(trans, &u, iter->pos.snapshot); + if (ret) { + bch_err_msg(c, ret, "in fsck: error updating inode"); + return ret; + } + + if (!bpos_eq(new_min_pos, POS_MIN)) + bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos)); + return 0; + } + if (u.bi_flags & BCH_INODE_UNLINKED && (!c->sb.clean || fsck_err(c, "filesystem marked clean, but inode %llu unlinked", @@ -960,9 +973,10 @@ static int check_inode(struct btree_trans *trans, if (do_update) { ret = __write_inode(trans, &u, iter->pos.snapshot); - if (ret) - bch_err(c, "error in fsck: error updating inode: %s", - bch2_err_str(ret)); + if (ret) { + bch_err_msg(c, ret, "in fsck: error updating inode"); + return ret; + } } err: fsck_err: diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 0527267..fb76a1d 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -1103,46 +1103,40 @@ int bch2_data_job(struct bch_fs *c, return ret; } -void bch2_data_jobs_to_text(struct printbuf *out, struct bch_fs *c) -{ - struct bch_move_stats *stats; - - mutex_lock(&c->data_progress_lock); - list_for_each_entry(stats, &c->data_progress_list, list) { - prt_printf(out, "%s: data type %s btree_id %s position: ", - stats->name, - bch2_data_types[stats->data_type], - bch2_btree_ids[stats->btree_id]); - bch2_bpos_to_text(out, stats->pos); - prt_printf(out, "%s", "\n"); - } - mutex_unlock(&c->data_progress_lock); -} - -static void bch2_moving_ctxt_to_text(struct printbuf *out, struct moving_context *ctxt) +static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) { + struct bch_move_stats *stats = ctxt->stats; struct moving_io *io; - prt_printf(out, "%ps:", ctxt->fn); + prt_printf(out, "%s (%ps):", stats->name, ctxt->fn); + prt_newline(out); + + prt_printf(out, " data type %s btree_id %s position: ", + bch2_data_types[stats->data_type], + bch2_btree_ids[stats->btree_id]); + 
bch2_bpos_to_text(out, stats->pos); prt_newline(out); printbuf_indent_add(out, 2); - prt_printf(out, "reads: %u sectors %u", + prt_printf(out, "reads: ios %u/%u sectors %u/%u", atomic_read(&ctxt->read_ios), - atomic_read(&ctxt->read_sectors)); + c->opts.move_ios_in_flight, + atomic_read(&ctxt->read_sectors), + c->opts.move_bytes_in_flight >> 9); prt_newline(out); - prt_printf(out, "writes: %u sectors %u", + prt_printf(out, "writes: ios %u/%u sectors %u/%u", atomic_read(&ctxt->write_ios), - atomic_read(&ctxt->write_sectors)); + c->opts.move_ios_in_flight, + atomic_read(&ctxt->write_sectors), + c->opts.move_bytes_in_flight >> 9); prt_newline(out); printbuf_indent_add(out, 2); mutex_lock(&ctxt->lock); - list_for_each_entry(io, &ctxt->ios, io_list) { + list_for_each_entry(io, &ctxt->ios, io_list) bch2_write_op_to_text(out, &io->write.op); - } mutex_unlock(&ctxt->lock); printbuf_indent_sub(out, 4); @@ -1154,7 +1148,7 @@ void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c) mutex_lock(&c->moving_context_lock); list_for_each_entry(ctxt, &c->moving_context_list, list) - bch2_moving_ctxt_to_text(out, ctxt); + bch2_moving_ctxt_to_text(out, c, ctxt); mutex_unlock(&c->moving_context_lock); } diff --git a/libbcachefs/move.h b/libbcachefs/move.h index 547ee7b..c3136ab 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -88,7 +88,6 @@ int bch2_data_job(struct bch_fs *, struct bch_ioctl_data); void bch2_move_stats_init(struct bch_move_stats *stats, char *name); -void bch2_data_jobs_to_text(struct printbuf *, struct bch_fs *); void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *); void bch2_fs_move_init(struct bch_fs *); diff --git a/libbcachefs/snapshot.c b/libbcachefs/snapshot.c index 8242591..03ae280 100644 --- a/libbcachefs/snapshot.c +++ b/libbcachefs/snapshot.c @@ -1,8 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "btree_key_cache.h" #include "btree_update.h" +#include "buckets.h" #include "errcode.h" #include "error.h" #include "fs.h" @@ -88,6 +90,20 @@ static int bch2_snapshot_tree_create(struct btree_trans *trans, /* Snapshot nodes: */ +static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) +{ + struct snapshot_table *t; + + rcu_read_lock(); + t = rcu_dereference(c->snapshots); + + while (id && id < ancestor) + id = __snapshot_t(t, id)->parent; + rcu_read_unlock(); + + return id == ancestor; +} + static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) { const struct snapshot_t *s = __snapshot_t(t, id); @@ -114,26 +130,17 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) while (id && id < ancestor - IS_ANCESTOR_BITMAP) id = get_ancestor_below(t, id, ancestor); - ret = id && id < ancestor - ? 
test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor) - : id == ancestor; - rcu_read_unlock(); + if (id && id < ancestor) { + ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor); - return ret; -} - -static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) -{ - struct snapshot_table *t; - - rcu_read_lock(); - t = rcu_dereference(c->snapshots); + EBUG_ON(ret != bch2_snapshot_is_ancestor_early(c, id, ancestor)); + } else { + ret = id == ancestor; + } - while (id && id < ancestor) - id = __snapshot_t(t, id)->parent; rcu_read_unlock(); - return id == ancestor; + return ret; } static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) @@ -249,10 +256,9 @@ int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { id = le32_to_cpu(s.v->skip[i]); - if (!id != !s.v->parent || - (s.v->parent && - id <= k.k->p.offset)) { - prt_printf(err, "bad skiplist node %u)", id); + if ((id && !s.v->parent) || + (id && id <= k.k->p.offset)) { + prt_printf(err, "bad skiplist node %u", id); return -BCH_ERR_invalid_bkey; } } @@ -261,6 +267,23 @@ int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, return 0; } +static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id) +{ + struct snapshot_t *t = snapshot_t_mut(c, id); + u32 parent = id; + + while ((parent = bch2_snapshot_parent_early(c, parent)) && + parent - id - 1 < IS_ANCESTOR_BITMAP) + __set_bit(parent - id - 1, t->is_ancestor); +} + +static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id) +{ + mutex_lock(&c->snapshot_table_lock); + __set_is_ancestor_bitmap(c, id); + mutex_unlock(&c->snapshot_table_lock); +} + int bch2_mark_snapshot(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s_c new, @@ -281,7 +304,6 @@ int bch2_mark_snapshot(struct btree_trans *trans, if (new.k->type == KEY_TYPE_snapshot) { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); - u32 parent = id; t->parent = le32_to_cpu(s.v->parent); t->children[0] = le32_to_cpu(s.v->children[0]); @@ -301,9 +323,7 @@ int bch2_mark_snapshot(struct btree_trans *trans, t->skip[2] = 0; } - while ((parent = bch2_snapshot_parent_early(c, parent)) && - parent - id - 1 < IS_ANCESTOR_BITMAP) - __set_bit(parent - id - 1, t->is_ancestor); + __set_is_ancestor_bitmap(c, id); if (BCH_SNAPSHOT_DELETED(s.v)) { set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); @@ -616,28 +636,18 @@ u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id) return id; } -static int snapshot_skiplist_good(struct btree_trans *trans, struct bch_snapshot s) +static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s) { - struct bch_snapshot a; unsigned i; - int ret; - - for (i = 0; i < 3; i++) { - if (!s.parent != !s.skip[i]) - return false; - - if (!s.parent) - continue; - - ret = bch2_snapshot_lookup(trans, le32_to_cpu(s.skip[i]), &a); - if (bch2_err_matches(ret, ENOENT)) - return false; - if (ret) - return ret; - if (a.tree != s.tree) - return false; - } + for (i = 0; i < 3; i++) + if (!s.parent) { + if (s.skip[i]) + return false; + } else { + if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i]))) + return false; + } return true; } @@ -817,7 +827,7 @@ static int check_snapshot(struct btree_trans *trans, s = u->v; } - ret = snapshot_skiplist_good(trans, s); + ret = snapshot_skiplist_good(trans, k.k->p.offset, s); if (ret < 0) goto err; @@ -864,6 +874,775 @@ int bch2_check_snapshots(struct 
bch_fs *c) return ret; } +/* + * Mark a snapshot as deleted, for future cleanup: + */ +int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) +{ + struct btree_iter iter; + struct bkey_i_snapshot *s; + int ret = 0; + + s = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshots, POS(0, id), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(s); + if (unlikely(ret)) { + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), + trans->c, "missing snapshot %u", id); + return ret; + } + + /* already deleted? */ + if (BCH_SNAPSHOT_DELETED(&s->v)) + goto err; + + SET_BCH_SNAPSHOT_DELETED(&s->v, true); + SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); + s->v.subvol = 0; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s) +{ + if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1])) + swap(s->children[0], s->children[1]); +} + +int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; + struct btree_iter c_iter = (struct btree_iter) { NULL }; + struct btree_iter tree_iter = (struct btree_iter) { NULL }; + struct bkey_s_c_snapshot s; + u32 parent_id, child_id; + unsigned i; + int ret = 0; + + s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_INTENT, snapshot); + ret = bkey_err(s); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "missing snapshot %u", id); + + if (ret) + goto err; + + BUG_ON(s.v->children[1]); + + parent_id = le32_to_cpu(s.v->parent); + child_id = le32_to_cpu(s.v->children[0]); + + if (parent_id) { + struct bkey_i_snapshot *parent; + + parent = bch2_bkey_get_mut_typed(trans, &p_iter, + BTREE_ID_snapshots, POS(0, parent_id), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(parent); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "missing snapshot %u", parent_id); + if (unlikely(ret)) + goto err; + + /* find entry in parent->children for node being deleted */ + for (i = 0; i < 2; i++) + if (le32_to_cpu(parent->v.children[i]) == id) + break; + + if (bch2_fs_inconsistent_on(i == 2, c, + "snapshot %u missing child pointer to %u", + parent_id, id)) + goto err; + + parent->v.children[i] = le32_to_cpu(child_id); + + normalize_snapshot_child_pointers(&parent->v); + } + + if (child_id) { + struct bkey_i_snapshot *child; + + child = bch2_bkey_get_mut_typed(trans, &c_iter, + BTREE_ID_snapshots, POS(0, child_id), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(child); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "missing snapshot %u", child_id); + if (unlikely(ret)) + goto err; + + child->v.parent = cpu_to_le32(parent_id); + + if (!child->v.parent) { + child->v.skip[0] = 0; + child->v.skip[1] = 0; + child->v.skip[2] = 0; + } + } + + if (!parent_id) { + /* + * We're deleting the root of a snapshot tree: update the + * snapshot_tree entry to point to the new root, or delete it if + * this is the last snapshot ID in this tree: + */ + struct bkey_i_snapshot_tree *s_t; + + BUG_ON(s.v->children[1]); + + s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, + BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)), + 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(s_t); + if (ret) + goto err; + + if (s.v->children[0]) { + s_t->v.root_snapshot = s.v->children[0]; + } else { + s_t->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&s_t->k, 0); + } + } + + ret = bch2_btree_delete_at(trans, &iter, 0); +err: + bch2_trans_iter_exit(trans, &tree_iter); + 
bch2_trans_iter_exit(trans, &p_iter); + bch2_trans_iter_exit(trans, &c_iter); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_i_snapshot *n; + struct bkey_s_c k; + unsigned i, j; + u32 depth = bch2_snapshot_depth(c, parent); + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, + POS_MIN, BTREE_ITER_INTENT); + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + for (i = 0; i < nr_snapids; i++) { + k = bch2_btree_iter_prev_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k || !k.k->p.offset) { + ret = -BCH_ERR_ENOSPC_snapshot_create; + goto err; + } + + n = bch2_bkey_alloc(trans, &iter, 0, snapshot); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + n->v.flags = 0; + n->v.parent = cpu_to_le32(parent); + n->v.subvol = cpu_to_le32(snapshot_subvols[i]); + n->v.tree = cpu_to_le32(tree); + n->v.depth = cpu_to_le32(depth); + + for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) + n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent)); + + bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32); + SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); + + ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, + bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); + if (ret) + goto err; + + new_snapids[i] = iter.pos.offset; + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* + * Create new snapshot IDs as children of an existing snapshot ID: + */ +static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + struct btree_iter iter; + struct bkey_i_snapshot *n_parent; + int ret = 0; + + n_parent = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshots, POS(0, parent), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(n_parent); + if (unlikely(ret)) { + if (bch2_err_matches(ret, ENOENT)) + bch_err(trans->c, "snapshot %u not found", parent); + return ret; + } + + if (n_parent->v.children[0] || n_parent->v.children[1]) { + bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); + ret = -EINVAL; + goto err; + } + + ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree), + new_snapids, snapshot_subvols, nr_snapids); + if (ret) + goto err; + + n_parent->v.children[0] = cpu_to_le32(new_snapids[0]); + n_parent->v.children[1] = cpu_to_le32(new_snapids[1]); + n_parent->v.subvol = 0; + SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* + * Create a snapshot node that is the root of a new tree: + */ +static int bch2_snapshot_node_create_tree(struct btree_trans *trans, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + struct bkey_i_snapshot_tree *n_tree; + int ret; + + n_tree = __bch2_snapshot_tree_create(trans); + ret = PTR_ERR_OR_ZERO(n_tree) ?: + create_snapids(trans, 0, n_tree->k.p.offset, + new_snapids, snapshot_subvols, nr_snapids); + if (ret) + return ret; + + n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]); + n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]); + return 0; +} + +int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + BUG_ON((parent == 0) != (nr_snapids == 1)); + BUG_ON((parent != 0) != 
(nr_snapids == 2)); + + return parent + ? bch2_snapshot_node_create_children(trans, parent, + new_snapids, snapshot_subvols, nr_snapids) + : bch2_snapshot_node_create_tree(trans, + new_snapids, snapshot_subvols, nr_snapids); + +} + +/* + * If we have an unlinked inode in an internal snapshot node, and the inode + * really has been deleted in all child snapshots, how does this get cleaned up? + * + * first there is the problem of how keys that have been overwritten in all + * child snapshots get deleted (unimplemented?), but inodes may perhaps be + * special? + * + * also: unlinked inode in internal snapshot appears to not be getting deleted + * correctly if inode doesn't exist in leaf snapshots + * + * solution: + * + * for a key in an interior snapshot node that needs work to be done that + * requires it to be mutated: iterate over all descendent leaf nodes and copy + * that key to snapshot leaf nodes, where we can mutate it + */ + +static int snapshot_delete_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + snapshot_id_list *deleted, + snapshot_id_list *equiv_seen, + struct bpos *last_pos) +{ + struct bch_fs *c = trans->c; + u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); + + if (!bkey_eq(k.k->p, *last_pos)) + equiv_seen->nr = 0; + *last_pos = k.k->p; + + if (snapshot_list_has_id(deleted, k.k->p.snapshot) || + snapshot_list_has_id(equiv_seen, equiv)) { + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + } else { + return snapshot_list_add(c, equiv_seen, equiv); + } +} + +static int move_key_to_correct_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); + + /* + * When we have a linear chain of snapshot nodes, we consider + * those to form an equivalence class: we're going to collapse + * them all down to a single node, and keep the leaf-most node - + * which has the same id as the equivalence class id. + * + * If there are multiple keys in different snapshots at the same + * position, we're only going to keep the one in the newest + * snapshot - the rest have been overwritten and are redundant, + * and for the key we're going to keep we need to move it to the + * equivalance class ID if it's not there already. + */ + if (equiv != k.k->p.snapshot) { + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + struct btree_iter new_iter; + int ret; + + ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + new->k.p.snapshot = equiv; + + bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p, + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(&new_iter) ?: + bch2_trans_update(trans, &new_iter, new, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_iter_exit(trans, &new_iter); + if (ret) + return ret; + } + + return 0; +} + +/* + * For a given snapshot, if it doesn't have a subvolume that points to it, and + * it doesn't have child snapshot nodes - it's now redundant and we can mark it + * as deleted. 
+ */ +static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bkey_s_c_snapshot snap; + u32 children[2]; + int ret; + + if (k.k->type != KEY_TYPE_snapshot) + return 0; + + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v) || + BCH_SNAPSHOT_SUBVOL(snap.v)) + return 0; + + children[0] = le32_to_cpu(snap.v->children[0]); + children[1] = le32_to_cpu(snap.v->children[1]); + + ret = bch2_snapshot_live(trans, children[0]) ?: + bch2_snapshot_live(trans, children[1]); + if (ret < 0) + return ret; + + if (!ret) + return bch2_snapshot_node_set_deleted(trans, k.k->p.offset); + return 0; +} + +static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, + snapshot_id_list *skip) +{ + rcu_read_lock(); + while (n--) { + do { + id = __bch2_snapshot_parent(c, id); + } while (snapshot_list_has_id(skip, id)); + } + rcu_read_unlock(); + + return id; +} + +static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, + struct btree_iter *iter, struct bkey_s_c k, + snapshot_id_list *deleted) +{ + struct bch_fs *c = trans->c; + u32 nr_deleted_ancestors = 0; + struct bkey_i_snapshot *s; + u32 *i; + int ret; + + if (k.k->type != KEY_TYPE_snapshot) + return 0; + + if (snapshot_list_has_id(deleted, k.k->p.offset)) + return 0; + + s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot); + ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + darray_for_each(*deleted, i) + nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i); + + if (!nr_deleted_ancestors) + return 0; + + le32_add_cpu(&s->v.depth, -nr_deleted_ancestors); + + if (!s->v.depth) { + s->v.skip[0] = 0; + s->v.skip[1] = 0; + s->v.skip[2] = 0; + } else { + u32 depth = le32_to_cpu(s->v.depth); + u32 parent = bch2_snapshot_parent(c, s->k.p.offset); + + for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) { + u32 id = le32_to_cpu(s->v.skip[j]); + + if (snapshot_list_has_id(deleted, id)) { + id = depth > 1 + ? 
bch2_snapshot_nth_parent_skip(c, + parent, + get_random_u32_below(depth - 1), + deleted) + : parent; + s->v.skip[j] = cpu_to_le32(id); + } + } + + bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32); + } + + return bch2_trans_update(trans, iter, &s->k_i, 0); +} + +int bch2_delete_dead_snapshots(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_snapshot snap; + snapshot_id_list deleted = { 0 }; + snapshot_id_list deleted_interior = { 0 }; + u32 *i, id; + int ret = 0; + + if (!test_bit(BCH_FS_STARTED, &c->flags)) { + ret = bch2_fs_read_write_early(c); + if (ret) { + bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret)); + return ret; + } + } + + bch2_trans_init(&trans, c, 0, 0); + + /* + * For every snapshot node: If we have no live children and it's not + * pointed to by a subvolume, delete it: + */ + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + NULL, NULL, 0, + bch2_delete_redundant_snapshot(&trans, &iter, k)); + if (ret) { + bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret)); + goto err; + } + + for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + bch2_snapshot_set_equiv(&trans, k)); + if (ret) { + bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret)); + goto err; + } + + for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_snapshot) + continue; + + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v)) { + ret = snapshot_list_add(c, &deleted, k.k->p.offset); + if (ret) + break; + } + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) { + bch_err_msg(c, ret, "walking snapshots"); + goto err; + } + + for (id = 0; id < BTREE_ID_NR; id++) { + struct bpos last_pos = POS_MIN; + snapshot_id_list equiv_seen = { 0 }; + struct disk_reservation res = { 0 }; + + if (!btree_type_has_snapshots(id)) + continue; + + ret = for_each_btree_key_commit(&trans, iter, + id, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + &res, NULL, BTREE_INSERT_NOFAIL, + snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?: + for_each_btree_key_commit(&trans, iter, + id, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + &res, NULL, BTREE_INSERT_NOFAIL, + move_key_to_correct_snapshot(&trans, &iter, k)); + + bch2_disk_reservation_put(c, &res); + darray_exit(&equiv_seen); + + if (ret) { + bch_err_msg(c, ret, "deleting keys from dying snapshots"); + goto err; + } + } + + for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + u32 snapshot = k.k->p.offset; + u32 equiv = bch2_snapshot_equiv(c, snapshot); + + if (equiv != snapshot) + snapshot_list_add(c, &deleted_interior, snapshot); + } + bch2_trans_iter_exit(&trans, &iter); + + /* + * Fixing children of deleted snapshots can't be done completely + * atomically, if we crash between here and when we delete the interior + * nodes some depth fields will be off: + */ + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, POS_MIN, + BTREE_ITER_INTENT, k, + NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_fix_child_of_deleted_snapshot(&trans, &iter, k, &deleted_interior)); + if (ret) + goto err; + + darray_for_each(deleted, i) { + ret = commit_do(&trans, NULL, NULL, 0, + bch2_snapshot_node_delete(&trans, *i)); + if (ret) { + bch_err_msg(c, ret, "deleting snapshot %u", *i); + goto err; + } + } + + darray_for_each(deleted_interior, i) { + ret = 
commit_do(&trans, NULL, NULL, 0, + bch2_snapshot_node_delete(&trans, *i)); + if (ret) { + bch_err_msg(c, ret, "deleting snapshot %u", *i); + goto err; + } + } + + clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); +err: + darray_exit(&deleted_interior); + darray_exit(&deleted); + bch2_trans_exit(&trans); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +void bch2_delete_dead_snapshots_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); + + if (test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) + bch2_delete_dead_snapshots(c); + bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); +} + +void bch2_delete_dead_snapshots_async(struct bch_fs *c) +{ + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && + !queue_work(c->write_ref_wq, &c->snapshot_delete_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); +} + +int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *h) +{ + struct bch_fs *c = trans->c; + + set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); + + if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots) + return 0; + + bch2_delete_dead_snapshots_async(c); + return 0; +} + +int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, + enum btree_id id, + struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, id, pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while (1) { + k = bch2_btree_iter_prev(&iter); + ret = bkey_err(k); + if (ret) + break; + + if (!k.k) + break; + + if (!bkey_eq(pos, k.k->p)) + break; + + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { + ret = 1; + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id) +{ + const struct snapshot_t *s = snapshot_t(c, id); + + return s->children[1] ?: s->children[0]; +} + +static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id) +{ + u32 child; + + while ((child = bch2_snapshot_smallest_child(c, id))) + id = child; + return id; +} + +static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans, + enum btree_id btree, + struct bkey_s_c interior_k, + u32 leaf_id, struct bpos *new_min_pos) +{ + struct btree_iter iter; + struct bpos pos = interior_k.k->p; + struct bkey_s_c k; + struct bkey_i *new; + int ret; + + pos.snapshot = leaf_id; + + bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto out; + + /* key already overwritten in this snapshot? 
*/ + if (k.k->p.snapshot != interior_k.k->p.snapshot) + goto out; + + if (bpos_eq(*new_min_pos, POS_MIN)) { + *new_min_pos = k.k->p; + new_min_pos->snapshot = leaf_id; + } + + new = bch2_bkey_make_mut_noupdate(trans, interior_k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto out; + + new->k.p.snapshot = leaf_id; + ret = bch2_trans_update(trans, &iter, new, 0); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans, + enum btree_id btree, + struct bkey_s_c k, + struct bpos *new_min_pos) +{ + struct bch_fs *c = trans->c; + struct bkey_buf sk; + int ret; + + bch2_bkey_buf_init(&sk); + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + + *new_min_pos = POS_MIN; + + for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot); + id < k.k->p.snapshot; + id++) { + if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) || + !bch2_snapshot_is_leaf(c, id)) + continue; + + ret = commit_do(trans, NULL, NULL, 0, + bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos)); + if (ret) + break; + } + + bch2_bkey_buf_exit(&sk, c); + return ret; +} + int bch2_snapshots_read(struct bch_fs *c) { struct btree_iter iter; @@ -874,7 +1653,10 @@ int bch2_snapshots_read(struct bch_fs *c) for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: - bch2_snapshot_set_equiv(&trans, k))); + bch2_snapshot_set_equiv(&trans, k)) ?: + for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); if (ret) bch_err_fn(c, ret); return ret; diff --git a/libbcachefs/snapshot.h b/libbcachefs/snapshot.h index c59b7e9..dabc9b9 100644 --- a/libbcachefs/snapshot.h +++ b/libbcachefs/snapshot.h @@ -244,6 +244,28 @@ int bch2_snapshot_node_create(struct btree_trans *, u32, int bch2_check_snapshot_trees(struct bch_fs *); int bch2_check_snapshots(struct bch_fs *); + +int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); +int bch2_delete_dead_snapshots_hook(struct btree_trans *, + struct btree_trans_commit_hook *); +void bch2_delete_dead_snapshots_work(struct work_struct *); + +int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos); + +static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, + enum btree_id id, + struct bpos pos) +{ + if (!btree_type_has_snapshots(id) || + bch2_snapshot_is_leaf(trans->c, pos.snapshot)) + return 0; + + return __bch2_key_has_snapshot_overwrites(trans, id, pos); +} + +int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *, enum btree_id, + struct bkey_s_c, struct bpos *); + int bch2_snapshots_read(struct bch_fs *); void bch2_fs_snapshots_exit(struct bch_fs *); diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c index 571cb28..0214a98 100644 --- a/libbcachefs/subvolume.c +++ b/libbcachefs/subvolume.c @@ -96,458 +96,6 @@ int bch2_check_subvols(struct bch_fs *c) return ret; } -/* - * Mark a snapshot as deleted, for future cleanup: - */ -static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) -{ - struct btree_iter iter; - struct bkey_i_snapshot *s; - int ret = 0; - - s = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_snapshots, POS(0, id), - 0, snapshot); - ret = PTR_ERR_OR_ZERO(s); - if (unlikely(ret)) { - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), - trans->c, "missing snapshot %u", id); - return ret; - } - - /* already deleted? 
*/ - if (BCH_SNAPSHOT_DELETED(&s->v)) - goto err; - - SET_BCH_SNAPSHOT_DELETED(&s->v, true); - SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); - s->v.subvol = 0; -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; - struct btree_iter tree_iter = (struct btree_iter) { NULL }; - struct bkey_s_c_snapshot s; - u32 parent_id; - unsigned i; - int ret = 0; - - s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_INTENT, snapshot); - ret = bkey_err(s); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "missing snapshot %u", id); - - if (ret) - goto err; - - BUG_ON(!BCH_SNAPSHOT_DELETED(s.v)); - parent_id = le32_to_cpu(s.v->parent); - - if (parent_id) { - struct bkey_i_snapshot *parent; - - parent = bch2_bkey_get_mut_typed(trans, &p_iter, - BTREE_ID_snapshots, POS(0, parent_id), - 0, snapshot); - ret = PTR_ERR_OR_ZERO(parent); - if (unlikely(ret)) { - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "missing snapshot %u", parent_id); - goto err; - } - - for (i = 0; i < 2; i++) - if (le32_to_cpu(parent->v.children[i]) == id) - break; - - if (i == 2) - bch_err(c, "snapshot %u missing child pointer to %u", - parent_id, id); - else - parent->v.children[i] = 0; - - if (le32_to_cpu(parent->v.children[0]) < - le32_to_cpu(parent->v.children[1])) - swap(parent->v.children[0], - parent->v.children[1]); - } else { - /* - * We're deleting the root of a snapshot tree: update the - * snapshot_tree entry to point to the new root, or delete it if - * this is the last snapshot ID in this tree: - */ - struct bkey_i_snapshot_tree *s_t; - - BUG_ON(s.v->children[1]); - - s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, - BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)), - 0, snapshot_tree); - ret = PTR_ERR_OR_ZERO(s_t); - if (ret) - goto err; - - if (s.v->children[0]) { - s_t->v.root_snapshot = s.v->children[0]; - } else { - s_t->k.type = KEY_TYPE_deleted; - set_bkey_val_u64s(&s_t->k, 0); - } - } - - ret = bch2_btree_delete_at(trans, &iter, 0); -err: - bch2_trans_iter_exit(trans, &tree_iter); - bch2_trans_iter_exit(trans, &p_iter); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_i_snapshot *n; - struct bkey_s_c k; - unsigned i, j; - u32 depth = bch2_snapshot_depth(c, parent); - int ret; - - bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, - POS_MIN, BTREE_ITER_INTENT); - k = bch2_btree_iter_peek(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - for (i = 0; i < nr_snapids; i++) { - k = bch2_btree_iter_prev_slot(&iter); - ret = bkey_err(k); - if (ret) - goto err; - - if (!k.k || !k.k->p.offset) { - ret = -BCH_ERR_ENOSPC_snapshot_create; - goto err; - } - - n = bch2_bkey_alloc(trans, &iter, 0, snapshot); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - n->v.flags = 0; - n->v.parent = cpu_to_le32(parent); - n->v.subvol = cpu_to_le32(snapshot_subvols[i]); - n->v.tree = cpu_to_le32(tree); - n->v.depth = cpu_to_le32(depth); - - for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) - n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent)); - - bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32); - SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); - - ret = 
bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, - bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); - if (ret) - goto err; - - new_snapids[i] = iter.pos.offset; - } -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* - * Create new snapshot IDs as children of an existing snapshot ID: - */ -static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) -{ - struct btree_iter iter; - struct bkey_i_snapshot *n_parent; - int ret = 0; - - n_parent = bch2_bkey_get_mut_typed(trans, &iter, - BTREE_ID_snapshots, POS(0, parent), - 0, snapshot); - ret = PTR_ERR_OR_ZERO(n_parent); - if (unlikely(ret)) { - if (bch2_err_matches(ret, ENOENT)) - bch_err(trans->c, "snapshot %u not found", parent); - return ret; - } - - if (n_parent->v.children[0] || n_parent->v.children[1]) { - bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); - ret = -EINVAL; - goto err; - } - - ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree), - new_snapids, snapshot_subvols, nr_snapids); - if (ret) - goto err; - - n_parent->v.children[0] = cpu_to_le32(new_snapids[0]); - n_parent->v.children[1] = cpu_to_le32(new_snapids[1]); - n_parent->v.subvol = 0; - SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -/* - * Create a snapshot node that is the root of a new tree: - */ -static int bch2_snapshot_node_create_tree(struct btree_trans *trans, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) -{ - struct bkey_i_snapshot_tree *n_tree; - int ret; - - n_tree = __bch2_snapshot_tree_create(trans); - ret = PTR_ERR_OR_ZERO(n_tree) ?: - create_snapids(trans, 0, n_tree->k.p.offset, - new_snapids, snapshot_subvols, nr_snapids); - if (ret) - return ret; - - n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]); - n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]); - return 0; -} - -int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, - u32 *new_snapids, - u32 *snapshot_subvols, - unsigned nr_snapids) -{ - BUG_ON((parent == 0) != (nr_snapids == 1)); - BUG_ON((parent != 0) != (nr_snapids == 2)); - - return parent - ? bch2_snapshot_node_create_children(trans, parent, - new_snapids, snapshot_subvols, nr_snapids) - : bch2_snapshot_node_create_tree(trans, - new_snapids, snapshot_subvols, nr_snapids); - -} - -/* - * If we have an unlinked inode in an internal snapshot node, and the inode - * really has been deleted in all child snapshots, how does this get cleaned up? - * - * first there is the problem of how keys that have been overwritten in all - * child snapshots get deleted (unimplemented?), but inodes may perhaps be - * special? 
- * - * also: unlinked inode in internal snapshot appears to not be getting deleted - * correctly if inode doesn't exist in leaf snapshots - */ - -static int snapshot_delete_key(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k, - snapshot_id_list *deleted, - snapshot_id_list *equiv_seen, - struct bpos *last_pos) -{ - struct bch_fs *c = trans->c; - u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); - - if (!bkey_eq(k.k->p, *last_pos)) - equiv_seen->nr = 0; - *last_pos = k.k->p; - - if (snapshot_list_has_id(deleted, k.k->p.snapshot) || - snapshot_list_has_id(equiv_seen, equiv)) { - return bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); - } else { - return snapshot_list_add(c, equiv_seen, equiv); - } -} - -/* - * For a given snapshot, if it doesn't have a subvolume that points to it, and - * it doesn't have child snapshot nodes - it's now redundant and we can mark it - * as deleted. - */ -static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k) -{ - struct bkey_s_c_snapshot snap; - u32 children[2]; - int ret; - - if (k.k->type != KEY_TYPE_snapshot) - return 0; - - snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_DELETED(snap.v) || - BCH_SNAPSHOT_SUBVOL(snap.v)) - return 0; - - children[0] = le32_to_cpu(snap.v->children[0]); - children[1] = le32_to_cpu(snap.v->children[1]); - - ret = bch2_snapshot_live(trans, children[0]) ?: - bch2_snapshot_live(trans, children[1]); - if (ret < 0) - return ret; - - if (!ret) - return bch2_snapshot_node_set_deleted(trans, k.k->p.offset); - return 0; -} - -int bch2_delete_dead_snapshots(struct bch_fs *c) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_snapshot snap; - snapshot_id_list deleted = { 0 }; - u32 i, id; - int ret = 0; - - if (!test_bit(BCH_FS_STARTED, &c->flags)) { - ret = bch2_fs_read_write_early(c); - if (ret) { - bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret)); - return ret; - } - } - - bch2_trans_init(&trans, c, 0, 0); - - /* - * For every snapshot node: If we have no live children and it's not - * pointed to by a subvolume, delete it: - */ - ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - NULL, NULL, 0, - bch2_delete_redundant_snapshot(&trans, &iter, k)); - if (ret) { - bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret)); - goto err; - } - - for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, - bch2_snapshot_set_equiv(&trans, k)); - if (ret) { - bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret)); - goto err; - } - - for_each_btree_key(&trans, iter, BTREE_ID_snapshots, - POS_MIN, 0, k, ret) { - if (k.k->type != KEY_TYPE_snapshot) - continue; - - snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_DELETED(snap.v)) { - ret = snapshot_list_add(c, &deleted, k.k->p.offset); - if (ret) - break; - } - } - bch2_trans_iter_exit(&trans, &iter); - - if (ret) { - bch_err(c, "error walking snapshots: %s", bch2_err_str(ret)); - goto err; - } - - for (id = 0; id < BTREE_ID_NR; id++) { - struct bpos last_pos = POS_MIN; - snapshot_id_list equiv_seen = { 0 }; - - if (!btree_type_has_snapshots(id)) - continue; - - ret = for_each_btree_key_commit(&trans, iter, - id, POS_MIN, - BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, - NULL, NULL, BTREE_INSERT_NOFAIL, - snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)); - - darray_exit(&equiv_seen); - - if (ret) { - 
bch_err(c, "error deleting snapshot keys: %s", bch2_err_str(ret)); - goto err; - } - } - - for (i = 0; i < deleted.nr; i++) { - ret = commit_do(&trans, NULL, NULL, 0, - bch2_snapshot_node_delete(&trans, deleted.data[i])); - if (ret) { - bch_err(c, "error deleting snapshot %u: %s", - deleted.data[i], bch2_err_str(ret)); - goto err; - } - } - - clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); -err: - darray_exit(&deleted); - bch2_trans_exit(&trans); - if (ret) - bch_err_fn(c, ret); - return ret; -} - -static void bch2_delete_dead_snapshots_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); - - if (test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) - bch2_delete_dead_snapshots(c); - bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); -} - -void bch2_delete_dead_snapshots_async(struct bch_fs *c) -{ - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && - !queue_work(c->write_ref_wq, &c->snapshot_delete_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); -} - -static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, - struct btree_trans_commit_hook *h) -{ - struct bch_fs *c = trans->c; - - set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); - - if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots) - return 0; - - bch2_delete_dead_snapshots_async(c); - return 0; -} - /* Subvolumes: */ int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k, diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 48670c8..e7dbc31 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -581,6 +581,8 @@ void bch2_fs_free(struct bch_fs *c) { unsigned i; + BUG_ON(!test_bit(BCH_FS_STOPPING, &c->flags)); + mutex_lock(&bch_fs_list_lock); list_del(&c->list); mutex_unlock(&bch_fs_list_lock); diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 740305e..941f4bc 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -248,7 +248,6 @@ read_attribute(new_stripes); read_attribute(io_timers_read); read_attribute(io_timers_write); -read_attribute(data_jobs); read_attribute(moving_ctxts); #ifdef CONFIG_BCACHEFS_TESTS @@ -458,9 +457,6 @@ SHOW(bch2_fs) if (attr == &sysfs_io_timers_write) bch2_io_timers_to_text(out, &c->io_clock[WRITE]); - if (attr == &sysfs_data_jobs) - bch2_data_jobs_to_text(out, c); - if (attr == &sysfs_moving_ctxts) bch2_fs_moving_ctxts_to_text(out, c); @@ -681,7 +677,6 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_rebalance_work, sysfs_pd_controller_files(rebalance), - &sysfs_data_jobs, &sysfs_moving_ctxts, &sysfs_internal_uuid, -- 2.39.2
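Aside: the skiplist validation touched by this patch (snapshot_skiplist_good() now checks each skip entry with bch2_snapshot_is_ancestor_early()) ultimately rests on a plain parent-chain walk. The fragment below is a simplified, standalone sketch of that walk, not the bcachefs implementation: the table, struct and function names are invented for illustration, and it omits the RCU protection, the skip pointers and the is_ancestor bitmap that the real code uses to make the check fast.

#include <stdbool.h>
#include <stdint.h>

struct snap_node {
	uint32_t parent;	/* 0 == no parent (root of the snapshot tree) */
};

/*
 * Snapshot IDs are allocated so that a parent always has a larger ID than
 * any of its descendants, which is why the walk can stop as soon as
 * id >= ancestor. tbl is a hypothetical array indexed by snapshot ID.
 */
static bool snap_is_ancestor(const struct snap_node *tbl, uint32_t id, uint32_t ancestor)
{
	while (id && id < ancestor)
		id = tbl[id].parent;

	return id == ancestor;
}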