From: Kent Overstreet Date: Sat, 10 Feb 2024 02:30:46 +0000 (-0500) Subject: Update bcachefs sources to bee7b5a4fa21 bcachefs: Pin btree cache in ram for random... X-Git-Url: https://git.sesse.net/?p=bcachefs-tools-debian;a=commitdiff_plain;h=7a716b76b5963dc2d158883f4497bab221932412 Update bcachefs sources to bee7b5a4fa21 bcachefs: Pin btree cache in ram for random access in fsck Signed-off-by: Kent Overstreet --- diff --git a/.bcachefs_revision b/.bcachefs_revision index d3c500b..1aa1789 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -50847e296b34efabe199e408ec4d72f10a866c39 +bee7b5a4fa2135c9ec9d1c9424018ee494500bb5 diff --git a/include/linux/sort.h b/include/linux/sort.h index afea044..17c6ba3 100644 --- a/include/linux/sort.h +++ b/include/linux/sort.h @@ -2,6 +2,12 @@ #define _LINUX_SORT_H #include +#include + +void sort_r(void *base, size_t num, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv); static inline void sort(void *base, size_t num, size_t size, int (*cmp_func)(const void *, const void *), diff --git a/libbcachefs/backpointers.c b/libbcachefs/backpointers.c index b4dc319..23fe937 100644 --- a/libbcachefs/backpointers.c +++ b/libbcachefs/backpointers.c @@ -129,8 +129,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, printbuf_exit(&buf); if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { - bch2_inconsistent_error(c); - return -EIO; + return bch2_inconsistent_error(c) ? BCH_ERR_erofs_unfixed_errors : 0; } else { return 0; } @@ -553,60 +552,61 @@ static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) }; } -static size_t btree_nodes_fit_in_ram(struct bch_fs *c) +static u64 mem_may_pin_bytes(struct bch_fs *c) { struct sysinfo i; - u64 mem_bytes; - si_meminfo(&i); - mem_bytes = i.totalram * i.mem_unit; - return div_u64(mem_bytes >> 1, c->opts.btree_node_size); + + u64 mem_bytes = i.totalram * i.mem_unit; + return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100); +} + +static size_t btree_nodes_fit_in_ram(struct bch_fs *c) +{ + return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size); } static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, - unsigned btree_leaf_mask, - unsigned btree_interior_mask, + u64 btree_leaf_mask, + u64 btree_interior_mask, struct bbpos start, struct bbpos *end) { - struct btree_iter iter; - struct bkey_s_c k; - size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); - enum btree_id btree; + struct bch_fs *c = trans->c; + s64 mem_may_pin = mem_may_pin_bytes(c); int ret = 0; - for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) { - unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2; + btree_interior_mask |= btree_leaf_mask; + + c->btree_cache.pinned_nodes_leaf_mask = btree_leaf_mask; + c->btree_cache.pinned_nodes_interior_mask = btree_interior_mask; + c->btree_cache.pinned_nodes_start = start; + c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX; + + for (enum btree_id btree = start.btree; + btree < BTREE_ID_NR && !ret; + btree++) { + unsigned depth = ((1U << btree) & btree_leaf_mask) ? 0 : 1; + struct btree_iter iter; + struct btree *b; if (!((1U << btree) & btree_leaf_mask) && !((1U << btree) & btree_interior_mask)) continue; - bch2_trans_node_iter_init(trans, &iter, btree, - btree == start.btree ? start.pos : POS_MIN, - 0, depth, 0); - /* - * for_each_btree_key_contineu() doesn't check the return value - * from bch2_btree_iter_advance(), which is needed when - * iterating over interior nodes where we'll see keys at - * SPOS_MAX: - */ - do { - k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0); - ret = bkey_err(k); - if (!k.k || ret) - break; - - --btree_nodes; - if (!btree_nodes) { - *end = BBPOS(btree, k.k->p); + __for_each_btree_node(trans, iter, btree, + btree == start.btree ? start.pos : POS_MIN, + 0, depth, BTREE_ITER_PREFETCH, b, ret) { + mem_may_pin -= btree_buf_bytes(b); + if (mem_may_pin <= 0) { + c->btree_cache.pinned_nodes_end = *end = + BBPOS(btree, b->key.k.p); bch2_trans_iter_exit(trans, &iter); return 0; } - } while (bch2_btree_iter_advance(&iter)); + } bch2_trans_iter_exit(trans, &iter); } - *end = BBPOS_MAX; return ret; } @@ -664,62 +664,6 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, return 0; } -static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c, - struct bpos bucket) -{ - return bch2_dev_exists2(c, bucket.inode) - ? bucket_pos_to_bp(c, bucket, 0) - : bucket; -} - -static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, - struct bpos start, struct bpos *end) -{ - struct btree_iter alloc_iter; - struct btree_iter bp_iter; - struct bkey_s_c alloc_k, bp_k; - size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); - bool alloc_end = false, bp_end = false; - int ret = 0; - - bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc, - start, 0, 1, 0); - bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0); - while (1) { - alloc_k = !alloc_end - ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0) - : bkey_s_c_null; - bp_k = !bp_end - ? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0) - : bkey_s_c_null; - - ret = bkey_err(alloc_k) ?: bkey_err(bp_k); - if ((!alloc_k.k && !bp_k.k) || ret) { - *end = SPOS_MAX; - break; - } - - --btree_nodes; - if (!btree_nodes) { - *end = alloc_k.k ? alloc_k.k->p : SPOS_MAX; - break; - } - - if (bpos_lt(alloc_iter.pos, SPOS_MAX) && - bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) { - if (!bch2_btree_iter_advance(&alloc_iter)) - alloc_end = true; - } else { - if (!bch2_btree_iter_advance(&bp_iter)) - bp_end = true; - } - } - bch2_trans_iter_exit(trans, &bp_iter); - bch2_trans_iter_exit(trans, &alloc_iter); - return ret; -} - int bch2_check_extents_to_backpointers(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); @@ -730,10 +674,16 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) bkey_init(&s.last_flushed.k->k); while (1) { - ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end); + struct bbpos end; + ret = bch2_get_btree_in_memory_pos(trans, + BIT_ULL(BTREE_ID_backpointers), + BIT_ULL(BTREE_ID_backpointers), + BBPOS(BTREE_ID_backpointers, s.bucket_start), &end); if (ret) break; + s.bucket_end = end.pos; + if ( bpos_eq(s.bucket_start, POS_MIN) && !bpos_eq(s.bucket_end, SPOS_MAX)) bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", @@ -761,6 +711,9 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) bch2_trans_put(trans); bch2_bkey_buf_exit(&s.last_flushed, c); + c->btree_cache.pinned_nodes_leaf_mask = 0; + c->btree_cache.pinned_nodes_interior_mask = 0; + bch_err_fn(c, ret); return ret; } @@ -866,6 +819,9 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) } bch2_trans_put(trans); + c->btree_cache.pinned_nodes_leaf_mask = 0; + c->btree_cache.pinned_nodes_interior_mask = 0; + bch_err_fn(c, ret); return ret; } diff --git a/libbcachefs/bbpos_types.h b/libbcachefs/bbpos_types.h index 5198e94..f638933 100644 --- a/libbcachefs/bbpos_types.h +++ b/libbcachefs/bbpos_types.h @@ -13,6 +13,6 @@ static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos) } #define BBPOS_MIN BBPOS(0, POS_MIN) -#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX) +#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, SPOS_MAX) #endif /* _BCACHEFS_BBPOS_TYPES_H */ diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 7036949..b53b321 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -505,6 +505,7 @@ enum gc_phase { GC_PHASE_BTREE_deleted_inodes, GC_PHASE_BTREE_logged_ops, GC_PHASE_BTREE_rebalance_work, + GC_PHASE_BTREE_subvolume_children, GC_PHASE_PENDING_DELETE, }; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 14f6136..1bb24aa 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -840,7 +840,9 @@ struct bch_sb_field_downgrade { x(snapshot_skiplists, BCH_VERSION(1, 1)) \ x(deleted_inodes, BCH_VERSION(1, 2)) \ x(rebalance_work, BCH_VERSION(1, 3)) \ - x(member_seq, BCH_VERSION(1, 4)) + x(member_seq, BCH_VERSION(1, 4)) \ + x(subvolume_fs_parent, BCH_VERSION(1, 5)) \ + x(btree_subvolume_children, BCH_VERSION(1, 6)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -1488,7 +1490,9 @@ enum btree_id_flags { BIT_ULL(KEY_TYPE_logged_op_truncate)| \ BIT_ULL(KEY_TYPE_logged_op_finsert)) \ x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \ - BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) + BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \ + x(subvolume_children, 19, 0, \ + BIT_ULL(KEY_TYPE_set)) enum btree_id { #define x(name, nr, ...) BTREE_ID_##name = nr, diff --git a/libbcachefs/bkey_methods.h b/libbcachefs/bkey_methods.h index 03efe8e..f8217b2 100644 --- a/libbcachefs/bkey_methods.h +++ b/libbcachefs/bkey_methods.h @@ -78,6 +78,7 @@ bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); enum btree_update_flags { __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END, + __BTREE_UPDATE_SNAPSHOT_WHITEOUT_CHECKS_DONE, __BTREE_UPDATE_NOJOURNAL, __BTREE_UPDATE_KEY_CACHE_RECLAIM, @@ -91,6 +92,8 @@ enum btree_update_flags { }; #define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) +#define BTREE_UPDATE_SNAPSHOT_WHITEOUT_CHECKS_DONE \ + (1U << __BTREE_UPDATE_SNAPSHOT_WHITEOUT_CHECKS_DONE) #define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL) #define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index 9b7ea12..7997504 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bbpos.h" #include "bkey_buf.h" #include "btree_cache.h" #include "btree_io.h" @@ -208,6 +209,18 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) int ret = 0; lockdep_assert_held(&bc->lock); + + struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p); + + u64 mask = b->c.level + ? bc->pinned_nodes_interior_mask + : bc->pinned_nodes_leaf_mask; + + if ((mask & BIT_ULL(b->c.btree_id)) && + bbpos_cmp(bc->pinned_nodes_start, pos) < 0 && + bbpos_cmp(bc->pinned_nodes_end, pos) >= 0) + return -BCH_ERR_ENOMEM_btree_node_reclaim; + wait_on_io: if (b->flags & ((1U << BTREE_NODE_dirty)| (1U << BTREE_NODE_read_in_flight)| @@ -905,7 +918,7 @@ retry: if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-EIO); + return ERR_PTR(-BCH_ERR_btree_node_read_error); } EBUG_ON(b->c.btree_id != path->btree_id); @@ -996,7 +1009,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-EIO); + return ERR_PTR(-BCH_ERR_btree_node_read_error); } EBUG_ON(b->c.btree_id != path->btree_id); @@ -1079,7 +1092,7 @@ lock_node: if (unlikely(btree_node_read_error(b))) { six_unlock_read(&b->c.lock); - b = ERR_PTR(-EIO); + b = ERR_PTR(-BCH_ERR_btree_node_read_error); goto out; } diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index eb92526..6c52f11 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -407,7 +407,7 @@ again: printbuf_reset(&buf); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); - if (mustfix_fsck_err_on(ret == -EIO, c, + if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), c, btree_node_unreadable, "Topology repair: unreadable btree node at btree %s level %u:\n" " %s", @@ -979,7 +979,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b false); ret = PTR_ERR_OR_ZERO(child); - if (ret == -EIO) { + if (bch2_err_matches(ret, EIO)) { bch2_topology_error(c); if (__fsck_err(c, diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 61b6093..8641570 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -581,8 +581,7 @@ static int __btree_err(int ret, break; case -BCH_ERR_btree_node_read_err_bad_node: bch2_print_string_as_lines(KERN_ERR, out.buf); - bch2_topology_error(c); - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO; + ret = bch2_topology_error(c); break; case -BCH_ERR_btree_node_read_err_incompatible: bch2_print_string_as_lines(KERN_ERR, out.buf); @@ -1737,7 +1736,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, list_move(&b->list, &c->btree_cache.freeable); mutex_unlock(&c->btree_cache.lock); - ret = -EIO; + ret = -BCH_ERR_btree_node_read_error; goto err; } @@ -1841,7 +1840,7 @@ static void btree_node_write_work(struct work_struct *work) bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) { - ret = -BCH_ERR_btree_write_all_failed; + ret = -BCH_ERR_btree_node_write_all_failed; goto err; } diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 3aac6ed..07b1de5 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -2303,7 +2303,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) btree_iter_path(trans, iter)->level); if (iter->flags & BTREE_ITER_WITH_JOURNAL) - return bkey_s_c_err(-EIO); + return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); @@ -2501,6 +2501,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) k = bch2_btree_iter_peek_upto(&iter2, end); if (k.k && !bkey_err(k)) { + swap(iter->key_cache_path, iter2.key_cache_path); iter->k = iter2.k; k.k = &iter->k; } @@ -2760,6 +2761,9 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) struct btree_trans *trans = src->trans; *dst = *src; +#ifdef TRACK_PATH_ALLOCATED + dst->ip_allocated = _RET_IP_; +#endif if (src->path) __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT); if (src->update_path) diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 0d5eecb..b2ebf14 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -6,6 +6,7 @@ #include #include +#include "bbpos_types.h" #include "btree_key_cache_types.h" #include "buckets_types.h" #include "errcode.h" @@ -173,6 +174,11 @@ struct btree_cache { */ struct task_struct *alloc_lock; struct closure_waitlist alloc_wait; + + struct bbpos pinned_nodes_start; + struct bbpos pinned_nodes_end; + u64 pinned_nodes_leaf_mask; + u64 pinned_nodes_interior_mask; }; struct btree_node_iter { @@ -654,6 +660,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type); BIT_ULL(BKEY_TYPE_inodes)| \ BIT_ULL(BKEY_TYPE_stripes)| \ BIT_ULL(BKEY_TYPE_reflink)| \ + BIT_ULL(BKEY_TYPE_subvolumes)| \ BIT_ULL(BKEY_TYPE_btree)) #define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \ @@ -727,7 +734,7 @@ struct btree_root { __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); u8 level; u8 alive; - s8 error; + s16 error; }; enum btree_gc_coalesce_fail_reason { diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update.c index e519311..d3d625d 100644 --- a/libbcachefs/btree_update.c +++ b/libbcachefs/btree_update.c @@ -82,40 +82,169 @@ static noinline int extent_back_merge(struct btree_trans *trans, return 0; } +static struct bkey_s_c peek_slot_including_whiteouts(struct btree_trans *trans, struct btree_iter *iter, + enum btree_id btree, struct bpos pos) +{ + struct bkey_s_c k; + int ret; + + for_each_btree_key_norestart(trans, *iter, btree, pos, + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_NOPRESERVE, k, ret) { + if (!bkey_eq(k.k->p, pos)) + break; + if (bch2_snapshot_is_ancestor(trans->c, pos.snapshot, k.k->p.snapshot)) + return k; + } + bch2_trans_iter_exit(trans, iter); + + return ret ? bkey_s_c_err(ret) : bkey_s_c_null; +} + /* * When deleting, check if we need to emit a whiteout (because we're overwriting * something in an ancestor snapshot) */ -static int need_whiteout_for_snapshot(struct btree_trans *trans, - enum btree_id btree_id, struct bpos pos) +static int need_whiteout_for_snapshot(struct btree_trans *trans, enum btree_id btree, struct bpos pos) +{ + pos.snapshot = bch2_snapshot_parent(trans->c, pos.snapshot); + if (!pos.snapshot) + return 0; + + struct btree_iter iter; + struct bkey_s_c k = peek_slot_including_whiteouts(trans, &iter, btree, pos); + int ret = bkey_err(k) ?: k.k && !bkey_whiteout(k.k); + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +/* + * We're overwriting a key at @pos in snapshot @snapshot, so we need to insert a + * whiteout: that might be in @snapshot, or if there are overwites in sibling + * snapshots, find the common ancestor where @pos is overwritten in every + * descendent and insert the whiteout there - which might be at @pos. + */ +static int delete_interior_snapshot_key(struct btree_trans *trans, + enum btree_id btree, + struct bpos whiteout, bool deleting, + struct bpos overwrite, bool old_is_whiteout) { + struct bch_fs *c = trans->c; + struct bpos orig_whiteout = whiteout, sib = whiteout; struct btree_iter iter; struct bkey_s_c k; - u32 snapshot = pos.snapshot; int ret; - if (!bch2_snapshot_parent(trans->c, pos.snapshot)) - return 0; + sib.snapshot = bch2_snapshot_sibling(c, sib.snapshot); - pos.snapshot++; + for_each_btree_key_norestart(trans, iter, btree, sib, + BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_INTENT, k, ret) { + BUG_ON(bpos_gt(k.k->p, overwrite)); - for_each_btree_key_norestart(trans, iter, btree_id, pos, - BTREE_ITER_ALL_SNAPSHOTS| - BTREE_ITER_NOPRESERVE, k, ret) { - if (!bkey_eq(k.k->p, pos)) + if (bpos_lt(k.k->p, sib)) /* unrelated branch - skip */ + continue; + if (bpos_gt(k.k->p, sib)) /* did not find @sib */ break; - if (bch2_snapshot_is_ancestor(trans->c, snapshot, - k.k->p.snapshot)) { - ret = !bkey_whiteout(k.k); + /* @overwrite is also written in @sib, now check parent */ + whiteout.snapshot = bch2_snapshot_parent(c, whiteout.snapshot); + if (bpos_eq(whiteout, overwrite)) break; - } + + sib = whiteout; + sib.snapshot = bch2_snapshot_sibling(c, sib.snapshot); } - bch2_trans_iter_exit(trans, &iter); + if (ret) + goto err; + + if (!deleting && bpos_eq(whiteout, orig_whiteout)) + goto out; + + if (!bpos_eq(iter.pos, whiteout)) { + bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_init(trans, &iter, btree, whiteout, BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + } + + iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + iter.flags |= BTREE_ITER_FILTER_SNAPSHOTS; + + struct bkey_i *delete = bch2_trans_kmalloc(trans, sizeof(*delete)); + ret = PTR_ERR_OR_ZERO(delete); + if (ret) + goto err; + + bkey_init(&delete->k); + delete->k.p = whiteout; + + ret = !bpos_eq(whiteout, overwrite) + ? !old_is_whiteout + : need_whiteout_for_snapshot(trans, btree, whiteout); + if (ret < 0) + goto err; + if (ret) + delete->k.type = KEY_TYPE_whiteout; + + ret = bch2_trans_update(trans, &iter, delete, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + BTREE_UPDATE_SNAPSHOT_WHITEOUT_CHECKS_DONE); +out: +err: + bch2_trans_iter_exit(trans, &iter); return ret; } +/* + * We're overwriting a key in a snapshot that has ancestors: if we're + * overwriting a key in a different snapshot, we need to check if it is now + * fully overritten and can be deleted, and if we're deleting a key in the + * current snapshot we need to check if we need to leave a whiteout. + */ +static noinline int +overwrite_interior_snapshot_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *k) +{ + struct bkey_s_c old = bch2_btree_iter_peek_slot(iter); + + int ret = bkey_err(old); + if (ret) + return ret; + + if (!bkey_deleted(old.k)) { + if (old.k->p.snapshot != k->k.p.snapshot) { + /* + * We're overwriting a key in a different snapshot: + * check if it's also been overwritten in siblings + */ + ret = delete_interior_snapshot_key(trans, iter->btree_id, + k->k.p, bkey_deleted(&k->k), + old.k->p, bkey_whiteout(old.k)); + if (ret) + return ret; + if (bkey_deleted(&k->k)) + return 1; + } else if (bkey_deleted(&k->k)) { + /* + * We're deleting a key in the current snapshot: + * check if we need to leave a whiteout + */ + ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); + if (unlikely(ret < 0)) + return ret; + if (ret) + k->k.type = KEY_TYPE_whiteout; + } + } + + return 0; +} + int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, enum btree_id id, struct bpos old_pos, @@ -503,32 +632,29 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k, enum btree_update_flags flags) { - btree_path_idx_t path_idx = iter->update_path ?: iter->path; - int ret; - if (iter->flags & BTREE_ITER_IS_EXTENTS) return bch2_trans_update_extent(trans, iter, k, flags); - if (bkey_deleted(&k->k) && - !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && - (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { - ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); - if (unlikely(ret < 0)) - return ret; - + if (!(flags & (BTREE_UPDATE_SNAPSHOT_WHITEOUT_CHECKS_DONE| + BTREE_UPDATE_KEY_CACHE_RECLAIM)) && + (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + bch2_snapshot_parent(trans->c, k->k.p.snapshot)) { + int ret = overwrite_interior_snapshot_key(trans, iter, k); if (ret) - k->k.type = KEY_TYPE_whiteout; + return ret < 0 ? ret : 0; } /* * Ensure that updates to cached btrees go to the key cache: */ + btree_path_idx_t path_idx = iter->update_path ?: iter->path; struct btree_path *path = trans->paths + path_idx; + if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && !path->cached && !path->level && btree_id_cached(trans->c, path->btree_id)) { - ret = bch2_trans_update_get_key_cache(trans, iter, path); + int ret = bch2_trans_update_get_key_cache(trans, iter, path); if (ret) return ret; @@ -789,6 +915,27 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, struct bpos pos, bool set) +{ + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); + int ret = PTR_ERR_OR_ZERO(k); + if (ret) + return ret; + + bkey_init(&k->k); + k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; + k->k.p = pos; + + struct btree_iter iter; + bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, 0); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, + struct bpos pos, bool set) { struct bkey_i k; diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index b9382b7..cc7c53e 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -63,11 +63,12 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct bpos, unsigned, u64 *); int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); +int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool); static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans, enum btree_id btree, struct bpos pos) { - return bch2_btree_bit_mod(trans, btree, pos, false); + return bch2_btree_bit_mod_buffered(trans, btree, pos, false); } int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 030291c..5fbea33 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -1844,8 +1844,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, __func__, buf1.buf, buf2.buf); printbuf_exit(&buf1); printbuf_exit(&buf2); - bch2_topology_error(c); - ret = -EIO; + ret = bch2_topology_error(c); goto err; } diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 7dca10b..c2f46b2 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -1053,7 +1053,8 @@ int bch2_trigger_extent(struct btree_trans *trans, (int) bch2_bkey_needs_rebalance(c, old); if (mod) { - int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new.k->p, mod > 0); + int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, + new.k->p, mod > 0); if (ret) return ret; } diff --git a/libbcachefs/dirent.c b/libbcachefs/dirent.c index 97773cf..b5ee11b 100644 --- a/libbcachefs/dirent.c +++ b/libbcachefs/dirent.c @@ -201,17 +201,17 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, } int bch2_dirent_create_snapshot(struct btree_trans *trans, - u64 dir, u32 snapshot, + u32 dir_subvol, u64 dir, u32 snapshot, const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, u64 *dir_offset, bch_str_hash_flags_t str_hash_flags) { - subvol_inum zero_inum = { 0 }; + subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir }; struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, zero_inum, type, name, dst_inum); + dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum); ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; @@ -220,7 +220,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, dirent->k.p.snapshot = snapshot; ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, - zero_inum, snapshot, + dir_inum, snapshot, &dirent->k_i, str_hash_flags, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); *dir_offset = dirent->k.p.offset; @@ -522,7 +522,7 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot) SPOS(dir, 0, snapshot), POS(dir, U64_MAX), 0, k, ret) if (k.k->type == KEY_TYPE_dirent) { - ret = -ENOTEMPTY; + ret = -BCH_ERR_ENOTEMPTY_dir_not_empty; break; } bch2_trans_iter_exit(trans, &iter); diff --git a/libbcachefs/dirent.h b/libbcachefs/dirent.h index f1dd720..34cb8e1 100644 --- a/libbcachefs/dirent.h +++ b/libbcachefs/dirent.h @@ -35,7 +35,7 @@ static inline unsigned dirent_val_u64s(unsigned len) int bch2_dirent_read_target(struct btree_trans *, subvol_inum, struct bkey_s_c_dirent, subvol_inum *); -int bch2_dirent_create_snapshot(struct btree_trans *, u64, u32, +int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, bch_str_hash_flags_t); diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index 3fd33b3..fe3fc14 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -109,6 +109,8 @@ x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ x(ENOENT, ENOENT_dev_not_found) \ x(ENOENT, ENOENT_dev_idx_not_found) \ + x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ + x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ x(0, open_buckets_empty) \ x(0, freelist_empty) \ x(BCH_ERR_freelist_empty, no_buckets_found) \ @@ -178,6 +180,7 @@ x(EINVAL, opt_parse_error) \ x(EINVAL, remove_with_metadata_missing_unimplemented)\ x(EINVAL, remove_would_lose_data) \ + x(EINVAL, btree_iter_with_journal_not_supported) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ @@ -227,7 +230,10 @@ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ x(EIO, btree_node_read_err) \ x(EIO, sb_not_downgraded) \ - x(EIO, btree_write_all_failed) \ + x(EIO, btree_node_write_all_failed) \ + x(EIO, btree_node_read_error) \ + x(EIO, btree_node_read_validate_error) \ + x(EIO, btree_need_topology_repair) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/libbcachefs/error.c b/libbcachefs/error.c index 70a1253..8ae95b2 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "error.h" +#include "recovery.h" #include "super.h" #include @@ -25,11 +26,16 @@ bool bch2_inconsistent_error(struct bch_fs *c) } } -void bch2_topology_error(struct bch_fs *c) +int bch2_topology_error(struct bch_fs *c) { set_bit(BCH_FS_topology_error, &c->flags); - if (!test_bit(BCH_FS_fsck_running, &c->flags)) + if (!test_bit(BCH_FS_fsck_running, &c->flags)) { bch2_inconsistent_error(c); + return -BCH_ERR_btree_need_topology_repair; + } else { + return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: + -BCH_ERR_btree_node_read_validate_error; + } } void bch2_fatal_error(struct bch_fs *c) diff --git a/libbcachefs/error.h b/libbcachefs/error.h index fec17d1..9449119 100644 --- a/libbcachefs/error.h +++ b/libbcachefs/error.h @@ -30,7 +30,7 @@ struct work_struct; bool bch2_inconsistent_error(struct bch_fs *); -void bch2_topology_error(struct bch_fs *); +int bch2_topology_error(struct bch_fs *); #define bch2_fs_inconsistent(c, ...) \ ({ \ diff --git a/libbcachefs/fs-common.c b/libbcachefs/fs-common.c index 523507e..3d43c03 100644 --- a/libbcachefs/fs-common.c +++ b/libbcachefs/fs-common.c @@ -107,6 +107,7 @@ int bch2_create_trans(struct btree_trans *trans, u32 new_subvol, dir_snapshot; ret = bch2_subvolume_create(trans, new_inode->bi_inum, + dir.subvol, snapshot_src.subvol, &new_subvol, &snapshot, (flags & BCH_CREATE_SNAPSHOT_RO) != 0); @@ -242,7 +243,7 @@ int bch2_unlink_trans(struct btree_trans *trans, struct bch_inode_unpacked *dir_u, struct bch_inode_unpacked *inode_u, const struct qstr *name, - bool deleting_snapshot) + bool deleting_subvol) { struct bch_fs *c = trans->c; struct btree_iter dir_iter = { NULL }; @@ -270,18 +271,25 @@ int bch2_unlink_trans(struct btree_trans *trans, if (ret) goto err; - if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) { + if (!deleting_subvol && S_ISDIR(inode_u->bi_mode)) { ret = bch2_empty_dir_trans(trans, inum); if (ret) goto err; } - if (deleting_snapshot && !inode_u->bi_subvol) { + if (deleting_subvol && !inode_u->bi_subvol) { ret = -BCH_ERR_ENOENT_not_subvol; goto err; } - if (deleting_snapshot || inode_u->bi_subvol) { + if (inode_u->bi_subvol) { + /* Recursive subvolume destroy not allowed (yet?) */ + ret = bch2_subvol_has_children(trans, inode_u->bi_subvol); + if (ret) + goto err; + } + + if (deleting_subvol || inode_u->bi_subvol) { ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol); if (ret) goto err; diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 77ea610..4445fa2 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -503,7 +503,7 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, bch2_subvol_is_ro(c, inode->ei_subvol) ?: __bch2_link(c, inode, dir, dentry); if (unlikely(ret)) - return ret; + return bch2_err_class(ret); ihold(&inode->v); d_instantiate(dentry, &inode->v); @@ -555,8 +555,9 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) struct bch_inode_info *dir= to_bch_ei(vdir); struct bch_fs *c = dir->v.i_sb->s_fs_info; - return bch2_subvol_is_ro(c, dir->ei_subvol) ?: + int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?: __bch2_unlink(vdir, dentry, false); + return bch2_err_class(ret); } static int bch2_symlink(struct mnt_idmap *idmap, @@ -591,7 +592,7 @@ static int bch2_symlink(struct mnt_idmap *idmap, return 0; err: iput(&inode->v); - return ret; + return bch2_err_class(ret); } static int bch2_mkdir(struct mnt_idmap *idmap, diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index e4a8a14..dfd5470 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -252,7 +252,7 @@ create_lostfound: goto err; ret = bch2_dirent_create_snapshot(trans, - root_inode.bi_inum, snapshot, &root_hash_info, + 0, root_inode.bi_inum, snapshot, &root_hash_info, mode_to_type(lostfound->bi_mode), &lostfound_str, lostfound->bi_inum, @@ -275,9 +275,24 @@ static int reattach_inode(struct btree_trans *trans, char name_buf[20]; struct qstr name; u64 dir_offset = 0; + u32 dirent_snapshot = inode_snapshot; int ret; - ret = lookup_lostfound(trans, inode_snapshot, &lostfound); + if (inode->bi_subvol) { + inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL; + + u64 root_inum; + ret = subvol_lookup(trans, inode->bi_parent_subvol, + &dirent_snapshot, &root_inum); + if (ret) + return ret; + + snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol); + } else { + snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); + } + + ret = lookup_lostfound(trans, dirent_snapshot, &lostfound); if (ret) return ret; @@ -291,14 +306,16 @@ static int reattach_inode(struct btree_trans *trans, dir_hash = bch2_hash_info_init(trans->c, &lostfound); - snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); name = (struct qstr) QSTR(name_buf); ret = bch2_dirent_create_snapshot(trans, - lostfound.bi_inum, inode_snapshot, + inode->bi_parent_subvol, lostfound.bi_inum, + dirent_snapshot, &dir_hash, inode_d_type(inode), - &name, inode->bi_inum, &dir_offset, + &name, + inode->bi_subvol ?: inode->bi_inum, + &dir_offset, BCH_HASH_SET_MUST_CREATE); if (ret) return ret; @@ -564,13 +581,12 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, } static struct inode_walker_entry * -lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, - u32 snapshot, bool is_whiteout) +lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k) { - struct inode_walker_entry *i; - - snapshot = bch2_snapshot_equiv(c, snapshot); + bool is_whiteout = k.k->type == KEY_TYPE_whiteout; + u32 snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); + struct inode_walker_entry *i; __darray_for_each(w->inodes, i) if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot)) goto found; @@ -581,20 +597,24 @@ found: if (snapshot != i->snapshot && !is_whiteout) { struct inode_walker_entry new = *i; - size_t pos; - int ret; new.snapshot = snapshot; new.count = 0; - bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u", - w->last_pos.inode, snapshot, i->snapshot); + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, k); + + bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" + "unexpected because we should always update the inode when we update a key in that inode\n" + "%s", + w->last_pos.inode, snapshot, i->snapshot, buf.buf); + printbuf_exit(&buf); while (i > w->inodes.data && i[-1].snapshot > snapshot) --i; - pos = i - w->inodes.data; - ret = darray_insert_item(&w->inodes, pos, new); + size_t pos = i - w->inodes.data; + int ret = darray_insert_item(&w->inodes, pos, new); if (ret) return ERR_PTR(ret); @@ -605,21 +625,21 @@ found: } static struct inode_walker_entry *walk_inode(struct btree_trans *trans, - struct inode_walker *w, struct bpos pos, - bool is_whiteout) + struct inode_walker *w, + struct bkey_s_c k) { - if (w->last_pos.inode != pos.inode) { - int ret = get_inodes_all_snapshots(trans, w, pos.inode); + if (w->last_pos.inode != k.k->p.inode) { + int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode); if (ret) return ERR_PTR(ret); - } else if (bkey_cmp(w->last_pos, pos)) { + } else if (bkey_cmp(w->last_pos, k.k->p)) { darray_for_each(w->inodes, i) i->seen_this_pos = false; } - w->last_pos = pos; + w->last_pos = k.k->p; - return lookup_inode_for_snapshot(trans->c, w, pos.snapshot, is_whiteout); + return lookup_inode_for_snapshot(trans->c, w, k); } static int __get_visible_inodes(struct btree_trans *trans, @@ -767,6 +787,43 @@ fsck_err: goto out; } +static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos pos) +{ + return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); +} + +static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode, + u32 *snapshot) +{ + if (inode->bi_subvol) { + u64 inum; + int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum); + if (ret) + return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) }); + } + + return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot)); +} + +static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, + struct bkey_s_c_dirent d) +{ + return inode->bi_dir == d.k->p.inode && + inode->bi_dir_offset == d.k->p.offset; +} + +static bool dirent_points_to_inode(struct bkey_s_c_dirent d, + struct bch_inode_unpacked *inode) +{ + return d.v->d_type == DT_SUBVOL + ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol + : le64_to_cpu(d.v->d_inum) == inode->bi_inum; +} + static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) { struct btree_iter iter; @@ -779,6 +836,49 @@ static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) return k.k->type == KEY_TYPE_set; } +static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k, + struct bch_inode_unpacked *inode, + u32 inode_snapshot, bool *write_inode) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + + struct btree_iter dirent_iter = {}; + struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot); + int ret = bkey_err(d); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(ret, + c, inode_points_to_missing_dirent, + "inode points to missing dirent\n%s", + (bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) || + fsck_err_on(!ret && !dirent_points_to_inode(d, inode), + c, inode_points_to_wrong_dirent, + "inode points to dirent that does not point back:\n%s", + (bch2_bkey_val_to_text(&buf, c, inode_k), + prt_newline(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + /* + * We just clear the backpointer fields for now. If we find a + * dirent that points to this inode in check_dirents(), we'll + * update it then; then when we get to check_path() if the + * backpointer is still 0 we'll reattach it. + */ + inode->bi_dir = 0; + inode->bi_dir_offset = 0; + inode->bi_flags &= ~BCH_INODE_backptr_untrusted; + *write_inode = true; + } + + ret = 0; +fsck_err: + bch2_trans_iter_exit(trans, &dirent_iter); + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} + static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -923,6 +1023,22 @@ static int check_inode(struct btree_trans *trans, do_update = true; } + if (u.bi_dir || u.bi_dir_offset) { + ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update); + if (ret) + goto err; + } + + if (fsck_err_on(u.bi_parent_subvol && + (u.bi_subvol == 0 || + u.bi_subvol == BCACHEFS_ROOT_SUBVOL), + c, inode_bi_parent_nonzero, + "inode %llu:%u has subvol %u but nonzero parent subvol %u", + u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) { + u.bi_parent_subvol = 0; + do_update = true; + } + if (u.bi_subvol) { struct bch_subvolume s; @@ -980,28 +1096,6 @@ int bch2_check_inodes(struct bch_fs *c) return ret; } -static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos pos) -{ - return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); -} - -static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, - struct bkey_s_c_dirent d) -{ - return inode->bi_dir == d.k->p.inode && - inode->bi_dir_offset == d.k->p.offset; -} - -static bool dirent_points_to_inode(struct bkey_s_c_dirent d, - struct bch_inode_unpacked *inode) -{ - return d.v->d_type == DT_SUBVOL - ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol - : le64_to_cpu(d.v->d_inum) == inode->bi_inum; -} - static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; @@ -1310,7 +1404,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, goto err; } - i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout); + i = walk_inode(trans, inode, k); ret = PTR_ERR_OR_ZERO(i); if (ret) goto err; @@ -1489,84 +1583,82 @@ fsck_err: return ret ?: trans_was_restarted(trans, restart_count); } -static int check_inode_backpointer(struct btree_trans *trans, +static int check_dirent_inode_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c_dirent d, struct bch_inode_unpacked *target, u32 target_snapshot) { struct bch_fs *c = trans->c; - struct btree_iter bp_iter = { NULL }; struct printbuf buf = PRINTBUF; int ret = 0; + if (inode_points_to_dirent(target, d)) + return 0; + if (!target->bi_dir && !target->bi_dir_offset) { target->bi_dir = d.k->p.inode; target->bi_dir_offset = d.k->p.offset; - - ret = __bch2_fsck_write_inode(trans, target, target_snapshot); - if (ret) - goto err; + return __bch2_fsck_write_inode(trans, target, target_snapshot); } - if (!inode_points_to_dirent(target, d)) { - struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, - SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot)); - ret = bkey_err(bp_dirent); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - bool backpointer_exists = !ret; - ret = 0; - - bch2_bkey_val_to_text(&buf, c, d.s_c); - prt_newline(&buf); - if (backpointer_exists) - bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); + struct btree_iter bp_iter = { NULL }; + struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, + SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot)); + ret = bkey_err(bp_dirent); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; - if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists, - c, inode_dir_multiple_links, - "directory %llu:%u with multiple links\n%s", - target->bi_inum, target_snapshot, buf.buf)) { - ret = __remove_dirent(trans, d.k->p); - goto out; - } + bool backpointer_exists = !ret; + ret = 0; + + if (fsck_err_on(!backpointer_exists, + c, inode_wrong_backpointer, + "inode %llu:%u has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + target->bi_inum, target_snapshot, + target->bi_dir, + target->bi_dir_offset, + d.k->p.inode, + d.k->p.offset)) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + ret = __bch2_fsck_write_inode(trans, target, target_snapshot); + goto out; + } - /* - * hardlinked file with nlink 0: - * We're just adjusting nlink here so check_nlinks() will pick - * it up, it ignores inodes with nlink 0 - */ - if (fsck_err_on(backpointer_exists && !target->bi_nlink, - c, inode_multiple_links_but_nlink_0, - "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", - target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { - target->bi_nlink++; - target->bi_flags &= ~BCH_INODE_unlinked; - - ret = __bch2_fsck_write_inode(trans, target, target_snapshot); - if (ret) - goto err; - } + bch2_bkey_val_to_text(&buf, c, d.s_c); + prt_newline(&buf); + if (backpointer_exists) + bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); + + if (fsck_err_on(backpointer_exists && + (S_ISDIR(target->bi_mode) || + target->bi_subvol), + c, inode_dir_multiple_links, + "%s %llu:%u with multiple links\n%s", + S_ISDIR(target->bi_mode) ? "directory" : "subvolume", + target->bi_inum, target_snapshot, buf.buf)) { + ret = __remove_dirent(trans, d.k->p); + goto out; + } - if (fsck_err_on(!backpointer_exists, - c, inode_wrong_backpointer, - "inode %llu:%u has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - target->bi_inum, target_snapshot, - target->bi_dir, - target->bi_dir_offset, - d.k->p.inode, - d.k->p.offset)) { - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - - ret = __bch2_fsck_write_inode(trans, target, target_snapshot); - if (ret) - goto err; - } + /* + * hardlinked file with nlink 0: + * We're just adjusting nlink here so check_nlinks() will pick + * it up, it ignores inodes with nlink 0 + */ + if (fsck_err_on(backpointer_exists && !target->bi_nlink, + c, inode_multiple_links_but_nlink_0, + "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", + target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { + target->bi_nlink++; + target->bi_flags &= ~BCH_INODE_unlinked; + ret = __bch2_fsck_write_inode(trans, target, target_snapshot); + if (ret) + goto err; } out: err: @@ -1588,7 +1680,7 @@ static int check_dirent_target(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret = 0; - ret = check_inode_backpointer(trans, iter, d, target, target_snapshot); + ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot); if (ret) goto err; @@ -1606,27 +1698,12 @@ static int check_dirent_target(struct btree_trans *trans, bkey_reassemble(&n->k_i, d.s_c); n->v.d_type = inode_d_type(target); - - ret = bch2_trans_update(trans, iter, &n->k_i, 0); - if (ret) - goto err; - - d = dirent_i_to_s_c(n); - } - - if (fsck_err_on(d.v->d_type == DT_SUBVOL && - target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol), - c, dirent_d_parent_subvol_wrong, - "dirent has wrong d_parent_subvol field: got %u, should be %u", - le32_to_cpu(d.v->d_parent_subvol), - target->bi_parent_subvol)) { - n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - bkey_reassemble(&n->k_i, d.s_c); - n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); + if (n->v.d_type == DT_SUBVOL) { + n->v.d_parent_subvol = target->bi_parent_subvol; + n->v.d_child_subvol = target->bi_subvol; + } else { + n->v.d_inum = target->bi_inum; + } ret = bch2_trans_update(trans, iter, &n->k_i, 0); if (ret) @@ -1641,45 +1718,113 @@ fsck_err: return ret; } -static int check_subvol_dirent(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c_dirent d) +/* find a subvolume that's a descendent of @snapshot: */ +static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_subvolume) + continue; + + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) { + bch2_trans_iter_exit(trans, &iter); + *subvolid = k.k->p.offset; + goto found; + } + } + if (!ret) + ret = -ENOENT; +found: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c_dirent d) { struct bch_fs *c = trans->c; + struct btree_iter subvol_iter = {}; struct bch_inode_unpacked subvol_root; + u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); - u32 target_snapshot; - u64 target_inum; + u32 parent_snapshot; + u64 parent_inum; + struct printbuf buf = PRINTBUF; int ret = 0; - ret = subvol_lookup(trans, target_subvol, - &target_snapshot, &target_inum); + ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum); if (ret && !bch2_err_matches(ret, ENOENT)) return ret; - if (fsck_err_on(ret, c, dirent_to_missing_subvol, - "dirent points to missing subvolume %u", - le32_to_cpu(d.v->d_child_subvol))) - return __remove_dirent(trans, d.k->p); + if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol, + "dirent parent_subvol points to missing subvolume\n%s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) || + fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot), + c, dirent_not_visible_in_parent_subvol, + "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s", + parent_snapshot, + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + u32 new_parent_subvol; + ret = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); + if (ret) + goto err; - ret = lookup_inode(trans, target_inum, - &subvol_root, &target_snapshot); + struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); + ret = PTR_ERR_OR_ZERO(new_dirent); + if (ret) + goto err; + + new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol); + } + + struct bkey_s_c_subvolume s = + bch2_bkey_get_iter_typed(trans, &subvol_iter, + BTREE_ID_subvolumes, POS(0, target_subvol), + 0, subvolume); + ret = bkey_err(s.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) return ret; - if (fsck_err_on(ret, c, subvol_to_missing_root, - "subvolume %u points to missing subvolume root %llu", - target_subvol, - target_inum)) { - bch_err(c, "repair not implemented yet"); - return -EINVAL; + if (ret) { + if (fsck_err(c, dirent_to_missing_subvol, + "dirent points to missing subvolume\n%s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) + return __remove_dirent(trans, d.k->p); + ret = 0; + goto out; } - if (fsck_err_on(subvol_root.bi_subvol != target_subvol, - c, subvol_root_wrong_bi_subvol, - "subvol root %llu has wrong bi_subvol field: got %u, should be %u", + if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol, + c, subvol_fs_path_parent_wrong, + "subvol with wrong fs_path_parent, should be be %u\n%s", + parent_subvol, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + struct bkey_i_subvolume *n = + bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + n->v.fs_path_parent = le32_to_cpu(parent_subvol); + } + + u64 target_inum = le64_to_cpu(s.v->inode); + u32 target_snapshot = le32_to_cpu(s.v->snapshot); + + ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(parent_subvol != subvol_root.bi_parent_subvol, + c, inode_bi_parent_wrong, + "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u", target_inum, - subvol_root.bi_subvol, target_subvol)) { - subvol_root.bi_subvol = target_subvol; + subvol_root.bi_parent_subvol, parent_subvol)) { + subvol_root.bi_parent_subvol = parent_subvol; ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot); if (ret) return ret; @@ -1689,7 +1834,11 @@ static int check_subvol_dirent(struct btree_trans *trans, struct btree_iter *ite target_snapshot); if (ret) return ret; +out: +err: fsck_err: + bch2_trans_iter_exit(trans, &subvol_iter); + printbuf_exit(&buf); return ret; } @@ -1731,7 +1880,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, BUG_ON(!btree_iter_path(trans, iter)->should_be_locked); - i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout); + i = walk_inode(trans, dir, k); ret = PTR_ERR_OR_ZERO(i); if (ret < 0) goto err; @@ -1777,7 +1926,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, d = bkey_s_c_to_dirent(k); if (d.v->d_type == DT_SUBVOL) { - ret = check_subvol_dirent(trans, iter, d); + ret = check_dirent_to_subvol(trans, iter, d); if (ret) goto err; } else { @@ -1858,7 +2007,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, if (ret) return ret; - i = walk_inode(trans, inode, k.k->p, k.k->type == KEY_TYPE_whiteout); + i = walk_inode(trans, inode, k); ret = PTR_ERR_OR_ZERO(i); if (ret) return ret; @@ -1997,62 +2146,52 @@ static int path_down(struct bch_fs *c, pathbuf *p, * * XXX: we should also be verifying that inodes are in the right subvolumes */ -static int check_path(struct btree_trans *trans, - pathbuf *p, - struct bch_inode_unpacked *inode, - u32 snapshot) +static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k) { struct bch_fs *c = trans->c; + struct btree_iter inode_iter = {}; + struct bch_inode_unpacked inode; + struct printbuf buf = PRINTBUF; + u32 snapshot = bch2_snapshot_equiv(c, inode_k.k->p.snapshot); int ret = 0; - snapshot = bch2_snapshot_equiv(c, snapshot); p->nr = 0; - while (!(inode->bi_inum == BCACHEFS_ROOT_INO && - inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) { + BUG_ON(bch2_inode_unpack(inode_k, &inode)); + + while (!(inode.bi_inum == BCACHEFS_ROOT_INO && + inode.bi_subvol == BCACHEFS_ROOT_SUBVOL)) { struct btree_iter dirent_iter; struct bkey_s_c_dirent d; u32 parent_snapshot = snapshot; - if (inode->bi_subvol) { - u64 inum; - - ret = subvol_lookup(trans, inode->bi_parent_subvol, - &parent_snapshot, &inum); - if (ret) - break; - } - - d = dirent_get_by_pos(trans, &dirent_iter, - SPOS(inode->bi_dir, inode->bi_dir_offset, - parent_snapshot)); + d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot); ret = bkey_err(d.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) break; - if (!ret && !dirent_points_to_inode(d, inode)) { + if (!ret && !dirent_points_to_inode(d, &inode)) { bch2_trans_iter_exit(trans, &dirent_iter); ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; } if (bch2_err_matches(ret, ENOENT)) { - if (fsck_err(c, inode_unreachable, - "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", - inode->bi_inum, snapshot, - bch2_d_type_str(inode_d_type(inode)), - inode->bi_nlink, - inode->bi_dir, - inode->bi_dir_offset)) - ret = reattach_inode(trans, inode, snapshot); - break; + ret = 0; + if (fsck_err(c, inode_unreachable, + "unreachable inode\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, inode_k), + buf.buf))) + ret = reattach_inode(trans, &inode, snapshot); + goto out; } bch2_trans_iter_exit(trans, &dirent_iter); - if (!S_ISDIR(inode->bi_mode)) + if (!S_ISDIR(inode.bi_mode)) break; - ret = path_down(c, p, inode->bi_inum, snapshot); + ret = path_down(c, p, inode.bi_inum, snapshot); if (ret) { bch_err(c, "memory allocation failure"); return ret; @@ -2060,7 +2199,12 @@ static int check_path(struct btree_trans *trans, snapshot = parent_snapshot; - ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); + bch2_trans_iter_exit(trans, &inode_iter); + inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, + SPOS(0, inode.bi_dir, snapshot), 0); + ret = bkey_err(inode_k) ?: + !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode + : bch2_inode_unpack(inode_k, &inode); if (ret) { /* Should have been caught in dirents pass */ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -2068,30 +2212,35 @@ static int check_path(struct btree_trans *trans, break; } - if (path_is_dup(p, inode->bi_inum, snapshot)) { + snapshot = inode_k.k->p.snapshot; + + if (path_is_dup(p, inode.bi_inum, snapshot)) { /* XXX print path */ bch_err(c, "directory structure loop"); darray_for_each(*p, i) pr_err("%llu:%u", i->inum, i->snapshot); - pr_err("%llu:%u", inode->bi_inum, snapshot); + pr_err("%llu:%u", inode.bi_inum, snapshot); if (!fsck_err(c, dir_loop, "directory structure loop")) return 0; - ret = remove_backpointer(trans, inode); + ret = remove_backpointer(trans, &inode); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) bch_err_msg(c, ret, "removing dirent"); if (ret) break; - ret = reattach_inode(trans, inode, snapshot); + ret = reattach_inode(trans, &inode, snapshot); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_msg(c, ret, "reattaching inode %llu", inode->bi_inum); + bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); break; } } +out: fsck_err: + bch2_trans_iter_exit(trans, &inode_iter); + printbuf_exit(&buf); bch_err_fn(c, ret); return ret; } @@ -2103,7 +2252,6 @@ fsck_err: */ int bch2_check_directory_structure(struct bch_fs *c) { - struct bch_inode_unpacked u; pathbuf path = { 0, }; int ret; @@ -2116,12 +2264,10 @@ int bch2_check_directory_structure(struct bch_fs *c) if (!bkey_is_inode(k.k)) continue; - BUG_ON(bch2_inode_unpack(k, &u)); - - if (u.bi_flags & BCH_INODE_unlinked) + if (bch2_inode_flags(k) & BCH_INODE_unlinked) continue; - check_path(trans, &path, &u, iter.pos.snapshot); + check_path(trans, &path, k); }))); darray_exit(&path); diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index dbe37cc..414aebe 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -620,7 +620,8 @@ int bch2_trigger_inode(struct btree_trans *trans, bool old_deleted = bkey_is_deleted_inode(old); bool new_deleted = bkey_is_deleted_inode(new.s_c); if (old_deleted != new_deleted) { - int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new.k->p, new_deleted); + int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, + new.k->p, new_deleted); if (ret) return ret; } @@ -1169,7 +1170,7 @@ fsck_err: bch2_trans_iter_exit(trans, &inode_iter); return ret; delete: - ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false); + ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false); goto out; } diff --git a/libbcachefs/inode.h b/libbcachefs/inode.h index 9a9353c..0562980 100644 --- a/libbcachefs/inode.h +++ b/libbcachefs/inode.h @@ -177,6 +177,20 @@ static inline u8 inode_d_type(struct bch_inode_unpacked *inode) return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode); } +static inline u32 bch2_inode_flags(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_inode: + return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags); + case KEY_TYPE_inode_v2: + return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags); + case KEY_TYPE_inode_v3: + return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags); + default: + return 0; + } +} + /* i_nlink: */ static inline unsigned nlink_bias(umode_t mode) diff --git a/libbcachefs/lru.c b/libbcachefs/lru.c index 7a4ca5a..ed7577c 100644 --- a/libbcachefs/lru.c +++ b/libbcachefs/lru.c @@ -44,8 +44,8 @@ static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time, bool set) { return time - ? bch2_btree_bit_mod(trans, BTREE_ID_lru, - lru_pos(lru_id, dev_bucket, time), set) + ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, + lru_pos(lru_id, dev_bucket, time), set) : 0; } diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 9a4b7fa..f8c2341 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -332,6 +332,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Run fsck on mount") \ + x(fsck_memory_usage_percent, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_UINT(20, 70), \ + BCH2_NO_SB_OPT, 50, \ + NULL, "Maximum percentage of system ram fsck is allowed to pin")\ x(fix_errors, u8, \ OPT_FS|OPT_MOUNT, \ OPT_FN(bch2_opt_fix_errors), \ diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 9127d0e..4f8782d 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -264,7 +264,7 @@ static int journal_replay_entry_early(struct bch_fs *c, bkey_copy(&r->key, (struct bkey_i *) entry->start); r->error = 0; } else { - r->error = -EIO; + r->error = -BCH_ERR_btree_node_read_error; } r->alive = true; break; diff --git a/libbcachefs/recovery_types.h b/libbcachefs/recovery_types.h index fa0c8ef..f0fc1db 100644 --- a/libbcachefs/recovery_types.h +++ b/libbcachefs/recovery_types.h @@ -34,6 +34,7 @@ x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ + x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ x(fs_upgrade_for_subvolumes, 22, 0) \ x(resume_logged_ops, 23, PASS_ALWAYS) \ diff --git a/libbcachefs/sb-downgrade.c b/libbcachefs/sb-downgrade.c index 626eaae..3337419 100644 --- a/libbcachefs/sb-downgrade.c +++ b/libbcachefs/sb-downgrade.c @@ -46,7 +46,13 @@ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \ x(rebalance_work, \ - BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) + BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) \ + x(subvolume_fs_parent, \ + BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \ + BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \ + x(btree_subvolume_children, \ + BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \ + BCH_FSCK_ERR_subvol_children_not_set) #define DOWNGRADE_TABLE() diff --git a/libbcachefs/sb-errors_types.h b/libbcachefs/sb-errors_types.h index 63f18c7..1530bd3 100644 --- a/libbcachefs/sb-errors_types.h +++ b/libbcachefs/sb-errors_types.h @@ -231,7 +231,7 @@ x(dirent_name_dot_or_dotdot, 223) \ x(dirent_name_has_slash, 224) \ x(dirent_d_type_wrong, 225) \ - x(dirent_d_parent_subvol_wrong, 226) \ + x(inode_bi_parent_wrong, 226) \ x(dirent_in_missing_dir_inode, 227) \ x(dirent_in_non_dir_inode, 228) \ x(dirent_to_missing_inode, 229) \ @@ -253,7 +253,16 @@ x(reflink_p_front_pad_bad, 245) \ x(journal_entry_dup_same_device, 246) \ x(inode_bi_subvol_missing, 247) \ - x(inode_bi_subvol_wrong, 248) + x(inode_bi_subvol_wrong, 248) \ + x(inode_points_to_missing_dirent, 249) \ + x(inode_points_to_wrong_dirent, 250) \ + x(inode_bi_parent_nonzero, 251) \ + x(dirent_to_missing_parent_subvol, 252) \ + x(dirent_not_visible_in_parent_subvol, 253) \ + x(subvol_fs_path_parent_wrong, 254) \ + x(subvol_root_fs_path_parent_nonzero, 255) \ + x(subvol_children_not_set, 256) \ + x(subvol_children_bad, 257) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, diff --git a/libbcachefs/subvolume.c b/libbcachefs/subvolume.c index e7ee52c..ce7aed1 100644 --- a/libbcachefs/subvolume.c +++ b/libbcachefs/subvolume.c @@ -13,13 +13,26 @@ static int bch2_subvolume_delete(struct btree_trans *, u32); +static struct bpos subvolume_children_pos(struct bkey_s_c k) +{ + if (k.k->type != KEY_TYPE_subvolume) + return POS_MIN; + + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + if (!s.v->fs_path_parent) + return POS_MIN; + return POS(le32_to_cpu(s.v->fs_path_parent), s.k->p.offset); +} + static int check_subvol(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { struct bch_fs *c = trans->c; struct bkey_s_c_subvolume subvol; + struct btree_iter subvol_children_iter = {}; struct bch_snapshot snapshot; + struct printbuf buf = PRINTBUF; unsigned snapid; int ret = 0; @@ -42,6 +55,42 @@ static int check_subvol(struct btree_trans *trans, return ret ?: -BCH_ERR_transaction_restart_nested; } + if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL && + subvol.v->fs_path_parent, + c, subvol_root_fs_path_parent_nonzero, + "root subvolume has nonzero fs_path_parent\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + struct bkey_i_subvolume *n = + bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + n->v.fs_path_parent = 0; + } + + if (subvol.v->fs_path_parent) { + struct bpos pos = subvolume_children_pos(k); + + struct bkey_s_c subvol_children_k = + bch2_bkey_get_iter(trans, &subvol_children_iter, + BTREE_ID_subvolume_children, pos, 0); + ret = bkey_err(subvol_children_k); + if (ret) + goto err; + + if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set, + c, subvol_children_not_set, + "subvolume not set in subvolume_children btree at %llu:%llu\n%s", + pos.inode, pos.offset, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, true); + if (ret) + goto err; + } + } + struct bch_inode_unpacked inode; struct btree_iter inode_iter = {}; ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode, @@ -102,9 +151,10 @@ static int check_subvol(struct btree_trans *trans, SET_BCH_SUBVOLUME_SNAP(&s->v, true); } } - err: fsck_err: + bch2_trans_iter_exit(trans, &subvol_children_iter); + printbuf_exit(&buf); return ret; } @@ -119,6 +169,42 @@ int bch2_check_subvols(struct bch_fs *c) return ret; } +static int check_subvol_child(struct btree_trans *trans, + struct btree_iter *child_iter, + struct bkey_s_c child_k) +{ + struct bch_fs *c = trans->c; + struct bch_subvolume s; + int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, child_k.k->p.offset), + 0, subvolume, &s); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(ret || + le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode, + c, subvol_children_bad, + "incorrect entry in subvolume_children btree %llu:%llu", + child_k.k->p.inode, child_k.k->p.offset)) { + ret = bch2_btree_delete_at(trans, child_iter, 0); + if (ret) + goto err; + } +err: +fsck_err: + return ret; +} + +int bch2_check_subvol_children(struct bch_fs *c) +{ + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_subvol_child(trans, &iter, k))); + bch_err_fn(c, ret); + return 0; +} + /* Subvolumes: */ int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k, @@ -143,8 +229,50 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, le64_to_cpu(s.v->inode), le32_to_cpu(s.v->snapshot)); - if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, parent)) - prt_printf(out, " parent %u", le32_to_cpu(s.v->parent)); + if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) { + prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent)); + prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent)); + } +} + +static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set) +{ + return !bpos_eq(pos, POS_MIN) + ? bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, set) + : 0; +} + +int bch2_subvolume_trigger(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) +{ + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + struct bpos children_pos_old = subvolume_children_pos(old); + struct bpos children_pos_new = subvolume_children_pos(new.s_c); + + if (!bpos_eq(children_pos_old, children_pos_new)) { + int ret = subvolume_children_mod(trans, children_pos_old, false) ?: + subvolume_children_mod(trans, children_pos_new, true); + if (ret) + return ret; + } + } + + return 0; +} + +int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol) +{ + struct btree_iter iter; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0); + struct bkey_s_c k = bch2_btree_iter_peek(&iter); + bch2_trans_iter_exit(trans, &iter); + + return bkey_err(k) ?: k.k && k.k->p.inode == subvol + ? -BCH_ERR_ENOTEMPTY_subvol_not_empty + : 0; } static __always_inline int @@ -228,8 +356,8 @@ static int bch2_subvolume_reparent(struct btree_trans *trans, if (k.k->type != KEY_TYPE_subvolume) return 0; - if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, parent) && - le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent) + if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, creation_parent) && + le32_to_cpu(bkey_s_c_to_subvolume(k).v->creation_parent) != old_parent) return 0; s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume); @@ -237,7 +365,7 @@ static int bch2_subvolume_reparent(struct btree_trans *trans, if (ret) return ret; - s->v.parent = cpu_to_le32(new_parent); + s->v.creation_parent = cpu_to_le32(new_parent); return 0; } @@ -260,7 +388,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_subvolume_reparent(trans, &iter, k, - subvolid_to_delete, le32_to_cpu(s.parent))); + subvolid_to_delete, le32_to_cpu(s.creation_parent))); } /* @@ -391,6 +519,7 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) } int bch2_subvolume_create(struct btree_trans *trans, u64 inode, + u32 parent_subvolid, u32 src_subvolid, u32 *new_subvolid, u32 *new_snapshotid, @@ -447,12 +576,13 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, if (ret) goto err; - new_subvol->v.flags = 0; - new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); - new_subvol->v.inode = cpu_to_le64(inode); - new_subvol->v.parent = cpu_to_le32(src_subvolid); - new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c)); - new_subvol->v.otime.hi = 0; + new_subvol->v.flags = 0; + new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); + new_subvol->v.inode = cpu_to_le64(inode); + new_subvol->v.creation_parent = cpu_to_le32(src_subvolid); + new_subvol->v.fs_path_parent = cpu_to_le32(parent_subvolid); + new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c)); + new_subvol->v.otime.hi = 0; SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); diff --git a/libbcachefs/subvolume.h b/libbcachefs/subvolume.h index 3ca1d18..4045a18 100644 --- a/libbcachefs/subvolume.h +++ b/libbcachefs/subvolume.h @@ -7,17 +7,22 @@ enum bkey_invalid_flags; int bch2_check_subvols(struct bch_fs *); +int bch2_check_subvol_children(struct bch_fs *); int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ .key_invalid = bch2_subvolume_invalid, \ .val_to_text = bch2_subvolume_to_text, \ + .trigger = bch2_subvolume_trigger, \ .min_val_size = 16, \ }) +int bch2_subvol_has_children(struct btree_trans *, u32); int bch2_subvolume_get(struct btree_trans *, unsigned, bool, int, struct bch_subvolume *); int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); @@ -29,8 +34,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *); void bch2_delete_dead_snapshots_async(struct bch_fs *); int bch2_subvolume_unlink(struct btree_trans *, u32); -int bch2_subvolume_create(struct btree_trans *, u64, u32, - u32 *, u32 *, bool); +int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool); int bch2_fs_subvolumes_init(struct bch_fs *); diff --git a/libbcachefs/subvolume_format.h b/libbcachefs/subvolume_format.h index af79134..e029df7 100644 --- a/libbcachefs/subvolume_format.h +++ b/libbcachefs/subvolume_format.h @@ -19,8 +19,8 @@ struct bch_subvolume { * This is _not_ necessarily the subvolume of the directory containing * this subvolume: */ - __le32 parent; - __le32 pad; + __le32 creation_parent; + __le32 fs_path_parent; bch_le128 otime; }; diff --git a/linux/mean_and_variance.c b/linux/mean_and_variance.c index b93d150..21ec6af 100644 --- a/linux/mean_and_variance.c +++ b/linux/mean_and_variance.c @@ -102,6 +102,8 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev); * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update() * @s: mean and variance number of samples and their sums * @x: new value to include in the &mean_and_variance_weighted + * @initted: caller must track whether this is the first use or not + * @weight: ewma weight * * see linked pdf: function derived from equations 140-143 where alpha = 2^w. * values are stored bitshifted for performance and added precision. @@ -132,6 +134,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); /** * mean_and_variance_weighted_get_mean() - get mean from @s * @s: mean and variance number of samples and their sums + * @weight: ewma weight */ s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s, u8 weight) @@ -143,6 +146,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); /** * mean_and_variance_weighted_get_variance() -- get variance from @s * @s: mean and variance number of samples and their sums + * @weight: ewma weight */ u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s, u8 weight) @@ -155,6 +159,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance); /** * mean_and_variance_weighted_get_stddev() - get standard deviation from @s * @s: mean and variance number of samples and their sums + * @weight: ewma weight */ u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s, u8 weight)