From: Kent Overstreet Date: Sat, 23 Dec 2023 02:39:45 +0000 (-0500) Subject: Update bcachefs sources to 72740a707b64 bcachefs: Split brain detection X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=f0334bbc1032b3be485767ec804ec868a8b1e7f0;p=bcachefs-tools-debian Update bcachefs sources to 72740a707b64 bcachefs: Split brain detection Signed-off-by: Kent Overstreet --- diff --git a/.bcachefs_revision b/.bcachefs_revision index 9e8cc23..e9e23c9 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -c6d45169c6e3b4e42a189c9e87d1d14070033f01 +72740a707b64a4fb5f2bb559d8db27a66abc97cc diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index c6b0950..2105198 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -1301,6 +1301,7 @@ struct bch_member { __le64 errors[BCH_MEMBER_ERROR_NR]; __le64 errors_at_reset[BCH_MEMBER_ERROR_NR]; __le64 errors_reset_time; + __le64 seq; }; #define BCH_MEMBER_V1_BYTES 56 @@ -1704,7 +1705,9 @@ LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64); x(deleted_inodes, BCH_VERSION(1, 2), \ BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) \ x(rebalance_work, BCH_VERSION(1, 3), \ - BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) + BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) \ + x(member_seq, BCH_VERSION(1, 4), \ + 0) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -1770,7 +1773,8 @@ struct bch_sb { __le32 time_base_hi; __le32 time_precision; - __le64 flags[8]; + __le64 flags[7]; + __le64 write_time; __le64 features[2]; __le64 compat[2]; diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index d9abf90..93a548c 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -2013,29 +2013,6 @@ do_write: /* buffer must be a multiple of the block size */ bytes = round_up(bytes, block_bytes(c)); - if (bytes > btree_bytes(c)) { - struct printbuf buf = PRINTBUF; - - prt_printf(&buf, "btree node write bounce buffer overrun: %u > %zu\n", - bytes, btree_bytes(c)); - - prt_printf(&buf, "header: %zu\n", b->written - ? sizeof(struct btree_node) - : sizeof(struct btree_node_entry)); - prt_printf(&buf, "unwritten: %zu\n", b->whiteout_u64s * sizeof(u64)); - - for_each_bset(b, t) { - i = bset(b, t); - - if (bset_written(b, i)) - continue; - prt_printf(&buf, "bset %zu: %zu\n", t - b->set, le16_to_cpu(i->u64s) * sizeof(u64)); - } - - panic("%s", buf.buf); - printbuf_exit(&buf); - } - data = btree_bounce_alloc(c, bytes, &used_mempool); if (!b->written) { diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 47d203c..029e2c1 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -1209,7 +1209,6 @@ static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_i bool intent) { btree_path_idx_t new = btree_path_alloc(trans, src); - btree_path_copy(trans, trans->paths + new, trans->paths + src); __btree_path_get(trans->paths + new, intent); return new; @@ -1512,42 +1511,50 @@ int __bch2_btree_trans_too_many_iters(struct btree_trans *trans) return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters); } +static noinline void btree_path_overflow(struct btree_trans *trans) +{ + bch2_dump_trans_paths_updates(trans); + bch_err(trans->c, "trans path overflow"); +} + static noinline void btree_paths_realloc(struct btree_trans *trans) { unsigned nr = trans->nr_paths * 2; void *p = kzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) + - nr + 8 + sizeof(struct btree_trans_paths) + nr * sizeof(struct btree_path) + + nr * sizeof(btree_path_idx_t) + 8 + nr * sizeof(struct btree_insert_entry), GFP_KERNEL|__GFP_NOFAIL); unsigned long *paths_allocated = p; + memcpy(paths_allocated, trans->paths_allocated, BITS_TO_LONGS(trans->nr_paths) * sizeof(unsigned long)); p += BITS_TO_LONGS(nr) * sizeof(unsigned long); + + p += sizeof(struct btree_trans_paths); struct btree_path *paths = p; + *trans_paths_nr(paths) = nr; + memcpy(paths, trans->paths, trans->nr_paths * sizeof(struct btree_path)); p += nr * sizeof(struct btree_path); - u8 *sorted = p; - p += nr + 8; - struct btree_insert_entry *updates = p; - *trans_paths_nr(paths) = nr; + btree_path_idx_t *sorted = p; + memcpy(sorted, trans->sorted, trans->nr_sorted * sizeof(btree_path_idx_t)); + p += nr * sizeof(btree_path_idx_t) + 8; - memcpy(paths_allocated, trans->paths_allocated, BITS_TO_LONGS(trans->nr_paths) * sizeof(unsigned long)); - memcpy(sorted, trans->sorted, trans->nr_sorted); - memcpy(paths, trans->paths, trans->nr_paths * sizeof(struct btree_path)); - memcpy(updates, trans->updates, trans->nr_paths * sizeof(struct btree_path)); + struct btree_insert_entry *updates = p; + memcpy(updates, trans->updates, trans->nr_paths * sizeof(struct btree_insert_entry)); unsigned long *old = trans->paths_allocated; rcu_assign_pointer(trans->paths_allocated, paths_allocated); - rcu_assign_pointer(trans->sorted, sorted); rcu_assign_pointer(trans->paths, paths); + rcu_assign_pointer(trans->sorted, sorted); rcu_assign_pointer(trans->updates, updates); trans->nr_paths = nr; if (old != trans->_paths_allocated) - kfree_rcu_mightsleep(trans->paths_allocated); + kfree_rcu_mightsleep(old); } static inline btree_path_idx_t btree_path_alloc(struct btree_trans *trans, @@ -1555,8 +1562,14 @@ static inline btree_path_idx_t btree_path_alloc(struct btree_trans *trans, { btree_path_idx_t idx = find_first_zero_bit(trans->paths_allocated, trans->nr_paths); - if (unlikely(idx == trans->nr_paths)) + if (unlikely(idx == trans->nr_paths)) { + if (trans->nr_paths == BTREE_ITER_MAX) { + btree_path_overflow(trans); + return 0; + } + btree_paths_realloc(trans); + } /* * Do this before marking the new path as allocated, since it won't be @@ -2640,21 +2653,18 @@ out: static inline void btree_path_list_remove(struct btree_trans *trans, struct btree_path *path) { - unsigned i; - EBUG_ON(path->sorted_idx >= trans->nr_sorted); #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS trans->nr_sorted--; memmove_u64s_down_small(trans->sorted + path->sorted_idx, trans->sorted + path->sorted_idx + 1, - DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); + DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, + sizeof(u64) / sizeof(btree_path_idx_t))); #else array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx); #endif - for (i = path->sorted_idx; i < trans->nr_sorted; i++) + for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++) trans->paths[trans->sorted[i]].sorted_idx = i; - - path->sorted_idx = U8_MAX; } static inline void btree_path_list_add(struct btree_trans *trans, @@ -2662,21 +2672,21 @@ static inline void btree_path_list_add(struct btree_trans *trans, btree_path_idx_t path_idx) { struct btree_path *path = trans->paths + path_idx; - unsigned i; path->sorted_idx = pos ? trans->paths[pos].sorted_idx + 1 : trans->nr_sorted; #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1, trans->sorted + path->sorted_idx, - DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); + DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, + sizeof(u64) / sizeof(btree_path_idx_t))); trans->nr_sorted++; trans->sorted[path->sorted_idx] = path_idx; #else array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path_idx); #endif - for (i = path->sorted_idx; i < trans->nr_sorted; i++) + for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++) trans->paths[trans->sorted[i]].sorted_idx = i; btree_trans_verify_sorted_refs(trans); @@ -2972,7 +2982,7 @@ got_trans: trans->paths = trans->_paths; trans->updates = trans->_updates; - *trans_paths_nr(trans->paths) = BTREE_ITER_MAX; + *trans_paths_nr(trans->paths) = BTREE_ITER_INITIAL; trans->paths_allocated[0] = 1; diff --git a/libbcachefs/btree_iter.h b/libbcachefs/btree_iter.h index 573c44d..da2b74f 100644 --- a/libbcachefs/btree_iter.h +++ b/libbcachefs/btree_iter.h @@ -642,7 +642,7 @@ int __bch2_btree_trans_too_many_iters(struct btree_trans *); static inline int btree_trans_too_many_iters(struct btree_trans *trans) { - if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_MAX - 8) + if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_INITIAL - 8) return __bch2_btree_trans_too_many_iters(trans); return 0; diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index e4ebfc2..d530307 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -358,7 +358,8 @@ struct btree_insert_entry { unsigned long ip_allocated; }; -#define BTREE_ITER_MAX 64 +#define BTREE_ITER_INITIAL 64 +#define BTREE_ITER_MAX (1U << 10) struct btree_trans_commit_hook; typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *); @@ -382,7 +383,7 @@ struct btree_trans { unsigned long *paths_allocated; struct btree_path *paths; - u8 *sorted; + btree_path_idx_t *sorted; struct btree_insert_entry *updates; void *mem; @@ -438,11 +439,11 @@ struct btree_trans { struct list_head list; struct closure ref; - unsigned long _paths_allocated[BITS_TO_LONGS(BTREE_ITER_MAX)]; + unsigned long _paths_allocated[BITS_TO_LONGS(BTREE_ITER_INITIAL)]; struct btree_trans_paths trans_paths; - struct btree_path _paths[BTREE_ITER_MAX]; - u8 _sorted[BTREE_ITER_MAX + 8]; - struct btree_insert_entry _updates[BTREE_ITER_MAX]; + struct btree_path _paths[BTREE_ITER_INITIAL]; + btree_path_idx_t _sorted[BTREE_ITER_INITIAL + 4]; + struct btree_insert_entry _updates[BTREE_ITER_INITIAL]; }; static inline struct btree_path *btree_iter_path(struct btree_trans *trans, struct btree_iter *iter) diff --git a/libbcachefs/errcode.h b/libbcachefs/errcode.h index c817afb..08a8ee2 100644 --- a/libbcachefs/errcode.h +++ b/libbcachefs/errcode.h @@ -169,6 +169,7 @@ x(EINVAL, device_size_too_small) \ x(EINVAL, device_not_a_member_of_filesystem) \ x(EINVAL, device_has_been_removed) \ + x(EINVAL, device_splitbrain) \ x(EINVAL, device_already_online) \ x(EINVAL, insufficient_devices_to_start) \ x(EINVAL, invalid) \ @@ -220,6 +221,7 @@ x(BCH_ERR_invalid, invalid_bkey) \ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ x(EIO, btree_node_read_err) \ + x(EIO, sb_not_downgraded) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/libbcachefs/extent_update.c b/libbcachefs/extent_update.c index 21af6fb..b9033bb 100644 --- a/libbcachefs/extent_update.c +++ b/libbcachefs/extent_update.c @@ -100,7 +100,7 @@ static int count_iters_for_insert(struct btree_trans *trans, return ret2 ?: ret; } -#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) +#define EXTENT_ITERS_MAX (BTREE_ITER_INITIAL / 3) int bch2_extent_atomic_end(struct btree_trans *trans, struct btree_iter *iter, diff --git a/libbcachefs/io_misc.c b/libbcachefs/io_misc.c index 197ff7a..ca6d5f5 100644 --- a/libbcachefs/io_misc.c +++ b/libbcachefs/io_misc.c @@ -58,7 +58,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, */ ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); if (unlikely(ret)) - goto err; + goto err_noprint; bch2_bkey_buf_reassemble(&old, c, k); @@ -118,7 +118,7 @@ err: inum.inum, iter->pos.offset << 9, "%s(): error: %s", __func__, bch2_err_str(ret)); - +err_noprint: bch2_open_buckets_put(c, &open_buckets); bch2_disk_reservation_put(c, &disk_res); bch2_bkey_buf_exit(&new, c); diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 62ec1df..69eec4e 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -1090,6 +1090,8 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; + c->recovery_pass_done = ARRAY_SIZE(recovery_pass_fns) - 1; + if (enabled_qtypes(c)) { ret = bch2_fs_quota_read(c); if (ret) diff --git a/libbcachefs/sb-members.c b/libbcachefs/sb-members.c index 7c5db66..4c19a80 100644 --- a/libbcachefs/sb-members.c +++ b/libbcachefs/sb-members.c @@ -235,6 +235,11 @@ static void member_to_text(struct printbuf *out, prt_printf(out, "(never)"); prt_newline(out); + prt_printf(out, "Last superblock write:"); + prt_tab(out); + prt_u64(out, le64_to_cpu(m.seq)); + prt_newline(out); + prt_printf(out, "State:"); prt_tab(out); prt_printf(out, "%s", diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 326faf2..134f2c2 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -530,6 +530,7 @@ static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) dst->time_base_lo = src->time_base_lo; dst->time_base_hi = src->time_base_hi; dst->time_precision = src->time_precision; + dst->write_time = src->write_time; memcpy(dst->flags, src->flags, sizeof(dst->flags)); memcpy(dst->features, src->features, sizeof(dst->features)); @@ -906,8 +907,25 @@ int bch2_write_super(struct bch_fs *c) c->disk_sb.sb->magic = BCHFS_MAGIC; c->disk_sb.sb->layout.magic = BCHFS_MAGIC; + if (le16_to_cpu(c->disk_sb.sb->version) > bcachefs_metadata_version_current) { + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "attempting to write superblock that wasn't version downgraded ("); + bch2_version_to_text(&buf, le16_to_cpu(c->disk_sb.sb->version)); + prt_str(&buf, " > "); + bch2_version_to_text(&buf, bcachefs_metadata_version_current); + prt_str(&buf, ")"); + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); + return -BCH_ERR_sb_not_downgraded; + } + le64_add_cpu(&c->disk_sb.sb->seq, 1); + struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); + for_each_online_member(c, ca) + __bch2_members_v2_get_mut(mi, ca->dev_idx)->seq = c->disk_sb.sb->seq; + c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds()); + if (test_bit(BCH_FS_error, &c->flags)) SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); if (test_bit(BCH_FS_topology_error, &c->flags)) @@ -1210,6 +1228,11 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, prt_printf(out, "%llu", le64_to_cpu(sb->seq)); prt_newline(out); + prt_printf(out, "Time of last write:"); + prt_tab(out); + bch2_prt_datetime(out, le64_to_cpu(sb->write_time)); + prt_newline(out); + prt_printf(out, "Superblock size:"); prt_tab(out); prt_printf(out, "%zu", vstruct_bytes(sb)); diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 632d717..e7b81db 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -1066,20 +1066,65 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) return 0; } -static int bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) +static int bch2_dev_in_fs(struct bch_sb_handle *fs, + struct bch_sb_handle *sb) { - struct bch_sb *newest = - le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb; + if (fs == sb) + return 0; - if (!uuid_equal(&fs->uuid, &sb->uuid)) + if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid)) return -BCH_ERR_device_not_a_member_of_filesystem; - if (!bch2_dev_exists(newest, sb->dev_idx)) + if (!bch2_dev_exists(fs->sb, sb->sb->dev_idx)) return -BCH_ERR_device_has_been_removed; - if (fs->block_size != sb->block_size) + if (fs->sb->block_size != sb->sb->block_size) return -BCH_ERR_mismatched_block_size; + if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq || + le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq) + return 0; + + if (fs->sb->seq == sb->sb->seq && + fs->sb->write_time != sb->sb->write_time) { + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "Split brain detected between %pg and %pg:", + sb->bdev, fs->bdev); + prt_newline(&buf); + prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq)); + prt_newline(&buf); + + prt_printf(&buf, "%pg ", fs->bdev); + bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));; + prt_newline(&buf); + + prt_printf(&buf, "%pg ", sb->bdev); + bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));; + prt_newline(&buf); + + prt_printf(&buf, "Not using older sb"); + + pr_err("%s", buf.buf); + printbuf_exit(&buf); + return -BCH_ERR_device_splitbrain; + } + + struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx); + u64 seq_from_fs = le64_to_cpu(m.seq); + u64 seq_from_member = le64_to_cpu(sb->sb->seq); + + if (seq_from_fs && seq_from_fs < seq_from_member) { + pr_err("Split brain detected between %pg and %pg:\n" + "%pg believes seq of %pg to be %llu, but %pg has %llu\n" + "Not using %pg", + sb->bdev, fs->bdev, + fs->bdev, sb->bdev, seq_from_fs, + sb->bdev, seq_from_member, + sb->bdev); + return -BCH_ERR_device_splitbrain; + } + return 0; } @@ -1773,7 +1818,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) dev_idx = sb.sb->dev_idx; - ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); + ret = bch2_dev_in_fs(&c->disk_sb, &sb); bch_err_msg(c, ret, "bringing %s online", path); if (ret) goto err; @@ -1914,6 +1959,12 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) /* Filesystem open: */ +static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) +{ + return cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?: + cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time)); +} + struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, struct bch_opts opts) { @@ -1946,19 +1997,21 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, } darray_for_each(sbs, sb) - if (!best || le64_to_cpu(sb->sb->seq) > le64_to_cpu(best->sb->seq)) + if (!best || sb_cmp(sb->sb, best->sb) > 0) best = sb; darray_for_each_reverse(sbs, sb) { - if (sb != best && !bch2_dev_exists(best->sb, sb->sb->dev_idx)) { - pr_info("%pg has been removed, skipping", sb->bdev); + ret = bch2_dev_in_fs(best, sb); + + if (ret == -BCH_ERR_device_has_been_removed || + ret == -BCH_ERR_device_splitbrain) { bch2_free_super(sb); darray_remove_item(&sbs, sb); best -= best > sb; + ret = 0; continue; } - ret = bch2_dev_in_fs(best->sb, sb->sb); if (ret) goto err_print; }