-c6d45169c6e3b4e42a189c9e87d1d14070033f01
+72740a707b64a4fb5f2bb559d8db27a66abc97cc
__le64 errors[BCH_MEMBER_ERROR_NR];
__le64 errors_at_reset[BCH_MEMBER_ERROR_NR];
__le64 errors_reset_time;
+ __le64 seq;
};
#define BCH_MEMBER_V1_BYTES 56
x(deleted_inodes, BCH_VERSION(1, 2), \
BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) \
x(rebalance_work, BCH_VERSION(1, 3), \
- BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))
+ BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) \
+ x(member_seq, BCH_VERSION(1, 4), \
+ 0)
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
__le32 time_base_hi;
__le32 time_precision;
- __le64 flags[8];
+ __le64 flags[7];
+ __le64 write_time;
__le64 features[2];
__le64 compat[2];
/* buffer must be a multiple of the block size */
bytes = round_up(bytes, block_bytes(c));
- if (bytes > btree_bytes(c)) {
- struct printbuf buf = PRINTBUF;
-
- prt_printf(&buf, "btree node write bounce buffer overrun: %u > %zu\n",
- bytes, btree_bytes(c));
-
- prt_printf(&buf, "header: %zu\n", b->written
- ? sizeof(struct btree_node)
- : sizeof(struct btree_node_entry));
- prt_printf(&buf, "unwritten: %zu\n", b->whiteout_u64s * sizeof(u64));
-
- for_each_bset(b, t) {
- i = bset(b, t);
-
- if (bset_written(b, i))
- continue;
- prt_printf(&buf, "bset %zu: %zu\n", t - b->set, le16_to_cpu(i->u64s) * sizeof(u64));
- }
-
- panic("%s", buf.buf);
- printbuf_exit(&buf);
- }
-
data = btree_bounce_alloc(c, bytes, &used_mempool);
if (!b->written) {
bool intent)
{
btree_path_idx_t new = btree_path_alloc(trans, src);
-
btree_path_copy(trans, trans->paths + new, trans->paths + src);
__btree_path_get(trans->paths + new, intent);
return new;
return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
}
/*
 * Report that a transaction has no free path slot left and cannot grow any
 * further: btree_path_alloc() calls this when every slot is in use and
 * trans->nr_paths has already reached BTREE_ITER_MAX.
 *
 * Calls bch2_dump_trans_paths_updates() (going by its name, a dump of the
 * transaction's paths and updates) and then logs "trans path overflow" via
 * bch_err(). This function only reports; what happens next is up to the
 * caller.
 */
+static noinline void btree_path_overflow(struct btree_trans *trans)
+{
+ bch2_dump_trans_paths_updates(trans);
+ bch_err(trans->c, "trans path overflow");
+}
+
/*
 * Grow the transaction's path tables to twice their current capacity.
 *
 * A single zeroed allocation holds, in the order the pointer bumps below
 * walk it: the paths_allocated bitmap, a struct btree_trans_paths header,
 * the btree_path array, the sorted index array (plus 8 spare bytes) and the
 * btree_insert_entry array. The current contents of each table are copied
 * into the new allocation and the new pointers are then published with
 * rcu_assign_pointer().
 *
 * In this diff the '-' lines are the previous layout code and the '+' lines
 * are its replacement: sorted entries become btree_path_idx_t instead of u8,
 * so every size and copy length for that array is scaled by
 * sizeof(btree_path_idx_t).
 */
static noinline void btree_paths_realloc(struct btree_trans *trans)
{
unsigned nr = trans->nr_paths * 2;
/* __GFP_NOFAIL: the result is used below without a NULL check */
void *p = kzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
- nr + 8 +
sizeof(struct btree_trans_paths) +
nr * sizeof(struct btree_path) +
+ nr * sizeof(btree_path_idx_t) + 8 +
nr * sizeof(struct btree_insert_entry), GFP_KERNEL|__GFP_NOFAIL);
/* the bitmap sits at the very start of the allocation */
unsigned long *paths_allocated = p;
+ memcpy(paths_allocated, trans->paths_allocated, BITS_TO_LONGS(trans->nr_paths) * sizeof(unsigned long));
p += BITS_TO_LONGS(nr) * sizeof(unsigned long);
+
/* step over the btree_trans_paths header so that 'paths' starts after it */
+ p += sizeof(struct btree_trans_paths);
struct btree_path *paths = p;
+ *trans_paths_nr(paths) = nr;
+ memcpy(paths, trans->paths, trans->nr_paths * sizeof(struct btree_path));
p += nr * sizeof(struct btree_path);
- u8 *sorted = p;
- p += nr + 8;
- struct btree_insert_entry *updates = p;
- *trans_paths_nr(paths) = nr;
+ btree_path_idx_t *sorted = p;
+ memcpy(sorted, trans->sorted, trans->nr_sorted * sizeof(btree_path_idx_t));
/*
 * NOTE(review): the 8 spare bytes are presumably slack for the
 * u64-at-a-time memmoves in btree_path_list_add()/btree_path_list_remove()
 * - confirm.
 */
+ p += nr * sizeof(btree_path_idx_t) + 8;
- memcpy(paths_allocated, trans->paths_allocated, BITS_TO_LONGS(trans->nr_paths) * sizeof(unsigned long));
- memcpy(sorted, trans->sorted, trans->nr_sorted);
- memcpy(paths, trans->paths, trans->nr_paths * sizeof(struct btree_path));
/* the removed copy below sized the updates array by the wrong struct */
- memcpy(updates, trans->updates, trans->nr_paths * sizeof(struct btree_path));
+ struct btree_insert_entry *updates = p;
+ memcpy(updates, trans->updates, trans->nr_paths * sizeof(struct btree_insert_entry));
/* remember the old bitmap pointer before it is overwritten below */
unsigned long *old = trans->paths_allocated;
rcu_assign_pointer(trans->paths_allocated, paths_allocated);
- rcu_assign_pointer(trans->sorted, sorted);
rcu_assign_pointer(trans->paths, paths);
+ rcu_assign_pointer(trans->sorted, sorted);
rcu_assign_pointer(trans->updates, updates);
trans->nr_paths = nr;
/*
 * The bitmap embedded in the transaction (_paths_allocated) is never freed.
 * Anything else is freed through the old bitmap pointer, which for an
 * allocation made by this function is also the start of that allocation.
 * The removed line passed trans->paths_allocated, which by this point
 * already refers to the new allocation.
 */
if (old != trans->_paths_allocated)
- kfree_rcu_mightsleep(trans->paths_allocated);
+ kfree_rcu_mightsleep(old);
}
static inline btree_path_idx_t btree_path_alloc(struct btree_trans *trans,
{
btree_path_idx_t idx = find_first_zero_bit(trans->paths_allocated, trans->nr_paths);
- if (unlikely(idx == trans->nr_paths))
+ if (unlikely(idx == trans->nr_paths)) {
+ if (trans->nr_paths == BTREE_ITER_MAX) {
+ btree_path_overflow(trans);
+ return 0;
+ }
+
btree_paths_realloc(trans);
+ }
/*
* Do this before marking the new path as allocated, since it won't be
/*
 * Drop @path's slot from trans->sorted and renumber the entries after it.
 *
 * With CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the tail of the array is
 * moved down with memmove_u64s_down_small(); the length passed is the
 * number of remaining entries rounded up to whole u64s, where one u64 holds
 * sizeof(u64) / sizeof(btree_path_idx_t) entries. The '-' line divided by
 * 8, which matched the former u8 entries. Without that config option
 * array_remove_item() is used instead.
 */
static inline void btree_path_list_remove(struct btree_trans *trans,
struct btree_path *path)
{
- unsigned i;
-
EBUG_ON(path->sorted_idx >= trans->nr_sorted);
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
/* decrement first so the count below is the number of entries after @path */
trans->nr_sorted--;
memmove_u64s_down_small(trans->sorted + path->sorted_idx,
trans->sorted + path->sorted_idx + 1,
- DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));
+ DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
+ sizeof(u64) / sizeof(btree_path_idx_t)));
#else
array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
#endif
/* each entry from the removed slot onwards has moved: refresh its back-index */
- for (i = path->sorted_idx; i < trans->nr_sorted; i++)
+ for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
trans->paths[trans->sorted[i]].sorted_idx = i;
/*
 * NOTE(review): the U8_MAX poisoning of path->sorted_idx is dropped by this
 * change, presumably because the sorted index is no longer 8 bits wide -
 * confirm.
 */
-
- path->sorted_idx = U8_MAX;
}
static inline void btree_path_list_add(struct btree_trans *trans,
btree_path_idx_t path_idx)
{
struct btree_path *path = trans->paths + path_idx;
- unsigned i;
path->sorted_idx = pos ? trans->paths[pos].sorted_idx + 1 : trans->nr_sorted;
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1,
trans->sorted + path->sorted_idx,
- DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));
+ DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
+ sizeof(u64) / sizeof(btree_path_idx_t)));
trans->nr_sorted++;
trans->sorted[path->sorted_idx] = path_idx;
#else
array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path_idx);
#endif
- for (i = path->sorted_idx; i < trans->nr_sorted; i++)
+ for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
trans->paths[trans->sorted[i]].sorted_idx = i;
btree_trans_verify_sorted_refs(trans);
trans->paths = trans->_paths;
trans->updates = trans->_updates;
- *trans_paths_nr(trans->paths) = BTREE_ITER_MAX;
+ *trans_paths_nr(trans->paths) = BTREE_ITER_INITIAL;
trans->paths_allocated[0] = 1;
static inline int btree_trans_too_many_iters(struct btree_trans *trans)
{
- if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_MAX - 8)
+ if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_INITIAL - 8)
return __bch2_btree_trans_too_many_iters(trans);
return 0;
unsigned long ip_allocated;
};
-#define BTREE_ITER_MAX 64
+#define BTREE_ITER_INITIAL 64
+#define BTREE_ITER_MAX (1U << 10)
struct btree_trans_commit_hook;
typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
unsigned long *paths_allocated;
struct btree_path *paths;
- u8 *sorted;
+ btree_path_idx_t *sorted;
struct btree_insert_entry *updates;
void *mem;
struct list_head list;
struct closure ref;
- unsigned long _paths_allocated[BITS_TO_LONGS(BTREE_ITER_MAX)];
+ unsigned long _paths_allocated[BITS_TO_LONGS(BTREE_ITER_INITIAL)];
struct btree_trans_paths trans_paths;
- struct btree_path _paths[BTREE_ITER_MAX];
- u8 _sorted[BTREE_ITER_MAX + 8];
- struct btree_insert_entry _updates[BTREE_ITER_MAX];
+ struct btree_path _paths[BTREE_ITER_INITIAL];
+ btree_path_idx_t _sorted[BTREE_ITER_INITIAL + 4];
+ struct btree_insert_entry _updates[BTREE_ITER_INITIAL];
};
static inline struct btree_path *btree_iter_path(struct btree_trans *trans, struct btree_iter *iter)
x(EINVAL, device_size_too_small) \
x(EINVAL, device_not_a_member_of_filesystem) \
x(EINVAL, device_has_been_removed) \
+ x(EINVAL, device_splitbrain) \
x(EINVAL, device_already_online) \
x(EINVAL, insufficient_devices_to_start) \
x(EINVAL, invalid) \
x(BCH_ERR_invalid, invalid_bkey) \
x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
x(EIO, btree_node_read_err) \
+ x(EIO, sb_not_downgraded) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
return ret2 ?: ret;
}
-#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3)
+#define EXTENT_ITERS_MAX (BTREE_ITER_INITIAL / 3)
int bch2_extent_atomic_end(struct btree_trans *trans,
struct btree_iter *iter,
*/
ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
if (unlikely(ret))
- goto err;
+ goto err_noprint;
bch2_bkey_buf_reassemble(&old, c, k);
inum.inum,
iter->pos.offset << 9,
"%s(): error: %s", __func__, bch2_err_str(ret));
-
+err_noprint:
bch2_open_buckets_put(c, &open_buckets);
bch2_disk_reservation_put(c, &disk_res);
bch2_bkey_buf_exit(&new, c);
if (ret)
goto err;
+ c->recovery_pass_done = ARRAY_SIZE(recovery_pass_fns) - 1;
+
if (enabled_qtypes(c)) {
ret = bch2_fs_quota_read(c);
if (ret)
prt_printf(out, "(never)");
prt_newline(out);
+ prt_printf(out, "Last superblock write:");
+ prt_tab(out);
+ prt_u64(out, le64_to_cpu(m.seq));
+ prt_newline(out);
+
prt_printf(out, "State:");
prt_tab(out);
prt_printf(out, "%s",
dst->time_base_lo = src->time_base_lo;
dst->time_base_hi = src->time_base_hi;
dst->time_precision = src->time_precision;
+ dst->write_time = src->write_time;
memcpy(dst->flags, src->flags, sizeof(dst->flags));
memcpy(dst->features, src->features, sizeof(dst->features));
c->disk_sb.sb->magic = BCHFS_MAGIC;
c->disk_sb.sb->layout.magic = BCHFS_MAGIC;
+ if (le16_to_cpu(c->disk_sb.sb->version) > bcachefs_metadata_version_current) {
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "attempting to write superblock that wasn't version downgraded (");
+ bch2_version_to_text(&buf, le16_to_cpu(c->disk_sb.sb->version));
+ prt_str(&buf, " > ");
+ bch2_version_to_text(&buf, bcachefs_metadata_version_current);
+ prt_str(&buf, ")");
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ return -BCH_ERR_sb_not_downgraded;
+ }
+
le64_add_cpu(&c->disk_sb.sb->seq, 1);
+ struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ for_each_online_member(c, ca)
+ __bch2_members_v2_get_mut(mi, ca->dev_idx)->seq = c->disk_sb.sb->seq;
+ c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds());
+
if (test_bit(BCH_FS_error, &c->flags))
SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1);
if (test_bit(BCH_FS_topology_error, &c->flags))
prt_printf(out, "%llu", le64_to_cpu(sb->seq));
prt_newline(out);
+ prt_printf(out, "Time of last write:");
+ prt_tab(out);
+ bch2_prt_datetime(out, le64_to_cpu(sb->write_time));
+ prt_newline(out);
+
prt_printf(out, "Superblock size:");
prt_tab(out);
prt_printf(out, "%zu", vstruct_bytes(sb));
return 0;
}
/*
 * Check whether the device whose superblock is @sb may be used as part of
 * the filesystem whose superblock is @fs.
 *
 * Returns 0 when @fs and @sb are the same handle or when every check
 * passes, otherwise one of:
 *   -BCH_ERR_device_not_a_member_of_filesystem  the UUIDs differ
 *   -BCH_ERR_device_has_been_removed            bch2_dev_exists() fails for
 *                                               @sb's dev_idx in @fs
 *   -BCH_ERR_mismatched_block_size              the block sizes differ
 *   -BCH_ERR_device_splitbrain                  one of the two checks below
 *
 * The split brain checks are skipped (returning 0) unless both superblocks
 * are at least bcachefs_metadata_version_member_seq.
 */
-static int bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
+static int bch2_dev_in_fs(struct bch_sb_handle *fs,
+ struct bch_sb_handle *sb)
{
- struct bch_sb *newest =
- le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb;
+ if (fs == sb)
+ return 0;
- if (!uuid_equal(&fs->uuid, &sb->uuid))
+ if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid))
return -BCH_ERR_device_not_a_member_of_filesystem;
- if (!bch2_dev_exists(newest, sb->dev_idx))
+ if (!bch2_dev_exists(fs->sb, sb->sb->dev_idx))
return -BCH_ERR_device_has_been_removed;
- if (fs->block_size != sb->block_size)
+ if (fs->sb->block_size != sb->sb->block_size)
return -BCH_ERR_mismatched_block_size;
+ if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq ||
+ le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq)
+ return 0;
+
/* same sequence number but different write times: the two diverged */
+ if (fs->sb->seq == sb->sb->seq &&
+ fs->sb->write_time != sb->sb->write_time) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "Split brain detected between %pg and %pg:",
+ sb->bdev, fs->bdev);
+ prt_newline(&buf);
+ prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq));
+ prt_newline(&buf);
+
+ prt_printf(&buf, "%pg ", fs->bdev);
+ bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));
+ prt_newline(&buf);
+
+ prt_printf(&buf, "%pg ", sb->bdev);
+ bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));
+ prt_newline(&buf);
+
+ prt_printf(&buf, "Not using older sb");
+
+ pr_err("%s", buf.buf);
+ printbuf_exit(&buf);
+ return -BCH_ERR_device_splitbrain;
+ }
+
/*
 * Compare the seq that @fs's member table records for this device with the
 * seq in the device's own superblock. A recorded seq of 0 is ignored; a
 * recorded seq lower than the device's own is reported as split brain.
 */
+ struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);
+ u64 seq_from_fs = le64_to_cpu(m.seq);
+ u64 seq_from_member = le64_to_cpu(sb->sb->seq);
+
+ if (seq_from_fs && seq_from_fs < seq_from_member) {
+ pr_err("Split brain detected between %pg and %pg:\n"
+ "%pg believes seq of %pg to be %llu, but %pg has %llu\n"
+ "Not using %pg",
+ sb->bdev, fs->bdev,
+ fs->bdev, sb->bdev, seq_from_fs,
+ sb->bdev, seq_from_member,
+ sb->bdev);
+ return -BCH_ERR_device_splitbrain;
+ }
+
return 0;
}
dev_idx = sb.sb->dev_idx;
- ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
+ ret = bch2_dev_in_fs(&c->disk_sb, &sb);
bch_err_msg(c, ret, "bringing %s online", path);
if (ret)
goto err;
/* Filesystem open: */
/*
 * Order two superblocks: by sequence number first, then by write_time when
 * the sequence numbers compare equal (the ?: only evaluates the second
 * cmp_int() when the first yields 0). Both fields are converted from little
 * endian before being compared. The superblock scan further down treats a
 * result > 0 as "l is the better superblock".
 */
+static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
+{
+ return cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?:
+ cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time));
+}
+
struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
struct bch_opts opts)
{
}
darray_for_each(sbs, sb)
- if (!best || le64_to_cpu(sb->sb->seq) > le64_to_cpu(best->sb->seq))
+ if (!best || sb_cmp(sb->sb, best->sb) > 0)
best = sb;
darray_for_each_reverse(sbs, sb) {
- if (sb != best && !bch2_dev_exists(best->sb, sb->sb->dev_idx)) {
- pr_info("%pg has been removed, skipping", sb->bdev);
+ ret = bch2_dev_in_fs(best, sb);
+
+ if (ret == -BCH_ERR_device_has_been_removed ||
+ ret == -BCH_ERR_device_splitbrain) {
bch2_free_super(sb);
darray_remove_item(&sbs, sb);
best -= best > sb;
+ ret = 0;
continue;
}
- ret = bch2_dev_in_fs(best->sb, sb->sb);
if (ret)
goto err_print;
}