#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
+#include "disk_groups.h"
#include "error.h"
#include "io.h"
#include "journal.h"
/*
 * Return code that journal_entry_validate_btree_keys() compares against the
 * result of journal_validate_key().
 * NOTE(review): the statement that returns it is not visible in this excerpt.
 */
#define FSCK_DELETED_KEY 5
/*
 * Validate one key @k stored in journal entry @entry.
 *
 * @where:      caller-built description of where the entry lives; it is only
 *              used as text in the error messages below
 * @type:       noun used for the key in error messages (the callers in this
 *              file pass key or btree root)
 * @version,
 * @big_endian: forwarded unchanged to bch2_bkey_compat()
 * @write:      selects whether the compat conversion runs before (!write) or
 *              after (write) bch2_bkey_invalid()
 *
 * NOTE(review): parts of this function are elided in this excerpt (the
 * visible braces do not balance), so only the visible checks are described.
 */
-static int journal_validate_key(struct bch_fs *c, struct jset *jset,
+static int journal_validate_key(struct bch_fs *c, const char *where,
struct jset_entry *entry,
unsigned level, enum btree_id btree_id,
- struct bkey_i *k,
- const char *type, int write)
+ struct bkey_i *k, const char *type,
+ unsigned version, int big_endian, int write)
{
void *next = vstruct_next(entry);
const char *invalid;
- unsigned version = le32_to_cpu(jset->version);
int ret = 0;
/* Zero-length key: truncate the entry so that it ends at k. */
if (journal_entry_err_on(!k->k.u64s, c,
- "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: k->u64s 0",
- type, le64_to_cpu(jset->seq),
- (u64 *) entry - jset->_data,
- le32_to_cpu(jset->u64s),
+ "invalid %s in %s entry offset %zi/%u: k->u64s 0",
+ type, where,
(u64 *) k - entry->_data,
le16_to_cpu(entry->u64s))) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
/* Key runs past the end of the entry: truncate the entry at k as well. */
if (journal_entry_err_on((void *) bkey_next(k) >
(void *) vstruct_next(entry), c,
- "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: extends past end of journal entry",
- type, le64_to_cpu(jset->seq),
- (u64 *) entry - jset->_data,
- le32_to_cpu(jset->u64s),
+ "invalid %s in %s entry offset %zi/%u: extends past end of journal entry",
+ type, where,
(u64 *) k - entry->_data,
le16_to_cpu(entry->u64s))) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
}
/* Any key format other than KEY_FORMAT_CURRENT is reported as an error. */
if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
- "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: bad format %u",
- type, le64_to_cpu(jset->seq),
- (u64 *) entry - jset->_data,
- le32_to_cpu(jset->u64s),
+ "invalid %s in %s entry offset %zi/%u: bad format %u",
+ type, where,
(u64 *) k - entry->_data,
le16_to_cpu(entry->u64s),
k->k.format)) {
}
/* When not writing, the compat conversion runs before the key is checked. */
if (!write)
- bch2_bkey_compat(level, btree_id, version,
- JSET_BIG_ENDIAN(jset), write,
- NULL, bkey_to_packed(k));
+ bch2_bkey_compat(level, btree_id, version, big_endian,
+ write, NULL, bkey_to_packed(k));
invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k),
__btree_node_type(level, btree_id));
char buf[160];
bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k));
- mustfix_fsck_err(c, "invalid %s in jset %llu offset %zi/%u entry offset %zi/%u: %s\n%s",
- type, le64_to_cpu(jset->seq),
- (u64 *) entry - jset->_data,
- le32_to_cpu(jset->u64s),
+ mustfix_fsck_err(c, "invalid %s in %s entry offset %zi/%u: %s\n%s",
+ type, where,
(u64 *) k - entry->_data,
le16_to_cpu(entry->u64s),
invalid, buf);
}
/* When writing, the compat conversion runs after the key has been checked. */
if (write)
- bch2_bkey_compat(level, btree_id, version,
- JSET_BIG_ENDIAN(jset), write,
- NULL, bkey_to_packed(k));
+ bch2_bkey_compat(level, btree_id, version, big_endian,
+ write, NULL, bkey_to_packed(k));
fsck_err:
return ret;
}
/*
 * Validate the keys of a btree_keys entry: starting at entry->start and
 * looping until vstruct_last(entry), each key is passed to
 * journal_validate_key() together with the level and btree id stored in the
 * entry. @where, @version, @big_endian and @write are forwarded unchanged.
 *
 * NOTE(review): the tail of this function (advancing k, the return
 * statement) is elided in this excerpt.
 */
static int journal_entry_validate_btree_keys(struct bch_fs *c,
- struct jset *jset,
+ const char *where,
struct jset_entry *entry,
- int write)
+ unsigned version, int big_endian, int write)
{
struct bkey_i *k = entry->start;
while (k != vstruct_last(entry)) {
- int ret = journal_validate_key(c, jset, entry,
+ int ret = journal_validate_key(c, where, entry,
entry->level,
entry->btree_id,
- k, "key", write);
+ k, "key", version, big_endian, write);
/* presumably the key at k was dropped, so k is examined again without advancing -- TODO confirm */
if (ret == FSCK_DELETED_KEY)
continue;
}
/*
 * Validate a btree_root entry: the key at entry->start is handed to
 * journal_validate_key() with the btree id stored in the entry.
 *
 * NOTE(review): the condition guarding the early return of 0 is elided in
 * this excerpt.
 */
static int journal_entry_validate_btree_root(struct bch_fs *c,
- struct jset *jset,
+ const char *where,
struct jset_entry *entry,
- int write)
+ unsigned version, int big_endian, int write)
{
struct bkey_i *k = entry->start;
int ret = 0;
return 0;
}
/* The root key is validated with level 1, not with a level read from the entry. */
- return journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
- "btree root", write);
+ return journal_validate_key(c, where, entry, 1, entry->btree_id, k,
+ "btree root", version, big_endian, write);
fsck_err:
return ret;
}
/*
 * Validator for prio_ptrs entries. Nothing is inspected: every argument is
 * ignored and 0 is always returned.
 */
static int journal_entry_validate_prio_ptrs(struct bch_fs *c,
- struct jset *jset,
+ const char *where,
struct jset_entry *entry,
- int write)
+ unsigned version, int big_endian, int write)
{
/* obsolete, don't care: */
return 0;
}
/*
 * Validator for blacklist entries; takes the same argument list as the
 * validate() hook in struct jset_entry_ops.
 *
 * NOTE(review): the checks themselves are elided in this excerpt.
 */
static int journal_entry_validate_blacklist(struct bch_fs *c,
- struct jset *jset,
+ const char *where,
struct jset_entry *entry,
- int write)
+ unsigned version, int big_endian, int write)
{
int ret = 0;
}
/*
 * Validator for blacklist_v2 entries; takes the same argument list as the
 * validate() hook in struct jset_entry_ops.
 *
 * NOTE(review): the checks themselves are elided in this excerpt; only the
 * declaration of bl_entry is visible.
 */
static int journal_entry_validate_blacklist_v2(struct bch_fs *c,
- struct jset *jset,
+ const char *where,
struct jset_entry *entry,
- int write)
+ unsigned version, int big_endian, int write)
{
struct jset_entry_blacklist_v2 *bl_entry;
int ret = 0;
}
/*
 * Validator for usage entries; takes the same argument list as the
 * validate() hook in struct jset_entry_ops.
 *
 * NOTE(review): the checks themselves are elided in this excerpt.
 */
static int journal_entry_validate_usage(struct bch_fs *c,
- struct jset *jset,
+ const char *where,
struct jset_entry *entry,
- int write)
+ unsigned version, int big_endian, int write)
{
/* View the generic entry as the struct jset_entry_usage that embeds it. */
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
}
/*
 * Validator for data_usage entries; takes the same argument list as the
 * validate() hook in struct jset_entry_ops.
 *
 * NOTE(review): the checks themselves (and the declaration of ret) are
 * elided in this excerpt.
 */
static int journal_entry_validate_data_usage(struct bch_fs *c,
- struct jset *jset,
+ const char *where,
struct jset_entry *entry,
- int write)
+ unsigned version, int big_endian, int write)
{
/* View the generic entry as the struct jset_entry_data_usage that embeds it. */
struct jset_entry_data_usage *u =
container_of(entry, struct jset_entry_data_usage, entry);
return ret;
}
+static int journal_entry_validate_clock(struct bch_fs *c,
+ const char *where,
+ struct jset_entry *entry,
+ unsigned version, int big_endian, int write)
+{
+ struct jset_entry_clock *clock =
+ container_of(entry, struct jset_entry_clock, entry);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ int ret = 0;
+ /*
+  * Validate a clock entry. Two checks are made, and on either failure the
+  * whole entry (entry up to vstruct_next(entry)) is handed to
+  * journal_entry_null_range() and ret is returned:
+  *  - bytes must be exactly sizeof(struct jset_entry_clock);
+  *  - clock->rw must not be greater than 1.
+  * where, version, big_endian and write are not used by this validator;
+  * they are present only to match the common validate() signature.
+  * NOTE(review): bytes is presumably the full on-disk size of the entry,
+  * header included -- confirm against jset_u64s().
+  */
+ if (journal_entry_err_on(bytes != sizeof(*clock),
+ c, "invalid journal entry clock: bad size")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+ /* Values of rw above 1 are rejected. */
+ if (journal_entry_err_on(clock->rw > 1,
+ c, "invalid journal entry clock: bad rw")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+ /* NOTE(review): fsck_err is presumably the jump target of journal_entry_err_on() -- confirm */
+fsck_err:
+ return ret;
+}
+
+static int journal_entry_validate_dev_usage(struct bch_fs *c,
+ const char *where,
+ struct jset_entry *entry,
+ unsigned version, int big_endian, int write)
+{
+ struct jset_entry_dev_usage *u =
+ container_of(entry, struct jset_entry_dev_usage, entry);
+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64);
+ unsigned expected = sizeof(*u) + sizeof(u->d[0]) * 7; /* Current value of BCH_DATA_NR */
+ unsigned dev;
+ int ret = 0;
+ /*
+  * Validate a dev_usage entry. On any failed check the whole entry (entry
+  * up to vstruct_next(entry)) is handed to journal_entry_null_range() and
+  * ret is returned. The checks, in order:
+  *  - bytes must be at least expected, i.e. the fixed part of the struct
+  *    plus seven elements of d[]; larger entries are accepted;
+  *  - the device index must pass bch2_dev_exists2();
+  *  - the pad field must be zero.
+  * where, version, big_endian and write are not used by this validator.
+  */
+ if (journal_entry_err_on(bytes < expected,
+ c, "invalid journal entry dev usage: bad size (%u < %u)",
+ bytes, expected)) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+ dev = le32_to_cpu(u->dev);
+ /* The stored device index is little-endian and is converted before the lookup. */
+ if (journal_entry_err_on(!bch2_dev_exists2(c, dev),
+ c, "invalid journal entry dev usage: bad dev")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+ /* A non-zero pad field is treated as corruption. */
+ if (journal_entry_err_on(u->pad,
+ c, "invalid journal entry dev usage: bad pad")) {
+ journal_entry_null_range(entry, vstruct_next(entry));
+ return ret;
+ }
+
+fsck_err:
+ return ret;
+}
+
/*
 * Per-entry-type operations.
 *
 * validate() is called by bch2_journal_entry_validate() with, in order:
 * the filesystem, the where string, the entry, the version, the
 * big_endian flag and the write flag.
 */
struct jset_entry_ops {
- int (*validate)(struct bch_fs *, struct jset *,
- struct jset_entry *, int);
+ int (*validate)(struct bch_fs *, const char *,
+ struct jset_entry *, unsigned, int, int);
};
/*
 * Table of ops indexed by entry->type (see the lookup in
 * bch2_journal_entry_validate()).
 * NOTE(review): the initialisers are elided in this excerpt.
 */
static const struct jset_entry_ops bch2_jset_entry_ops[] = {
#undef x
};
/*
 * Dispatch @entry to the validate() hook registered for its type in
 * bch2_jset_entry_ops, forwarding all other arguments unchanged.
 *
 * Entries whose type is not below BCH_JSET_ENTRY_NR have no hook; they are
 * not inspected and 0 is returned for them.
 */
-static int journal_entry_validate(struct bch_fs *c, struct jset *jset,
- struct jset_entry *entry, int write)
+int bch2_journal_entry_validate(struct bch_fs *c, const char *where,
+ struct jset_entry *entry,
+ unsigned version, int big_endian, int write)
{
return entry->type < BCH_JSET_ENTRY_NR
- ? bch2_jset_entry_ops[entry->type].validate(c, jset,
- entry, write)
+ ? bch2_jset_entry_ops[entry->type].validate(c, where, entry,
+ version, big_endian, write)
: 0;
}
/*
 * Validate the entries of @jset one by one. The loop stops at the first
 * entry that extends past the end of the jset, or at the first entry for
 * which bch2_journal_entry_validate() returns non-zero.
 *
 * For every entry a location string (jset sequence number, offset of the
 * entry within the jset in u64s, total u64s of the jset) is formatted into
 * a local buffer and passed down as the where argument. The version and
 * the big-endian flag are read from @jset.
 *
 * NOTE(review): the end of this function is elided in this excerpt.
 */
static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
int write)
{
+ char buf[100];
struct jset_entry *entry;
int ret = 0;
vstruct_for_each(jset, entry) {
+ scnprintf(buf, sizeof(buf), "jset %llu entry offset %zi/%u",
+ le64_to_cpu(jset->seq),
+ (u64 *) entry - jset->_data,
+ le32_to_cpu(jset->u64s));
+ /* An entry must end within the jset; stop at the first one that does not. */
if (journal_entry_err_on(vstruct_next(entry) >
vstruct_last(jset), c,
"journal entry extends past end of jset")) {
break;
}
- ret = journal_entry_validate(c, jset, entry, write);
+ ret = bch2_journal_entry_validate(c, buf, entry,
+ le32_to_cpu(jset->version),
+ JSET_BIG_ENDIAN(jset), write);
if (ret)
break;
}
for (i = 0; i < j->nr_ptrs; i++) {
struct bch_dev *ca = c->devs[j->ptrs[i].dev];
+ u64 offset;
+
+ div64_u64_rem(j->ptrs[i].offset, ca->mi.bucket_size, &offset);
if (i)
pr_buf(out, " ");
pr_buf(out, "%u:%llu (offset %llu)",
j->ptrs[i].dev,
- (u64) j->ptrs[i].offset,
- (u64) j->ptrs[i].offset % ca->mi.bucket_size);
+ (u64) j->ptrs[i].offset, offset);
}
}
!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
continue;
- if ((ca->mi.state == BCH_MEMBER_STATE_RW ||
- ca->mi.state == BCH_MEMBER_STATE_RO) &&
+ if ((ca->mi.state == BCH_MEMBER_STATE_rw ||
+ ca->mi.state == BCH_MEMBER_STATE_ro) &&
percpu_ref_tryget(&ca->io_ref))
closure_call(&ca->journal.read,
bch2_journal_read_device,
for (ptr = 0; ptr < i->nr_ptrs; ptr++)
replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
+ bch2_replicas_entry_sort(&replicas.e);
+
/*
* If we're mounting in degraded mode - if we didn't read all
* the devices - this is wrong:
* it:
*/
if (!ca->mi.durability ||
- ca->mi.state != BCH_MEMBER_STATE_RW ||
+ ca->mi.state != BCH_MEMBER_STATE_rw ||
!ja->nr ||
bch2_bkey_has_device(bkey_i_to_s_c(&w->key),
ca->dev_idx) ||
unsigned sectors)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_devs_mask devs;
struct journal_device *ja;
struct bch_dev *ca;
struct dev_alloc_list devs_sorted;
+ unsigned target = c->opts.metadata_target ?:
+ c->opts.foreground_target;
unsigned i, replicas = 0, replicas_want =
READ_ONCE(c->opts.metadata_replicas);
rcu_read_lock();
+retry:
+ devs = target_rw_devs(c, BCH_DATA_journal, target);
- devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe,
- &c->rw_devs[BCH_DATA_journal]);
+ devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs);
__journal_write_alloc(j, w, &devs_sorted,
sectors, &replicas, replicas_want);
__journal_write_alloc(j, w, &devs_sorted,
sectors, &replicas, replicas_want);
+
+ if (replicas < replicas_want && target) {
+ /* Retry from all devices: */
+ target = 0;
+ goto retry;
+ }
done:
rcu_read_unlock();
bio->bi_private = ca;
bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META;
+ BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
+ ca->prev_journal_sector = bio->bi_iter.bi_sector;
+
if (!JSET_NO_FLUSH(w->data))
bio->bi_opf |= REQ_FUA;
if (!JSET_NO_FLUSH(w->data) && !w->separate_flush)
struct jset_entry *start, *end;
struct jset *jset;
struct bio *bio;
+ char *journal_debug_buf = NULL;
bool validate_before_checksum = false;
unsigned i, sectors, bytes, u64s, nr_rw_members = 0;
int ret;
end = bch2_btree_roots_to_journal_entries(c, jset->start, end);
- end = bch2_journal_super_entries_add_common(c, end,
- le64_to_cpu(jset->seq));
+ bch2_journal_super_entries_add_common(c, &end,
+ le64_to_cpu(jset->seq));
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
journal_write_compact(jset);
- jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand);
- jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand);
jset->magic = cpu_to_le64(jset_magic(c));
-
jset->version = c->sb.version < bcachefs_metadata_version_new_versioning
? cpu_to_le32(BCH_JSET_VERSION_OLD)
: cpu_to_le32(c->sb.version);
goto retry_alloc;
}
+ if (ret) {
+ journal_debug_buf = kmalloc(4096, GFP_ATOMIC);
+ if (journal_debug_buf)
+ __bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j);
+ }
+
/*
* write is allocated, no longer need to account for it in
* bch2_journal_space_available():
spin_unlock(&j->lock);
if (ret) {
- bch_err(c, "Unable to allocate journal write");
+ bch_err(c, "Unable to allocate journal write:\n%s",
+ journal_debug_buf);
+ kfree(journal_debug_buf);
bch2_fatal_error(c);
continue_at(cl, journal_write_done, system_highpri_wq);
return;