#include <trace/events/bcachefs.h>
-static inline u32 journal_entry_radix_idx(struct bch_fs *c,
- struct jset *j)
+static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
{
- return (le64_to_cpu(j->seq) - c->journal_entries_base_seq) & (~0U >> 1);
+ return (seq - c->journal_entries_base_seq) & (~0U >> 1);
}
static void __journal_replay_free(struct bch_fs *c,
struct journal_replay *i)
{
struct journal_replay **p =
- genradix_ptr(&c->journal_entries, journal_entry_radix_idx(c, &i->j));
+ genradix_ptr(&c->journal_entries,
+ journal_entry_radix_idx(c, le64_to_cpu(i->j.seq)));
BUG_ON(*p != i);
*p = NULL;
struct journal_list {
struct closure cl;
+ u64 last_seq;
struct mutex lock;
int ret;
};
struct journal_replay **_i, *i, *dup;
struct journal_ptr *ptr;
size_t bytes = vstruct_bytes(j);
- u64 last_seq = 0;
+ u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
int ret = JOURNAL_ENTRY_ADD_OK;
+ /* Is this entry older than the range we need? */
+ if (!c->opts.read_entire_journal &&
+ le64_to_cpu(j->seq) < jlist->last_seq)
+ return JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+
/*
- * Xarrays are indexed by a ulong, not a u64, so we can't index them by
- * sequence number directly:
- * Assume instead that they will all fall within the range of +-2billion
- * of the filrst one we find.
+ * genradixes are indexed by a ulong, not a u64, so we can't index them
+ * by sequence number directly: Assume instead that they will all fall
+ * within the range of +-2billion of the first one we find.
*/
if (!c->journal_entries_base_seq)
c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX);
-#if 0
- list_for_each_entry_reverse(i, jlist->head, list) {
- if (!JSET_NO_FLUSH(&i->j)) {
- last_seq = le64_to_cpu(i->j.last_seq);
- break;
- }
- }
-#endif
-
- /* Is this entry older than the range we need? */
- if (!c->opts.read_entire_journal &&
- le64_to_cpu(j->seq) < last_seq) {
- ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
- goto out;
- }
-
/* Drop entries we don't need anymore */
- if (!JSET_NO_FLUSH(j) && !c->opts.read_entire_journal) {
- genradix_for_each(&c->journal_entries, iter, _i) {
+ if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) {
+ genradix_for_each_from(&c->journal_entries, iter, _i,
+ journal_entry_radix_idx(c, jlist->last_seq)) {
i = *_i;
- if (!i)
+ if (!i || i->ignore)
continue;
- if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
+ if (le64_to_cpu(i->j.seq) >= last_seq)
break;
journal_replay_free(c, i);
}
}
- _i = genradix_ptr(&c->journal_entries, journal_entry_radix_idx(c, j));
- dup = _i ? *_i : NULL;
+ jlist->last_seq = max(jlist->last_seq, last_seq);
+
+ _i = genradix_ptr_alloc(&c->journal_entries,
+ journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
+ GFP_KERNEL);
+ if (!_i)
+ return -ENOMEM;
/*
* Duplicate journal entries? If so we want the one that didn't have a
* checksum error:
*/
+ dup = *_i;
if (dup) {
if (dup->bad) {
/* we'll replace @dup: */
}
i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
- if (!i) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!i)
+ return -ENOMEM;
i->nr_ptrs = 0;
i->bad = bad;
__journal_replay_free(c, dup);
}
- _i = genradix_ptr_alloc(&c->journal_entries,
- journal_entry_radix_idx(c, &i->j),
- GFP_KERNEL);
- if (!_i) {
- bch_err(c, "failed to allocate c->journal_entries entry");
- ret = -ENOMEM;
- goto out;
- }
*_i = i;
found:
bch_err(c, "corrupt metadata before write:\n" \
msg, ##__VA_ARGS__); \
if (bch2_fs_inconsistent(c)) { \
- ret = BCH_FSCK_ERRORS_NOT_FIXED; \
+ ret = -BCH_ERR_fsck_errors_not_fixed; \
goto fsck_err; \
} \
break; \
static int journal_validate_key(struct bch_fs *c, const char *where,
struct jset_entry *entry,
unsigned level, enum btree_id btree_id,
- struct bkey_i *k, const char *type,
+ struct bkey_i *k,
unsigned version, int big_endian, int write)
{
void *next = vstruct_next(entry);
int ret = 0;
if (journal_entry_err_on(!k->k.u64s, c,
- "invalid %s in %s entry offset %zi/%u: k->u64s 0",
- type, where,
+ "invalid key in %s at %s offset %zi/%u: k->u64s 0",
+ bch2_jset_entry_types[entry->type], where,
(u64 *) k - entry->_data,
le16_to_cpu(entry->u64s))) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
if (journal_entry_err_on((void *) bkey_next(k) >
(void *) vstruct_next(entry), c,
- "invalid %s in %s entry offset %zi/%u: extends past end of journal entry",
- type, where,
+ "invalid key in %s at %s offset %zi/%u: extends past end of journal entry",
+ bch2_jset_entry_types[entry->type], where,
(u64 *) k - entry->_data,
le16_to_cpu(entry->u64s))) {
entry->u64s = cpu_to_le16((u64 *) k - entry->_data);
}
if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c,
- "invalid %s in %s entry offset %zi/%u: bad format %u",
- type, where,
+ "invalid key in %s at %s offset %zi/%u: bad format %u",
+ bch2_jset_entry_types[entry->type], where,
(u64 *) k - entry->_data,
le16_to_cpu(entry->u64s),
k->k.format)) {
if (bch2_bkey_invalid(c, bkey_i_to_s_c(k),
__btree_node_type(level, btree_id), write, &buf)) {
printbuf_reset(&buf);
- pr_buf(&buf, "invalid %s in %s entry offset %zi/%u:",
- type, where,
+ prt_printf(&buf, "invalid key in %s at %s offset %zi/%u:",
+ bch2_jset_entry_types[entry->type], where,
(u64 *) k - entry->_data,
le16_to_cpu(entry->u64s));
- pr_newline(&buf);
- pr_indent_push(&buf, 2);
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 2);
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
- pr_newline(&buf);
+ prt_newline(&buf);
bch2_bkey_invalid(c, bkey_i_to_s_c(k),
__btree_node_type(level, btree_id), write, &buf);
int ret = journal_validate_key(c, where, entry,
entry->level,
entry->btree_id,
- k, "key", version, big_endian, write);
+ k, version, big_endian, write);
if (ret == FSCK_DELETED_KEY)
continue;
vstruct_for_each(entry, k) {
if (!first) {
- pr_newline(out);
- pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+ prt_newline(out);
+ prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
}
- pr_buf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level);
+ prt_printf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level);
bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
first = false;
}
}
return journal_validate_key(c, where, entry, 1, entry->btree_id, k,
- "btree root", version, big_endian, write);
+ version, big_endian, write);
fsck_err:
return ret;
}
struct jset_entry_blacklist *bl =
container_of(entry, struct jset_entry_blacklist, entry);
- pr_buf(out, "seq=%llu", le64_to_cpu(bl->seq));
+ prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq));
}
static int journal_entry_blacklist_v2_validate(struct bch_fs *c,
struct jset_entry_blacklist_v2 *bl =
container_of(entry, struct jset_entry_blacklist_v2, entry);
- pr_buf(out, "start=%llu end=%llu",
+ prt_printf(out, "start=%llu end=%llu",
le64_to_cpu(bl->start),
le64_to_cpu(bl->end));
}
struct jset_entry_usage *u =
container_of(entry, struct jset_entry_usage, entry);
- pr_buf(out, "type=%s v=%llu",
+ prt_printf(out, "type=%s v=%llu",
bch2_fs_usage_types[u->entry.btree_id],
le64_to_cpu(u->v));
}
container_of(entry, struct jset_entry_data_usage, entry);
bch2_replicas_entry_to_text(out, &u->r);
- pr_buf(out, "=%llu", le64_to_cpu(u->v));
+ prt_printf(out, "=%llu", le64_to_cpu(u->v));
}
static int journal_entry_clock_validate(struct bch_fs *c,
struct jset_entry_clock *clock =
container_of(entry, struct jset_entry_clock, entry);
- pr_buf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
+ prt_printf(out, "%s=%llu", clock->rw ? "write" : "read", le64_to_cpu(clock->time));
}
static int journal_entry_dev_usage_validate(struct bch_fs *c,
container_of(entry, struct jset_entry_dev_usage, entry);
unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
- pr_buf(out, "dev=%u", le32_to_cpu(u->dev));
+ prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
for (i = 0; i < nr_types; i++) {
if (i < BCH_DATA_NR)
- pr_buf(out, " %s", bch2_data_types[i]);
+ prt_printf(out, " %s", bch2_data_types[i]);
else
- pr_buf(out, " (unknown data type %u)", i);
- pr_buf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
+ prt_printf(out, " (unknown data type %u)", i);
+ prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
le64_to_cpu(u->d[i].buckets),
le64_to_cpu(u->d[i].sectors),
le64_to_cpu(u->d[i].fragmented));
}
- pr_buf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec));
+ prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec));
}
static int journal_entry_log_validate(struct bch_fs *c,
struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry);
unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d);
- pr_buf(out, "%.*s", bytes, l->d);
+ prt_printf(out, "%.*s", bytes, l->d);
+}
+
+static int journal_entry_overwrite_validate(struct bch_fs *c, const char *where,
+ struct jset_entry *entry,
+ unsigned version, int big_endian, int write)
+{
+ return journal_entry_btree_keys_validate(c, where, entry, version, big_endian, write);
+}
+
+static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ journal_entry_btree_keys_to_text(out, c, entry);
}
struct jset_entry_ops {
struct jset_entry *entry)
{
if (entry->type < BCH_JSET_ENTRY_NR) {
- pr_buf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+ prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
} else {
- pr_buf(out, "(unknown type %u)", entry->type);
+ prt_printf(out, "(unknown type %u)", entry->type);
}
}
while (offset < end) {
if (!sectors_read) {
struct bio *bio;
+ unsigned nr_bvecs;
reread:
sectors_read = min_t(unsigned,
end - offset, buf->size >> 9);
+ nr_bvecs = buf_pages(buf->data, sectors_read << 9);
+
+ bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
+ bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);
- bio = bio_kmalloc(GFP_KERNEL,
- buf_pages(buf->data,
- sectors_read << 9));
- bio_set_dev(bio, ca->disk_sb.bdev);
- bio->bi_iter.bi_sector = offset;
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ bio->bi_iter.bi_sector = offset;
bch2_bio_map(bio, buf->data, sectors_read << 9);
ret = submit_bio_wait(bio);
- bio_put(bio);
+ kfree(bio);
if (bch2_dev_io_err_on(ret, ca,
"journal read error: sector %llu",
end - offset, sectors_read,
READ);
switch (ret) {
- case BCH_FSCK_OK:
+ case 0:
sectors = vstruct_sectors(j, c->block_bits);
break;
case JOURNAL_ENTRY_REREAD:
bch_err(c, "ja->sectors_free == ca->mi.bucket_size");
bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr);
for (i = 0; i < 3; i++) {
- unsigned idx = ja->cur_idx - 1 + i;
+ unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr;
bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]);
}
ja->sectors_free = 0;
div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset);
if (i)
- pr_buf(out, " ");
- pr_buf(out, "%u:%u:%u (sector %llu)",
+ prt_printf(out, " ");
+ prt_printf(out, "%u:%u:%u (sector %llu)",
j->ptrs[i].dev,
j->ptrs[i].bucket,
j->ptrs[i].bucket_offset,
closure_init_stack(&jlist.cl);
mutex_init(&jlist.lock);
+ jlist.last_seq = 0;
jlist.ret = 0;
for_each_member_device(ca, c, iter) {
- if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) &&
+ if (!c->opts.fsck &&
!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
continue;
if (prev) {
bch2_journal_ptrs_to_text(&buf1, c, prev);
- pr_buf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
+ prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits));
} else
- pr_buf(&buf1, "(none)");
+ prt_printf(&buf1, "(none)");
bch2_journal_ptrs_to_text(&buf2, c, i);
missing_end = seq - 1;
bch2_replicas_entry_to_text(&buf, &replicas.e);
if (!degraded &&
- (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
- fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
- "superblock not marked as containing replicas %s",
- buf.buf))) {
+ fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c,
+ "superblock not marked as containing replicas %s",
+ buf.buf)) {
ret = bch2_mark_replicas(c, &replicas.e);
if (ret)
goto err;
* Must come before signaling write completion, for
* bch2_fs_journal_stop():
*/
- journal_reclaim_kick(&c->journal);
+ if (j->watermark)
+ journal_reclaim_kick(&c->journal);
/* also must come before signalling write completion: */
closure_debug_destroy(cl);
sectors);
bio = ca->journal.bio;
- bio_reset(bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = ptr->offset;
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
- bio->bi_opf = REQ_OP_WRITE|REQ_SYNC|REQ_META;
BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector);
ca->prev_journal_sector = bio->bi_iter.bi_sector;
percpu_ref_get(&ca->io_ref);
bio = ca->journal.bio;
- bio_reset(bio);
- bio_set_dev(bio, ca->disk_sb.bdev);
- bio->bi_opf = REQ_OP_FLUSH;
+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
closure_bio_submit(bio, cl);