#include "journal_reclaim.h"
#include "journal_seq_blacklist.h"
#include "replicas.h"
-
-#include <trace/events/bcachefs.h>
+#include "trace.h"
static struct nonce journal_nonce(const struct jset *jset)
{
journal_entry_radix_idx(c, le64_to_cpu(j->seq)),
GFP_KERNEL);
if (!_i)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_journal_entry_add;
/*
* Duplicate journal entries? If so we want the one that didn't have a
replace:
i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
if (!i)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_journal_entry_add;
i->nr_ptrs = 0;
i->csum_good = entry_ptr.csum_good;
i->ignore = false;
- memcpy(&i->j, j, bytes);
+ unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
i->ptrs[i->nr_ptrs++] = entry_ptr;
if (dup) {
int ret = journal_validate_key(c, jset, entry,
entry->level,
entry->btree_id,
- k, version, big_endian, write);
+ k, version, big_endian,
+ write|BKEY_INVALID_JOURNAL);
if (ret == FSCK_DELETED_KEY)
continue;
struct bkey_i *k;
bool first = true;
- vstruct_for_each(entry, k) {
+ jset_entry_for_each_key(entry, k) {
if (!first) {
prt_newline(out);
prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
struct jset_entry *entry,
unsigned version, int big_endian, int write)
{
- return journal_entry_btree_keys_validate(c, jset, entry, version, big_endian, write);
+ return journal_entry_btree_keys_validate(c, jset, entry,
+ version, big_endian, READ);
}
static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c,
return JOURNAL_ENTRY_NONE;
version = le32_to_cpu(jset->version);
- if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD &&
- version < bcachefs_metadata_version_min) ||
- version >= bcachefs_metadata_version_max,
- c, jset, NULL,
- "%s sector %llu seq %llu: unknown journal entry version %u",
+ if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL,
+ "%s sector %llu seq %llu: incompatible journal entry version %u.%u",
ca ? ca->name : c->name,
sector, le64_to_cpu(jset->seq),
- version)) {
+ BCH_VERSION_MAJOR(version),
+ BCH_VERSION_MINOR(version))) {
/* don't try to continue: */
return -EINVAL;
}
return JOURNAL_ENTRY_NONE;
version = le32_to_cpu(jset->version);
- if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD &&
- version < bcachefs_metadata_version_min) ||
- version >= bcachefs_metadata_version_max,
- c, jset, NULL,
- "%s sector %llu seq %llu: unknown journal entry version %u",
+ if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL,
+ "%s sector %llu seq %llu: unknown journal entry version %u.%u",
ca ? ca->name : c->name,
sector, le64_to_cpu(jset->seq),
- version)) {
+ BCH_VERSION_MAJOR(version),
+ BCH_VERSION_MINOR(version))) {
/* don't try to continue: */
return -EINVAL;
}
/* the bios are sized for this many pages, max: */
if (new_size > JOURNAL_ENTRY_SIZE_MAX)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
new_size = roundup_pow_of_two(new_size);
n = kvpmalloc(new_size, GFP_KERNEL);
if (!n)
- return -ENOMEM;
+ return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
kvpfree(b->data, b->size);
b->data = n;
if (!ca->mi.durability ||
ca->mi.state != BCH_MEMBER_STATE_rw ||
!ja->nr ||
- bch2_bkey_has_device(bkey_i_to_s_c(&w->key),
- ca->dev_idx) ||
+ bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) ||
sectors > ja->sectors_free)
continue;
if (buf->buf_size >= new_size)
return;
- new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN);
+ new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
if (!new_buf)
return;
struct journal *j = container_of(cl, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *w = journal_last_unwritten_buf(j);
- struct bch_replicas_padded replicas;
union journal_res_state old, new;
u64 v, seq;
int err = 0;
if (!w->devs_written.nr) {
bch_err(c, "unable to write journal to sufficient devices");
err = -EIO;
- } else {
- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
- w->devs_written);
- if (bch2_mark_replicas(c, &replicas.e))
- err = -EIO;
}
-
if (err)
bch2_fatal_error(c);
bch2_do_discards(c);
closure_wake_up(&c->freelist_wait);
+
+ bch2_reset_alloc_cursors(c);
}
} else if (!j->err_seq || seq < j->err_seq)
j->err_seq = seq;
* Must come before signaling write completion, for
* bch2_fs_journal_stop():
*/
- if (j->watermark)
+ if (j->watermark != BCH_WATERMARK_stripe)
journal_reclaim_kick(&c->journal);
/* also must come before signalling write completion: */
return;
}
+static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset)
+{
+ struct jset_entry *i, *next, *prev = NULL;
+
+ /*
+ * Walk the entries of @jset and compact them in place.
+ *
+ * Simple compaction, dropping empty jset_entries (from journal
+ * reservations that weren't fully used) and merging jset_entries that
+ * can be. Along the way every non-empty btree_root entry is passed to
+ * bch2_journal_entry_to_btree_root().
+ *
+ * @prev points at the last entry that has been kept, at its final
+ * (compacted) position; it stays NULL until the first entry is kept.
+ * Once the walk is done, jset->u64s is recomputed as the distance, in
+ * u64s, from jset->_data to the end of the last kept entry (or to
+ * jset->start if nothing was kept).
+ *
+ * NOTE(review): entries are moved while iterating, so this relies on
+ * vstruct_for_each_safe() having picked up @next before the body runs
+ * - confirm against the macro definition.
+ *
+ * If we wanted to be really fancy here, we could sort all the keys in
+ * the jset and drop keys that were overwritten - probably not worth it:
+ */
+ vstruct_for_each_safe(jset, i, next) {
+ unsigned u64s = le16_to_cpu(i->u64s);
+
+ /*
+ * Empty entry: skip it without advancing prev, so it is not
+ * carried over into the compacted jset:
+ */
+ if (!u64s)
+ continue;
+
+ if (i->type == BCH_JSET_ENTRY_btree_root)
+ bch2_journal_entry_to_btree_root(c, i);
+
+ /*
+ * Can we merge with previous entry? Only btree_keys entries
+ * with matching btree_id and level are merged, and only if the
+ * combined length still fits in the 16 bit u64s field. If so,
+ * i's payload is appended right after prev's current end and
+ * prev->u64s grows by the same amount:
+ */
+ if (prev &&
+ i->btree_id == prev->btree_id &&
+ i->level == prev->level &&
+ i->type == prev->type &&
+ i->type == BCH_JSET_ENTRY_btree_keys &&
+ le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
+ memmove_u64s_down(vstruct_next(prev),
+ i->_data,
+ u64s);
+ le16_add_cpu(&prev->u64s, u64s);
+ continue;
+ }
+
+ /*
+ * Couldn't merge, move i into new position (after prev, or at
+ * jset->start if this is the first entry kept). The copy is
+ * skipped when i is already sitting in that position:
+ */
+ prev = prev ? vstruct_next(prev) : jset->start;
+ if (i != prev)
+ memmove_u64s_down(prev, i, jset_u64s(u64s));
+ }
+
+ prev = prev ? vstruct_next(prev) : jset->start;
+ jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
+}
+
void bch2_journal_write(struct closure *cl)
{
struct journal *j = container_of(cl, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_buf *w = journal_last_unwritten_buf(j);
+ struct bch_replicas_padded replicas;
struct jset_entry *start, *end;
struct jset *jset;
struct bio *bio;
* entry:
*/
- bch2_journal_entries_to_btree_roots(c, jset);
+ bch2_journal_entries_postprocess(c, jset);
start = end = vstruct_last(jset);
BUG_ON(u64s > j->entry_u64s_reserved);
le32_add_cpu(&jset->u64s, u64s);
- BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors);
+
+ sectors = vstruct_sectors(jset, c->block_bits);
+ bytes = vstruct_bytes(jset);
+
+ if (sectors > w->sectors) {
+ bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
+ vstruct_bytes(jset), w->sectors << 9,
+ u64s, w->u64s_reserved, j->entry_u64s_reserved);
+ goto err;
+ }
jset->magic = cpu_to_le64(jset_magic(c));
- jset->version = c->sb.version < bcachefs_metadata_version_bkey_renumber
- ? cpu_to_le32(BCH_JSET_VERSION_OLD)
- : cpu_to_le32(c->sb.version);
+ jset->version = cpu_to_le32(c->sb.version);
SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN);
SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
jset_validate(c, NULL, jset, 0, WRITE))
goto err;
- sectors = vstruct_sectors(jset, c->block_bits);
- BUG_ON(sectors > w->sectors);
-
- bytes = vstruct_bytes(jset);
memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
retry_alloc:
bch_err(c, "Unable to allocate journal write:\n%s",
journal_debug_buf.buf);
printbuf_exit(&journal_debug_buf);
- bch2_fatal_error(c);
- continue_at(cl, journal_write_done, c->io_complete_wq);
- return;
+ goto err;
}
w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
if (nr_rw_members > 1)
w->separate_flush = true;
+ /*
+ * Mark journal replicas before we submit the write to guarantee
+ * recovery will find the journal entries after a crash.
+ */
+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal,
+ w->devs_written);
+ ret = bch2_mark_replicas(c, &replicas.e);
+ if (ret)
+ goto err;
+
if (!JSET_NO_FLUSH(jset) && w->separate_flush) {
for_each_rw_member(ca, c, i) {
percpu_ref_get(&ca->io_ref);