struct journal_replay *r, **_r;
struct genradix_iter iter;
struct journal_read_buf buf = { NULL, 0 };
- u64 min_seq = U64_MAX;
unsigned i;
int ret = 0;
goto err;
}
- /* Find the journal bucket with the highest sequence number: */
- for (i = 0; i < ja->nr; i++) {
- if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx])
- ja->cur_idx = i;
-
- min_seq = min(ja->bucket_seq[i], min_seq);
- }
-
- /*
- * If there's duplicate journal entries in multiple buckets (which
- * definitely isn't supposed to happen, but...) - make sure to start
- * cur_idx at the last of those buckets, so we don't deadlock trying to
- * allocate
- */
- while (ja->bucket_seq[ja->cur_idx] > min_seq &&
- ja->bucket_seq[ja->cur_idx] ==
- ja->bucket_seq[(ja->cur_idx + 1) % ja->nr])
- ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
-
ja->sectors_free = ca->mi.bucket_size;
mutex_lock(&jlist->lock);
- genradix_for_each(&c->journal_entries, iter, _r) {
+ genradix_for_each_reverse(&c->journal_entries, iter, _r) {
r = *_r;
if (!r)
continue;
for (i = 0; i < r->nr_ptrs; i++) {
- if (r->ptrs[i].dev == ca->dev_idx &&
- sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) {
+ if (r->ptrs[i].dev == ca->dev_idx) {
unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
vstruct_sectors(&r->j, c->block_bits);
- ja->sectors_free = min(ja->sectors_free,
- ca->mi.bucket_size - wrote);
+ ja->cur_idx = r->ptrs[i].bucket;
+ ja->sectors_free = ca->mi.bucket_size - wrote;
+ goto found;
}
}
}
+found:
mutex_unlock(&jlist->lock);
if (ja->bucket_seq[ja->cur_idx] &&
}
}
-int bch2_journal_read(struct bch_fs *c, u64 *blacklist_seq, u64 *start_seq)
+int bch2_journal_read(struct bch_fs *c,
+ u64 *last_seq,
+ u64 *blacklist_seq,
+ u64 *start_seq)
{
struct journal_list jlist;
struct journal_replay *i, **_i, *prev = NULL;
struct bch_dev *ca;
unsigned iter;
struct printbuf buf = PRINTBUF;
- size_t keys = 0, entries = 0;
- bool degraded = false;
- u64 seq, last_seq = 0;
+ bool degraded = false, last_write_torn = false;
+ u64 seq;
int ret = 0;
closure_init_stack(&jlist.cl);
if (jlist.ret)
return jlist.ret;
- *start_seq = 0;
+ *last_seq = 0;
+ *start_seq = 0;
+ *blacklist_seq = 0;
/*
* Find most recent flush entry, and ignore newer non flush entries -
* those entries will be blacklisted:
*/
genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) {
+ int write = READ;
+
i = *_i;
if (!i || i->ignore)
continue;
if (!*start_seq)
- *start_seq = le64_to_cpu(i->j.seq) + 1;
-
- if (!JSET_NO_FLUSH(&i->j)) {
- int write = READ;
- if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
- c, &i->j, NULL,
- "invalid journal entry: last_seq > seq (%llu > %llu)",
- le64_to_cpu(i->j.last_seq),
- le64_to_cpu(i->j.seq)))
- i->j.last_seq = i->j.seq;
-
- pr_info("last flush %llu-%llu csum good %u",
- le64_to_cpu(i->j.last_seq),
- le64_to_cpu(i->j.seq),
- i->csum_good);
-
- last_seq = le64_to_cpu(i->j.last_seq);
- *blacklist_seq = le64_to_cpu(i->j.seq) + 1;
- break;
+ *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;
+
+ if (JSET_NO_FLUSH(&i->j)) {
+ i->ignore = true;
+ continue;
+ }
+
+ if (!last_write_torn && !i->csum_good) {
+ last_write_torn = true;
+ i->ignore = true;
+ continue;
}
- journal_replay_free(c, i);
+ if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq),
+ c, &i->j, NULL,
+ "invalid journal entry: last_seq > seq (%llu > %llu)",
+ le64_to_cpu(i->j.last_seq),
+ le64_to_cpu(i->j.seq)))
+ i->j.last_seq = i->j.seq;
+
+ *last_seq = le64_to_cpu(i->j.last_seq);
+ *blacklist_seq = le64_to_cpu(i->j.seq) + 1;
+ break;
}
if (!*start_seq) {
return 0;
}
- if (!last_seq) {
+ if (!*last_seq) {
fsck_err(c, "journal read done, but no entries found after dropping non-flushes");
- ret = -1;
- goto err;
+ return 0;
}
+ bch_info(c, "journal read done, replaying entries %llu-%llu",
+ *last_seq, *blacklist_seq - 1);
+
+ if (*start_seq != *blacklist_seq)
+ bch_info(c, "dropped unflushed entries %llu-%llu",
+ *blacklist_seq, *start_seq - 1);
+
/* Drop blacklisted entries and entries older than last_seq: */
genradix_for_each(&c->journal_entries, radix_iter, _i) {
i = *_i;
continue;
seq = le64_to_cpu(i->j.seq);
- if (seq < last_seq) {
+ if (seq < *last_seq) {
journal_replay_free(c, i);
continue;
}
if (bch2_journal_seq_is_blacklisted(c, seq, true)) {
fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
"found blacklisted journal entry %llu", seq);
-
- journal_replay_free(c, i);
+ i->ignore = true;
}
}
/* Check for missing entries: */
- seq = last_seq;
+ seq = *last_seq;
genradix_for_each(&c->journal_entries, radix_iter, _i) {
i = *_i;
" prev at %s\n"
" next at %s",
missing_start, missing_end,
- last_seq, *blacklist_seq - 1,
+ *last_seq, *blacklist_seq - 1,
buf1.buf, buf2.buf);
printbuf_exit(&buf1);
}
genradix_for_each(&c->journal_entries, radix_iter, _i) {
- struct jset_entry *entry;
- struct bkey_i *k, *_n;
struct bch_replicas_padded replicas = {
.e.data_type = BCH_DATA_journal,
.e.nr_required = 1,
struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
if (!i->ptrs[ptr].csum_good)
- printk(KERN_ERR "bcachefs (%s) sector %llu: invalid journal checksum, seq %llu%s\n",
- ca->name, i->ptrs[ptr].sector,
- le64_to_cpu(i->j.seq),
- i->csum_good ? " (had good copy on another device)" : "");
+ bch_err_dev_offset(ca, i->ptrs[ptr].sector,
+ "invalid journal checksum, seq %llu%s",
+ le64_to_cpu(i->j.seq),
+ i->csum_good ? " (had good copy on another device)" : "");
}
ret = jset_validate(c,
if (ret)
goto err;
}
-
- for_each_jset_key(k, _n, entry, &i->j)
- keys++;
- entries++;
}
-
- bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
- keys, entries, *start_seq);
-
- if (*start_seq != *blacklist_seq)
- bch_info(c, "dropped unflushed entries %llu-%llu",
- *blacklist_seq, *start_seq - 1);
err:
fsck_err:
printbuf_exit(&buf);
j->write_start_time = local_clock();
spin_lock(&j->lock);
- if (bch2_journal_error(j) ||
- w->noflush ||
- (!w->must_flush &&
- (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
- test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
+
+ /*
+ * If the journal is in an error state - we did an emergency shutdown -
+ * we prefer to continue doing journal writes. We just mark them as
+ * noflush so they'll never be used, but they'll still be visible to the
+ * list_journal tool - this helps in debugging.
+ *
+ * There's a caveat: the first journal write after marking the
+ * superblock dirty must always be a flush write, because on startup
+ * from a clean shutdown we didn't necessarily read the journal and the
+ * new journal write might overwrite whatever was in the journal
+ * previously - we can't leave the journal without any flush writes in
+ * it.
+ *
+ * So if we're in an error state, and we're still starting up, we don't
+ * write anything at all.
+ */
+ if (!test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags) &&
+ (bch2_journal_error(j) ||
+ w->noflush ||
+ (!w->must_flush &&
+ (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
+ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) {
w->noflush = true;
SET_JSET_NO_FLUSH(jset, true);
jset->last_seq = 0;
w->last_seq = 0;
j->nr_noflush_writes++;
- } else {
+ } else if (!bch2_journal_error(j)) {
j->last_flush_write = jiffies;
j->nr_flush_writes++;
+ clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
+ } else {
+ spin_unlock(&j->lock);
+ goto err;
}
spin_unlock(&j->lock);