+// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
-#include "alloc_background.h"
#include "alloc_foreground.h"
-#include "btree_gc.h"
-#include "btree_update.h"
+#include "btree_io.h"
+#include "btree_update_interior.h"
#include "buckets.h"
#include "checksum.h"
#include "error.h"
#include "journal.h"
#include "journal_io.h"
#include "journal_reclaim.h"
-#include "journal_seq_blacklist.h"
#include "replicas.h"
#include <trace/events/bcachefs.h>
list)->j.last_seq
: 0;
- /* Is this entry older than the range we need? */
- if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
- ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
- goto out;
- }
+ if (!c->opts.read_entire_journal) {
+ /* Is this entry older than the range we need? */
+ if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) {
+ ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE;
+ goto out;
+ }
- /* Drop entries we don't need anymore */
- list_for_each_entry_safe(i, pos, jlist->head, list) {
- if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
- break;
- list_del(&i->list);
- kvpfree(i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&i->j));
+ /* Drop entries we don't need anymore */
+ list_for_each_entry_safe(i, pos, jlist->head, list) {
+ if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq))
+ break;
+ list_del(&i->list);
+ kvpfree(i, offsetof(struct journal_replay, j) +
+ vstruct_bytes(&i->j));
+ }
}
list_for_each_entry_reverse(i, jlist->head, list) {
static int journal_validate_key(struct bch_fs *c, struct jset *jset,
struct jset_entry *entry,
- struct bkey_i *k, enum btree_node_type key_type,
+ unsigned level, enum btree_id btree_id,
+ struct bkey_i *k,
const char *type, int write)
{
void *next = vstruct_next(entry);
return 0;
}
- if (JSET_BIG_ENDIAN(jset) != CPU_BIG_ENDIAN)
- bch2_bkey_swab(NULL, bkey_to_packed(k));
-
- if (!write &&
- version < bcachefs_metadata_version_bkey_renumber)
- bch2_bkey_renumber(key_type, bkey_to_packed(k), write);
+ if (!write)
+ bch2_bkey_compat(level, btree_id, version,
+ JSET_BIG_ENDIAN(jset), write,
+ NULL, bkey_to_packed(k));
- invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), key_type);
+ invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k),
+ __btree_node_type(level, btree_id));
if (invalid) {
char buf[160];
return 0;
}
- if (write &&
- version < bcachefs_metadata_version_bkey_renumber)
- bch2_bkey_renumber(key_type, bkey_to_packed(k), write);
+ if (write)
+ bch2_bkey_compat(level, btree_id, version,
+ JSET_BIG_ENDIAN(jset), write,
+ NULL, bkey_to_packed(k));
fsck_err:
return ret;
}
struct bkey_i *k;
vstruct_for_each(entry, k) {
- int ret = journal_validate_key(c, jset, entry, k,
- __btree_node_type(entry->level,
- entry->btree_id),
- "key", write);
+ int ret = journal_validate_key(c, jset, entry,
+ entry->level,
+ entry->btree_id,
+ k, "key", write);
if (ret)
return ret;
}
return 0;
}
- return journal_validate_key(c, jset, entry, k, BKEY_TYPE_BTREE,
+ return journal_validate_key(c, jset, entry, 1, entry->btree_id, k,
"btree root", write);
fsck_err:
return ret;
sectors_read << 9));
bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_iter.bi_sector = offset;
- bio->bi_iter.bi_size = sectors_read << 9;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
- bch2_bio_map(bio, buf->data);
+ bch2_bio_map(bio, buf->data, sectors_read << 9);
ret = submit_bio_wait(bio);
bio_put(bio);
ja->sectors_free = 0;
/*
- * Set last_idx to indicate the entire journal is full and needs to be
+ * Set dirty_idx (along with discard_idx and dirty_idx_ondisk) to
+ * indicate the entire journal is full and needs to be
* reclaimed - journal reclaim will immediately reclaim whatever isn't
* pinned when it first runs:
*/
- ja->last_idx = (ja->cur_idx + 1) % ja->nr;
+ ja->discard_idx = ja->dirty_idx_ondisk =
+ ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
out:
kvpfree(buf.data, buf.size);
percpu_ref_put(&ca->io_ref);
goto out;
}
-void bch2_journal_entries_free(struct list_head *list)
-{
-
- while (!list_empty(list)) {
- struct journal_replay *i =
- list_first_entry(list, struct journal_replay, list);
- list_del(&i->list);
- kvpfree(i, offsetof(struct journal_replay, j) +
- vstruct_bytes(&i->j));
- }
-}
-
-int bch2_journal_set_seq(struct bch_fs *c, u64 last_seq, u64 end_seq)
-{
- struct journal *j = &c->journal;
- struct journal_entry_pin_list *p;
- u64 seq, nr = end_seq - last_seq + 1;
-
- if (nr > j->pin.size) {
- free_fifo(&j->pin);
- init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL);
- if (!j->pin.data) {
- bch_err(c, "error reallocating journal fifo (%llu open entries)", nr);
- return -ENOMEM;
- }
- }
-
- atomic64_set(&j->seq, end_seq);
- j->last_seq_ondisk = last_seq;
-
- j->pin.front = last_seq;
- j->pin.back = end_seq + 1;
-
- fifo_for_each_entry_ptr(p, &j->pin, seq) {
- INIT_LIST_HEAD(&p->list);
- INIT_LIST_HEAD(&p->flushed);
- atomic_set(&p->count, 0);
- p->devs.nr = 0;
- }
-
- return 0;
-}
-
int bch2_journal_read(struct bch_fs *c, struct list_head *list)
{
- struct journal *j = &c->journal;
struct journal_list jlist;
struct journal_replay *i;
- struct journal_entry_pin_list *p;
struct bch_dev *ca;
- u64 cur_seq, end_seq;
unsigned iter;
size_t keys = 0, entries = 0;
bool degraded = false;
if (jlist.ret)
return jlist.ret;
- if (list_empty(list)){
- bch_err(c, "no journal entries found");
- return BCH_FSCK_REPAIR_IMPOSSIBLE;
- }
-
list_for_each_entry(i, list, list) {
+ struct jset_entry *entry;
+ struct bkey_i *k, *_n;
struct bch_replicas_padded replicas;
char buf[80];
- bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs);
-
ret = jset_validate_entries(c, &i->j, READ);
if (ret)
goto fsck_err;
* the devices - this is wrong:
*/
+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs);
+
if (!degraded &&
(test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c,
if (ret)
return ret;
}
- }
-
- i = list_last_entry(list, struct journal_replay, list);
-
- ret = bch2_journal_set_seq(c,
- le64_to_cpu(i->j.last_seq),
- le64_to_cpu(i->j.seq));
- if (ret)
- return ret;
-
- mutex_lock(&j->blacklist_lock);
-
- list_for_each_entry(i, list, list) {
- p = journal_seq_pin(j, le64_to_cpu(i->j.seq));
-
- atomic_set(&p->count, 1);
- p->devs = i->devs;
-
- if (bch2_journal_seq_blacklist_read(j, i)) {
- mutex_unlock(&j->blacklist_lock);
- return -ENOMEM;
- }
- }
-
- mutex_unlock(&j->blacklist_lock);
-
- cur_seq = journal_last_seq(j);
- end_seq = le64_to_cpu(list_last_entry(list,
- struct journal_replay, list)->j.seq);
-
- list_for_each_entry(i, list, list) {
- struct jset_entry *entry;
- struct bkey_i *k, *_n;
- bool blacklisted;
-
- mutex_lock(&j->blacklist_lock);
- while (cur_seq < le64_to_cpu(i->j.seq) &&
- bch2_journal_seq_blacklist_find(j, cur_seq))
- cur_seq++;
-
- blacklisted = bch2_journal_seq_blacklist_find(j,
- le64_to_cpu(i->j.seq));
- mutex_unlock(&j->blacklist_lock);
-
- fsck_err_on(blacklisted, c,
- "found blacklisted journal entry %llu",
- le64_to_cpu(i->j.seq));
-
- fsck_err_on(le64_to_cpu(i->j.seq) != cur_seq, c,
- "journal entries %llu-%llu missing! (replaying %llu-%llu)",
- cur_seq, le64_to_cpu(i->j.seq) - 1,
- journal_last_seq(j), end_seq);
-
- cur_seq = le64_to_cpu(i->j.seq) + 1;
for_each_jset_key(k, _n, entry, &i->j)
keys++;
entries++;
}
- bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
- keys, entries, journal_cur_seq(j));
-fsck_err:
- return ret;
-}
-
-/* journal replay: */
-
-int bch2_journal_replay(struct bch_fs *c, struct list_head *list)
-{
- struct journal *j = &c->journal;
- struct bkey_i *k, *_n;
- struct jset_entry *entry;
- struct journal_replay *i, *n;
- int ret = 0;
-
- list_for_each_entry_safe(i, n, list, list) {
- j->replay_journal_seq = le64_to_cpu(i->j.seq);
-
- for_each_jset_key(k, _n, entry, &i->j) {
-
- if (entry->btree_id == BTREE_ID_ALLOC) {
- /*
- * allocation code handles replay for
- * BTREE_ID_ALLOC keys:
- */
- ret = bch2_alloc_replay_key(c, k);
- } else {
- /*
- * We might cause compressed extents to be
- * split, so we need to pass in a
- * disk_reservation:
- */
- struct disk_reservation disk_res =
- bch2_disk_reservation_init(c, 0);
-
- ret = bch2_btree_insert(c, entry->btree_id, k,
- &disk_res, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_REPLAY|
- BTREE_INSERT_NOMARK);
- }
-
- if (ret) {
- bch_err(c, "journal replay: error %d while replaying key",
- ret);
- goto err;
- }
-
- cond_resched();
- }
+ if (!list_empty(list)) {
+ i = list_last_entry(list, struct journal_replay, list);
- bch2_journal_pin_put(j, j->replay_journal_seq);
+ bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu",
+ keys, entries, le64_to_cpu(i->j.seq));
}
-
- j->replay_journal_seq = 0;
-
- bch2_journal_set_replay_done(j);
- bch2_journal_flush_all_pins(j);
- ret = bch2_journal_error(j);
-err:
- bch2_journal_entries_free(list);
+fsck_err:
return ret;
}
if (sectors > ja->sectors_free &&
sectors <= ca->mi.bucket_size &&
- bch2_journal_dev_buckets_available(j, ja)) {
+ bch2_journal_dev_buckets_available(j, ja,
+ journal_space_discarded)) {
ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
ja->sectors_free = ca->mi.bucket_size;
+
+ /*
+ * ja->bucket_seq[ja->cur_idx] must always have
+ * something sensible:
+ */
+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
}
}
goto err;
spin_lock(&j->lock);
- j->seq_ondisk = seq;
- j->last_seq_ondisk = last_seq;
-
if (seq >= j->pin.front)
journal_seq_pin(j, seq)->devs = devs;
+ j->seq_ondisk = seq;
+ j->last_seq_ondisk = last_seq;
+ bch2_journal_space_available(j);
+
/*
* Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
* more buckets:
return;
err:
bch2_fatal_error(c);
- bch2_journal_halt(j);
spin_lock(&j->lock);
goto out;
}
j->write_start_time = local_clock();
- start = vstruct_last(jset);
- end = bch2_journal_super_entries_add_common(c, start);
+ /*
+ * New btree roots are set by journalling them; when the journal entry
+ * gets written we have to propagate them to c->btree_roots
+ *
+ * But, every journal entry we write has to contain all the btree roots
+ * (at least for now); so after we copy btree roots to c->btree_roots we
+ * have to get any missing btree roots and add them to this journal
+ * entry:
+ */
+
+ bch2_journal_entries_to_btree_roots(c, jset);
+
+ start = end = vstruct_last(jset);
+
+ end = bch2_btree_roots_to_journal_entries(c, jset->start, end);
+
+ end = bch2_journal_super_entries_add_common(c, end,
+ le64_to_cpu(jset->seq));
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
validate_before_checksum = true;
- if (le32_to_cpu(jset->version) <
- bcachefs_metadata_version_bkey_renumber)
+ if (le32_to_cpu(jset->version) < bcachefs_metadata_version_max)
validate_before_checksum = true;
if (validate_before_checksum &&
bytes = vstruct_bytes(jset);
memset((void *) jset + bytes, 0, (sectors << 9) - bytes);
+retry_alloc:
spin_lock(&j->lock);
ret = journal_write_alloc(j, w, sectors);
+ if (ret && j->can_discard) {
+ spin_unlock(&j->lock);
+ bch2_journal_do_discards(j);
+ goto retry_alloc;
+ }
+
/*
* write is allocated, no longer need to account for it in
* bch2_journal_space_available():
spin_unlock(&j->lock);
if (ret) {
- bch2_journal_halt(j);
bch_err(c, "Unable to allocate journal write");
bch2_fatal_error(c);
continue_at(cl, journal_write_done, system_highpri_wq);
bio_reset(bio);
bio_set_dev(bio, ca->disk_sb.bdev);
bio->bi_iter.bi_sector = ptr->offset;
- bio->bi_iter.bi_size = sectors << 9;
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
bio_set_op_attrs(bio, REQ_OP_WRITE,
REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA);
- bch2_bio_map(bio, jset);
+ bch2_bio_map(bio, jset, sectors << 9);
trace_journal_write(bio);
closure_bio_submit(bio, cl);
for_each_rw_member(ca, c, i)
if (journal_flushes_device(ca) &&
- !bch2_extent_has_device(bkey_i_to_s_c_extent(&w->key), i)) {
+ !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) {
percpu_ref_get(&ca->io_ref);
bio = ca->journal.bio;