-f1c9030ccbf6d7b5c46f08f92ee878bfc9f6ee6b
+be2d60d9484734b4c619ac0ddf54b3103210c9c0
memset(l, 0, sizeof(*l));
- l->magic = BCACHE_MAGIC;
+ l->magic = BCHFS_MAGIC;
l->layout_type = 0;
l->nr_superblocks = 2;
l->sb_max_size_bits = ilog2(sb_size);
sb.sb->version = le16_to_cpu(opts.version);
sb.sb->version_min = le16_to_cpu(opts.version);
- sb.sb->magic = BCACHE_MAGIC;
+ sb.sb->magic = BCHFS_MAGIC;
sb.sb->user_uuid = opts.uuid;
sb.sb->nr_devices = nr_devs;
xpread(fd, &sb, sizeof(sb), sector << 9);
- if (memcmp(&sb.magic, &BCACHE_MAGIC, sizeof(sb.magic)))
+ if (memcmp(&sb.magic, &BCACHE_MAGIC, sizeof(sb.magic)) &&
+ memcmp(&sb.magic, &BCHFS_MAGIC, sizeof(sb.magic)))
die("not a bcachefs superblock");
size_t bytes = vstruct_bytes(&sb);
* @version_min - Oldest metadata version this filesystem contains; so we can
* safely drop compatibility code and refuse to mount filesystems
* we'd need it for
- * @magic - identifies as a bcachefs superblock (BCACHE_MAGIC)
+ * @magic - identifies as a bcachefs superblock (BCHFS_MAGIC)
* @seq - incremented each time superblock is written
* @uuid - used for generating various magic numbers and identifying
* member devices, never changes
#define BCACHE_MAGIC \
UUID_LE(0xf67385c6, 0x1a4e, 0xca45, \
0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81)
+#define BCHFS_MAGIC \
+ UUID_LE(0xf67385c6, 0xce66, 0xa990, \
+ 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef)
#define BCACHEFS_STATFS_MAGIC 0xca451a4e
struct btree_path *path, struct bpos new_pos,
bool intent, unsigned long ip, int cmp)
{
- unsigned l = path->level;
+ unsigned level = path->level;
EBUG_ON(trans->restarted);
EBUG_ON(!path->ref);
goto out;
}
- l = btree_path_up_until_good_node(trans, path, cmp);
+ level = btree_path_up_until_good_node(trans, path, cmp);
- if (btree_path_node(path, l)) {
- BUG_ON(!btree_node_locked(path, l));
+ if (btree_path_node(path, level)) {
+ struct btree_path_level *l = &path->l[level];
+
+ BUG_ON(!btree_node_locked(path, level));
/*
* We might have to skip over many keys, or just a few: try
* advancing the node iterator, and if we have to skip over too
* is expensive).
*/
if (cmp < 0 ||
- !btree_path_advance_to_pos(path, &path->l[l], 8))
- __btree_path_level_init(path, l);
+ !btree_path_advance_to_pos(path, l, 8))
+ bch2_btree_node_iter_init(&l->iter, l->b, &path->pos);
+
+ /*
+ * Iterators to interior nodes should always be pointed at the first
+ * non-whiteout:
+ */
+ if (unlikely(level))
+ bch2_btree_node_iter_peek(&l->iter, l->b);
}
- if (unlikely(l != path->level)) {
+ if (unlikely(level != path->level)) {
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
__bch2_btree_path_unlock(trans, path);
}
struct journal_replay *r, **_r;
struct genradix_iter iter;
struct journal_read_buf buf = { NULL, 0 };
- u64 min_seq = U64_MAX;
unsigned i;
int ret = 0;
goto err;
}
- /* Find the journal bucket with the highest sequence number: */
- for (i = 0; i < ja->nr; i++) {
- if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx])
- ja->cur_idx = i;
-
- min_seq = min(ja->bucket_seq[i], min_seq);
- }
-
- /*
- * If there's duplicate journal entries in multiple buckets (which
- * definitely isn't supposed to happen, but...) - make sure to start
- * cur_idx at the last of those buckets, so we don't deadlock trying to
- * allocate
- */
- while (ja->bucket_seq[ja->cur_idx] > min_seq &&
- ja->bucket_seq[ja->cur_idx] ==
- ja->bucket_seq[(ja->cur_idx + 1) % ja->nr])
- ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
-
ja->sectors_free = ca->mi.bucket_size;
mutex_lock(&jlist->lock);
- genradix_for_each(&c->journal_entries, iter, _r) {
+ genradix_for_each_reverse(&c->journal_entries, iter, _r) {
r = *_r;
if (!r)
continue;
for (i = 0; i < r->nr_ptrs; i++) {
- if (r->ptrs[i].dev == ca->dev_idx &&
- sector_to_bucket(ca, r->ptrs[i].sector) == ja->buckets[ja->cur_idx]) {
+ if (r->ptrs[i].dev == ca->dev_idx) {
unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
vstruct_sectors(&r->j, c->block_bits);
- ja->sectors_free = min(ja->sectors_free,
- ca->mi.bucket_size - wrote);
+ ja->cur_idx = r->ptrs[i].bucket;
+ ja->sectors_free = ca->mi.bucket_size - wrote;
+ goto found;
}
}
}
+found:
mutex_unlock(&jlist->lock);
if (ja->bucket_seq[ja->cur_idx] &&
j->write_start_time = local_clock();
spin_lock(&j->lock);
- if (bch2_journal_error(j) ||
- w->noflush ||
- (!w->must_flush &&
- (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
- test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
+
+ /*
+ * If the journal is in an error state - we did an emergency shutdown -
+ * we prefer to continue doing journal writes. We just mark them as
+ * noflush so they'll never be used, but they'll still be visible to the
+ * list_journal tool - this helps in debugging.
+ *
+ * There's a caveat: the first journal write after marking the
+ * superblock dirty must always be a flush write, because on startup
+ * from a clean shutdown we didn't necessarily read the journal and the
+ * new journal write might overwrite whatever was in the journal
+ * previously - we can't leave the journal without any flush writes in
+ * it.
+ *
+ * So if we're in an error state, and we're still starting up, we don't
+ * write anything at all.
+ */
+ if (!test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags) &&
+ (bch2_journal_error(j) ||
+ w->noflush ||
+ (!w->must_flush &&
+ (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
+ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) {
w->noflush = true;
SET_JSET_NO_FLUSH(jset, true);
jset->last_seq = 0;
w->last_seq = 0;
j->nr_noflush_writes++;
- } else {
+ } else if (!bch2_journal_error(j)) {
j->last_flush_write = jiffies;
j->nr_flush_writes++;
+ clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
+ } else {
+ spin_unlock(&j->lock);
+ goto err;
}
spin_unlock(&j->lock);
journal_space_nr,
};
-enum {
+enum journal_flags {
JOURNAL_REPLAY_DONE,
JOURNAL_STARTED,
JOURNAL_MAY_SKIP_FLUSH,
+ JOURNAL_NEED_FLUSH_WRITE,
};
#define JOURNAL_WATERMARKS() \
u64 offset, prev_offset, max_sectors;
unsigned i;
- if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) {
+ if (uuid_le_cmp(layout->magic, BCACHE_MAGIC) &&
+ uuid_le_cmp(layout->magic, BCHFS_MAGIC)) {
prt_printf(out, "Not a bcachefs superblock layout");
return -BCH_ERR_invalid_sb_layout;
}
return ret;
}
- if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) {
+ if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC) &&
+ uuid_le_cmp(sb->sb->magic, BCHFS_MAGIC)) {
prt_printf(err, "Not a bcachefs superblock");
return -BCH_ERR_invalid_sb_magic;
}
clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
+ /*
+ * First journal write must be a flush write: after a clean shutdown we
+ * don't read the journal, so the first journal write may end up
+ * overwriting whatever was there previously, and there must always be
+ * at least one flush write in the journal or recovery will fail:
+ */
+ set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags);
+
for_each_rw_member(ca, c, i)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);