#include "error.h"
#include "extents.h"
#include "io.h"
-#include "journal.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
#include "super-io.h"
#include <trace/events/bcachefs.h>
/* btree_node_iter_large: */
#define btree_node_iter_cmp_heap(h, _l, _r) \
- __btree_node_iter_cmp((iter)->is_extents, b, \
+ __btree_node_iter_cmp(b, \
__btree_node_offset_to_key(b, (_l).k), \
__btree_node_offset_to_key(b, (_r).k))
sort_iter_sort(iter, sort_extent_whiteouts_cmp);
while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) {
+ if (bkey_deleted(in))
+ continue;
+
EBUG_ON(bkeyp_val_u64s(f, in));
EBUG_ON(in->type != KEY_TYPE_DISCARD);
if (mode == COMPACT_LAZY) {
if (should_compact_bset_lazy(b, t) ||
- (compacting && bset_unwritten(b, bset(b, t))))
+ (compacting && !bset_written(b, bset(b, t))))
return dead_u64s;
} else {
if (bset_written(b, bset(b, t)))
struct bkey_packed *k, *n, *out, *start, *end;
struct btree_node_entry *src = NULL, *dst = NULL;
- if (t != b->set && bset_unwritten(b, i)) {
+ if (t != b->set && !bset_written(b, i)) {
src = container_of(i, struct btree_node_entry, keys);
dst = max(write_block(b),
(void *) btree_bkey_last(b, t -1));
continue;
if (bkey_whiteout(k)) {
- unreserve_whiteout(b, t, k);
+ unreserve_whiteout(b, k);
memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k));
set_bkeyp_val_u64s(f, u_pos, 0);
u_pos = bkey_next(u_pos);
start = btree_bkey_first(b, t);
end = btree_bkey_last(b, t);
- if (bset_unwritten(b, i) &&
+ if (!bset_written(b, i) &&
t != b->set) {
struct bset *dst =
max_t(struct bset *, write_block(b),
BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order));
if (sorting_entire_node)
- bch2_time_stats_update(&c->btree_sort_time, start_time);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_sort],
+ start_time);
/* Make sure we preserve bset journal_seq: */
for (t = b->set + start_idx; t < b->set + end_idx; t++)
bch2_bset_set_no_aux_tree(dst, dst->set);
- bch2_btree_node_iter_init_from_start(&src_iter, src,
- btree_node_is_extents(src));
+ bch2_btree_node_iter_init_from_start(&src_iter, src);
if (btree_node_ops(src)->key_normalize ||
btree_node_ops(src)->key_merge)
&dst->format,
true);
- bch2_time_stats_update(&c->btree_sort_time, start_time);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_sort], start_time);
set_btree_bset_end(dst, dst->set);
for (unwritten_idx = 0;
unwritten_idx < b->nsets;
unwritten_idx++)
- if (bset_unwritten(b, bset(b, &b->set[unwritten_idx])))
+ if (!bset_written(b, bset(b, &b->set[unwritten_idx])))
break;
if (b->nsets - unwritten_idx > 1) {
for_each_bset(b, t)
bch2_bset_build_aux_tree(b, t,
- bset_unwritten(b, bset(b, t)) &&
+ !bset_written(b, bset(b, t)) &&
t == bset_tree_last(b));
}
bne = want_new_bset(c, b);
if (bne)
- bch2_bset_init_next(b, &bne->keys);
+ bch2_bset_init_next(c, b, bne);
bch2_btree_build_aux_trees(b);
char *out = buf, *end = buf + len;
out += scnprintf(out, end - out,
- "error validating btree node %s "
+ "error validating btree node %s"
"at btree %u level %u/%u\n"
"pos %llu:%llu node offset %u",
write ? "before write " : "",
#define btree_err(type, c, b, i, msg, ...) \
({ \
+ __label__ out; \
char _buf[300], *out = _buf, *end = out + sizeof(_buf); \
\
out += btree_err_msg(c, b, i, b->written, write, out, end - out);\
write == READ && \
!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \
mustfix_fsck_err(c, "%s", _buf); \
- } else { \
+ goto out; \
+ } \
+ \
+ switch (write) { \
+ case READ: \
bch_err(c, "%s", _buf); \
\
switch (type) { \
ret = BCH_FSCK_ERRORS_NOT_FIXED; \
goto fsck_err; \
} \
+ break; \
+ case WRITE: \
+ bch_err(c, "corrupt metadata before write: %s", _buf); \
+ \
+ if (bch2_fs_inconsistent(c)) { \
+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \
+ goto fsck_err; \
+ } \
+ break; \
} \
+out: \
true; \
})
bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
btree_err(BTREE_ERR_FIXABLE, c, b, i,
- "invalid bkey:\n%s\n%s", buf, invalid);
+ "invalid bkey:\n%s\n%s", invalid, buf);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_next(k),
int ret, retry_read = 0, write = READ;
iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
- __bch2_btree_node_iter_large_init(iter, btree_node_is_extents(b));
+ iter->used = 0;
if (bch2_meta_read_fault("btree"))
btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL,
struct bkey_s_c u = bkey_disassemble(b, k, &tmp);
const char *invalid = bch2_bkey_val_invalid(c, type, u);
- if (invalid) {
+ if (invalid ||
+ (inject_invalid_keys(c) &&
+ !bversion_cmp(u.k->version, MAX_VERSION))) {
char buf[160];
bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_next(k),
(u64 *) vstruct_end(i) - (u64 *) k);
+ set_btree_bset_end(b, b->set);
continue;
}
struct btree_read_bio *rb =
container_of(work, struct btree_read_bio, work);
struct bch_fs *c = rb->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
struct btree *b = rb->bio.bi_private;
struct bio *bio = &rb->bio;
struct bch_devs_mask avoid;
+ bool can_retry;
memset(&avoid, 0, sizeof(avoid));
goto start;
- do {
+ while (1) {
bch_info(c, "retrying read");
+ ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+ rb->have_ioref = bch2_dev_get_ioref(ca, READ);
bio_reset(bio);
- bio_set_dev(bio, rb->pick.ca->disk_sb.bdev);
bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
- submit_bio_wait(bio);
+
+ if (rb->have_ioref) {
+ bio_set_dev(bio, ca->disk_sb.bdev);
+ submit_bio_wait(bio);
+ } else {
+ bio->bi_status = BLK_STS_REMOVED;
+ }
start:
- bch2_dev_io_err_on(bio->bi_status, rb->pick.ca, "btree read");
- percpu_ref_put(&rb->pick.ca->io_ref);
+ bch2_dev_io_err_on(bio->bi_status, ca, "btree read");
+ if (rb->have_ioref)
+ percpu_ref_put(&ca->io_ref);
+ rb->have_ioref = false;
- __set_bit(rb->pick.ca->dev_idx, avoid.d);
- rb->pick = bch2_btree_pick_ptr(c, b, &avoid);
+ __set_bit(rb->pick.ptr.dev, avoid.d);
+ can_retry = bch2_btree_pick_ptr(c, b, &avoid, &rb->pick) > 0;
if (!bio->bi_status &&
- !bch2_btree_node_read_done(c, b, !IS_ERR_OR_NULL(rb->pick.ca)))
- goto out;
- } while (!IS_ERR_OR_NULL(rb->pick.ca));
+ !bch2_btree_node_read_done(c, b, can_retry))
+ break;
- set_btree_node_read_error(b);
-out:
- if (!IS_ERR_OR_NULL(rb->pick.ca))
- percpu_ref_put(&rb->pick.ca->io_ref);
+ if (!can_retry) {
+ set_btree_node_read_error(b);
+ break;
+ }
+ }
- bch2_time_stats_update(&c->btree_read_time, rb->start_time);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_read], rb->start_time);
bio_put(&rb->bio);
clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
{
struct btree_read_bio *rb =
container_of(bio, struct btree_read_bio, bio);
+ struct bch_fs *c = rb->c;
- bch2_latency_acct(rb->pick.ca, rb->start_time >> 10, READ);
+ if (rb->have_ioref) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+ bch2_latency_acct(ca, rb->start_time, READ);
+ }
- INIT_WORK(&rb->work, btree_node_read_work);
queue_work(system_unbound_wq, &rb->work);
}
{
struct extent_pick_ptr pick;
struct btree_read_bio *rb;
+ struct bch_dev *ca;
struct bio *bio;
+ int ret;
trace_btree_read(c, b);
- pick = bch2_btree_pick_ptr(c, b, NULL);
- if (bch2_fs_fatal_err_on(!pick.ca, c,
+ ret = bch2_btree_pick_ptr(c, b, NULL, &pick);
+ if (bch2_fs_fatal_err_on(ret <= 0, c,
"btree node read error: no device to read from")) {
set_btree_node_read_error(b);
return;
}
+ ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+
bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio);
rb = container_of(bio, struct btree_read_bio, bio);
rb->c = c;
rb->start_time = local_clock();
+ rb->have_ioref = bch2_dev_get_ioref(ca, READ);
rb->pick = pick;
- bio_set_dev(bio, pick.ca->disk_sb.bdev);
+ INIT_WORK(&rb->work, btree_node_read_work);
bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
+ bio->bi_end_io = btree_node_read_endio;
+ bio->bi_private = b;
bch2_bio_map(bio, b->data);
- this_cpu_add(pick.ca->io_done->sectors[READ][BCH_DATA_BTREE],
- bio_sectors(bio));
-
set_btree_node_read_in_flight(b);
- if (sync) {
- submit_bio_wait(bio);
- bio->bi_private = b;
- btree_node_read_work(&rb->work);
+ if (rb->have_ioref) {
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_BTREE],
+ bio_sectors(bio));
+ bio_set_dev(bio, ca->disk_sb.bdev);
+
+ if (sync) {
+ submit_bio_wait(bio);
+
+ bio->bi_private = b;
+ btree_node_read_work(&rb->work);
+ } else {
+ submit_bio(bio);
+ }
} else {
- bio->bi_end_io = btree_node_read_endio;
- bio->bi_private = b;
- submit_bio(bio);
+ bio->bi_status = BLK_STS_REMOVED;
+
+ if (sync)
+ btree_node_read_work(&rb->work);
+ else
+ queue_work(system_unbound_wq, &rb->work);
+
}
}
__bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
BTREE_MAX_DEPTH,
- b->level, 0);
+ b->level, BTREE_ITER_NODES);
retry:
ret = bch2_btree_iter_traverse(&iter);
if (ret)
struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
struct bch_write_bio *orig = parent ?: wbio;
struct bch_fs *c = wbio->c;
- struct bch_dev *ca = wbio->ca;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
unsigned long flags;
- bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
+ if (wbio->have_ioref)
+ bch2_latency_acct(ca, wbio->submit_time, WRITE);
if (bio->bi_status == BLK_STS_REMOVED ||
bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
bch2_meta_write_fault("btree")) {
spin_lock_irqsave(&c->btree_write_error_lock, flags);
- bch2_dev_list_add_dev(&orig->failed, ca->dev_idx);
+ bch2_dev_list_add_dev(&orig->failed, wbio->dev);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
}
- if (wbio->have_io_ref)
+ if (wbio->have_ioref)
percpu_ref_put(&ca->io_ref);
if (parent) {
BUG_ON((b->will_make_reachable != 0) != !b->written);
BUG_ON(b->written >= c->opts.btree_node_size);
+ BUG_ON(b->written & (c->opts.block_size - 1));
BUG_ON(bset_written(b, btree_bset_last(b)));
BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
clear_btree_node_just_written(b);
/*
- * Note: immediately after write, bset_unwritten()/bset_written() don't
- * work - the amount of data we had to write after compaction might have
- * been smaller than the offset of the last bset.
+ * Note: immediately after write, bset_written() doesn't work - the
+ * amount of data we had to write after compaction might have been
+ * smaller than the offset of the last bset.
*
* However, we know that all bsets have been written here, as long as
* we're still holding the write lock:
bne = want_new_bset(c, b);
if (bne)
- bch2_bset_init_next(b, &bne->keys);
+ bch2_bset_init_next(c, b, bne);
bch2_btree_build_aux_trees(b);