#include "error.h"
#include "extents.h"
#include "io.h"
-#include "journal.h"
+#include "journal_reclaim.h"
+#include "journal_seq_blacklist.h"
#include "super-io.h"
#include <trace/events/bcachefs.h>
+/* btree_node_iter_large: */
+
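+/*
+ * Note: the comparison macro ignores its @h argument and picks up @b from
+ * the caller's scope:
+ */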
+#define btree_node_iter_cmp_heap(h, _l, _r) \
+ __btree_node_iter_cmp(b, \
+ __btree_node_offset_to_key(b, (_l).k), \
+ __btree_node_offset_to_key(b, (_r).k))
+
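+/*
+ * Push the half-open range of packed keys [k, end) onto the iterator's
+ * heap; empty ranges are skipped:
+ */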
+void bch2_btree_node_iter_large_push(struct btree_node_iter_large *iter,
+ struct btree *b,
+ const struct bkey_packed *k,
+ const struct bkey_packed *end)
+{
+ if (k != end) {
+ struct btree_node_iter_set n =
+ ((struct btree_node_iter_set) {
+ __btree_node_key_to_offset(b, k),
+ __btree_node_key_to_offset(b, end)
+ });
+
+ __heap_add(iter, n, btree_node_iter_cmp_heap);
+ }
+}
+
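+/*
+ * Step past the current key of the smallest range: drop the range from the
+ * heap once it's exhausted, otherwise sift it down to restore heap order:
+ */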
+void bch2_btree_node_iter_large_advance(struct btree_node_iter_large *iter,
+ struct btree *b)
+{
+ iter->data->k += __btree_node_offset_to_key(b, iter->data->k)->u64s;
+
+ EBUG_ON(!iter->used);
+ EBUG_ON(iter->data->k > iter->data->end);
+
+ if (iter->data->k == iter->data->end)
+ heap_del(iter, 0, btree_node_iter_cmp_heap);
+ else
+ heap_sift_down(iter, 0, btree_node_iter_cmp_heap);
+}
+
static void verify_no_dups(struct btree *b,
struct bkey_packed *start,
struct bkey_packed *end)
sort_iter_sort(iter, sort_extent_whiteouts_cmp);
while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) {
+ if (bkey_deleted(in))
+ continue;
+
EBUG_ON(bkeyp_val_u64s(f, in));
EBUG_ON(in->type != KEY_TYPE_DISCARD);
bool compacting,
enum compact_mode mode)
{
- unsigned live_u64s = b->nr.bset_u64s[t - b->set];
unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s);
-
- if (live_u64s == bset_u64s)
- return 0;
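+	/* u64s taken up by keys in this bset that are no longer live: */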
+ unsigned dead_u64s = bset_u64s - b->nr.bset_u64s[t - b->set];
if (mode == COMPACT_LAZY) {
- if (live_u64s * 4 < bset_u64s * 3 ||
- (compacting && bset_unwritten(b, bset(b, t))))
- return bset_u64s - live_u64s;
+ if (should_compact_bset_lazy(b, t) ||
+ (compacting && !bset_written(b, bset(b, t))))
+ return dead_u64s;
} else {
if (bset_written(b, bset(b, t)))
- return bset_u64s - live_u64s;
+ return dead_u64s;
}
return 0;
struct bkey_packed *k, *n, *out, *start, *end;
struct btree_node_entry *src = NULL, *dst = NULL;
- if (t != b->set && bset_unwritten(b, i)) {
+ if (t != b->set && !bset_written(b, i)) {
src = container_of(i, struct btree_node_entry, keys);
dst = max(write_block(b),
		  (void *) btree_bkey_last(b, t - 1));
continue;
if (bkey_whiteout(k)) {
- unreserve_whiteout(b, t, k);
+ unreserve_whiteout(b, k);
memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k));
set_bkeyp_val_u64s(f, u_pos, 0);
u_pos = bkey_next(u_pos);
struct bset *i = bset(b, t);
struct bkey_packed *k, *n, *out, *start, *end;
- if (!should_compact_bset(b, t, true, true))
+ if (!should_compact_bset(b, t, true, COMPACT_WRITTEN))
continue;
start = btree_bkey_first(b, t);
end = btree_bkey_last(b, t);
- if (bset_unwritten(b, i) &&
+ if (!bset_written(b, i) &&
t != b->set) {
struct bset *dst =
max_t(struct bset *, write_block(b),
BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order));
if (sorting_entire_node)
- bch2_time_stats_update(&c->btree_sort_time, start_time);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_sort],
+ start_time);
/* Make sure we preserve bset journal_seq: */
for (t = b->set + start_idx; t < b->set + end_idx; t++)
bch2_bset_set_no_aux_tree(dst, dst->set);
- bch2_btree_node_iter_init_from_start(&src_iter, src,
- btree_node_is_extents(src));
+ bch2_btree_node_iter_init_from_start(&src_iter, src);
if (btree_node_ops(src)->key_normalize ||
btree_node_ops(src)->key_merge)
&dst->format,
true);
- bch2_time_stats_update(&c->btree_sort_time, start_time);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_sort], start_time);
set_btree_bset_end(dst, dst->set);
for (unwritten_idx = 0;
unwritten_idx < b->nsets;
unwritten_idx++)
- if (bset_unwritten(b, bset(b, &b->set[unwritten_idx])))
+ if (!bset_written(b, bset(b, &b->set[unwritten_idx])))
break;
if (b->nsets - unwritten_idx > 1) {
for_each_bset(b, t)
bch2_bset_build_aux_tree(b, t,
- bset_unwritten(b, bset(b, t)) &&
+ !bset_written(b, bset(b, t)) &&
t == bset_tree_last(b));
}
 * Returns true if we sorted (i.e. invalidated iterators)
*/
void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
- struct btree_iter *iter)
+ struct btree_iter *iter)
{
struct btree_node_entry *bne;
bool did_sort;
EBUG_ON(!(b->lock.state.seq & 1));
- EBUG_ON(iter && iter->nodes[b->level] != b);
+ EBUG_ON(iter && iter->l[b->level].b != b);
did_sort = btree_node_compact(c, b, iter);
bne = want_new_bset(c, b);
if (bne)
- bch2_bset_init_next(b, &bne->keys);
+ bch2_bset_init_next(c, b, bne);
bch2_btree_build_aux_trees(b);
char *out = buf, *end = buf + len;
out += scnprintf(out, end - out,
- "error validating btree node %s "
+ "error validating btree node %s"
"at btree %u level %u/%u\n"
"pos %llu:%llu node offset %u",
write ? "before write " : "",
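+/*
+ * btree_err() handling: fixable errors found on read before initial GC has
+ * finished are repaired via fsck; later read errors are logged and may abort
+ * the read, while errors found on the write path mark the fs inconsistent:
+ */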
#define btree_err(type, c, b, i, msg, ...) \
({ \
- char buf[200], *out = buf, *end = out + sizeof(buf); \
+ __label__ out; \
+ char _buf[300], *out = _buf, *end = out + sizeof(_buf); \
\
out += btree_err_msg(c, b, i, b->written, write, out, end - out);\
out += scnprintf(out, end - out, ": " msg, ##__VA_ARGS__); \
if (type == BTREE_ERR_FIXABLE && \
write == READ && \
!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \
- mustfix_fsck_err(c, "%s", buf); \
- } else { \
- bch_err(c, "%s", buf); \
+ mustfix_fsck_err(c, "%s", _buf); \
+ goto out; \
+ } \
+ \
+ switch (write) { \
+ case READ: \
+ bch_err(c, "%s", _buf); \
\
switch (type) { \
case BTREE_ERR_FIXABLE: \
ret = BCH_FSCK_ERRORS_NOT_FIXED; \
goto fsck_err; \
} \
+ break; \
+ case WRITE: \
+ bch_err(c, "corrupt metadata before write: %s", _buf); \
+ \
+ if (bch2_fs_inconsistent(c)) { \
+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \
+ goto fsck_err; \
+ } \
+ break; \
} \
+out: \
true; \
})
bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
btree_err(BTREE_ERR_FIXABLE, c, b, i,
- "invalid bkey %s: %s", buf, invalid);
+ "invalid bkey:\n%s\n%s", invalid, buf);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_next(k),
int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry)
{
struct btree_node_entry *bne;
- struct btree_node_iter *iter;
+ struct btree_node_iter_large *iter;
struct btree_node *sorted;
struct bkey_packed *k;
struct bset *i;
int ret, retry_read = 0, write = READ;
iter = mempool_alloc(&c->fill_iter, GFP_NOIO);
- __bch2_btree_node_iter_init(iter, btree_node_is_extents(b));
+ iter->used = 0;
if (bch2_meta_read_fault("btree"))
btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL,
unsigned sectors, whiteout_u64s = 0;
struct nonce nonce;
struct bch_csum csum;
+ bool first = !b->written;
if (!b->written) {
i = &b->data->keys;
sectors = vstruct_sectors(b->data, c->block_bits);
- set_btree_bset(b, b->set, &b->data->keys);
btree_node_set_format(b, b->data->format);
} else {
bne = write_block(b);
}
if (ret) {
- btree_err_on(!b->written,
+ btree_err_on(first,
BTREE_ERR_FIXABLE, c, b, i,
"first btree node bset has blacklisted journal seq");
- if (b->written)
+ if (!first)
continue;
}
- __bch2_btree_node_iter_push(iter, b,
+ bch2_btree_node_iter_large_push(iter, b,
i->start,
vstruct_idx(i, whiteout_u64s));
- __bch2_btree_node_iter_push(iter, b,
+ bch2_btree_node_iter_large_push(iter, b,
vstruct_idx(i, whiteout_u64s),
vstruct_last(i));
}
sorted = btree_bounce_alloc(c, btree_page_order(c), &used_mempool);
sorted->keys.u64s = 0;
+ set_btree_bset(b, b->set, &b->data->keys);
+
b->nr = btree_node_is_extents(b)
? bch2_extent_sort_fix_overlapping(c, &sorted->keys, b, iter)
: bch2_key_sort_fix_overlapping(&sorted->keys, b, iter);
struct bkey_s_c u = bkey_disassemble(b, k, &tmp);
const char *invalid = bch2_bkey_val_invalid(c, type, u);
- if (invalid) {
+ if (invalid ||
+ (inject_invalid_keys(c) &&
+ !bversion_cmp(u.k->version, MAX_VERSION))) {
char buf[160];
bch2_bkey_val_to_text(c, type, buf, sizeof(buf), u);
i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
memmove_u64s_down(k, bkey_next(k),
(u64 *) vstruct_end(i) - (u64 *) k);
+ set_btree_bset_end(b, b->set);
continue;
}
struct btree_read_bio *rb =
container_of(work, struct btree_read_bio, work);
struct bch_fs *c = rb->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
struct btree *b = rb->bio.bi_private;
struct bio *bio = &rb->bio;
struct bch_devs_mask avoid;
+ bool can_retry;
memset(&avoid, 0, sizeof(avoid));
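+	/*
+	 * We jump into the loop at the validation step; on failure, retry
+	 * from another replica, adding failed devices to @avoid:
+	 */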
goto start;
- do {
+ while (1) {
bch_info(c, "retrying read");
+ ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+ rb->have_ioref = bch2_dev_get_ioref(ca, READ);
bio_reset(bio);
bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
- bio->bi_bdev = rb->pick.ca->disk_sb.bdev;
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
- submit_bio_wait(bio);
+
+ if (rb->have_ioref) {
+ bio_set_dev(bio, ca->disk_sb.bdev);
+ submit_bio_wait(bio);
+ } else {
+ bio->bi_status = BLK_STS_REMOVED;
+ }
start:
- bch2_dev_io_err_on(bio->bi_status, rb->pick.ca, "btree read");
- percpu_ref_put(&rb->pick.ca->io_ref);
+ bch2_dev_io_err_on(bio->bi_status, ca, "btree read");
+ if (rb->have_ioref)
+ percpu_ref_put(&ca->io_ref);
+ rb->have_ioref = false;
- __set_bit(rb->pick.ca->dev_idx, avoid.d);
- rb->pick = bch2_btree_pick_ptr(c, b, &avoid);
+ __set_bit(rb->pick.ptr.dev, avoid.d);
+ can_retry = bch2_btree_pick_ptr(c, b, &avoid, &rb->pick) > 0;
if (!bio->bi_status &&
- !bch2_btree_node_read_done(c, b, !IS_ERR_OR_NULL(rb->pick.ca)))
- goto out;
- } while (!IS_ERR_OR_NULL(rb->pick.ca));
+ !bch2_btree_node_read_done(c, b, can_retry))
+ break;
- set_btree_node_read_error(b);
-out:
- if (!IS_ERR_OR_NULL(rb->pick.ca))
- percpu_ref_put(&rb->pick.ca->io_ref);
+ if (!can_retry) {
+ set_btree_node_read_error(b);
+ break;
+ }
+ }
- bch2_time_stats_update(&c->btree_read_time, rb->start_time);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_read], rb->start_time);
bio_put(&rb->bio);
clear_btree_node_read_in_flight(b);
wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
{
struct btree_read_bio *rb =
container_of(bio, struct btree_read_bio, bio);
+ struct bch_fs *c = rb->c;
- bch2_latency_acct(rb->pick.ca, rb->start_time >> 10, READ);
+ if (rb->have_ioref) {
+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev);
+ bch2_latency_acct(ca, rb->start_time, READ);
+ }
- INIT_WORK(&rb->work, btree_node_read_work);
- schedule_work(&rb->work);
+ queue_work(system_unbound_wq, &rb->work);
}
void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
{
struct extent_pick_ptr pick;
struct btree_read_bio *rb;
+ struct bch_dev *ca;
struct bio *bio;
+ int ret;
trace_btree_read(c, b);
- pick = bch2_btree_pick_ptr(c, b, NULL);
- if (bch2_fs_fatal_err_on(!pick.ca, c,
+ ret = bch2_btree_pick_ptr(c, b, NULL, &pick);
+ if (bch2_fs_fatal_err_on(ret <= 0, c,
"btree node read error: no device to read from")) {
set_btree_node_read_error(b);
return;
}
- bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio);
+ ca = bch_dev_bkey_exists(c, pick.ptr.dev);
+
+ bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_bio);
rb = container_of(bio, struct btree_read_bio, bio);
rb->c = c;
rb->start_time = local_clock();
+ rb->have_ioref = bch2_dev_get_ioref(ca, READ);
rb->pick = pick;
+ INIT_WORK(&rb->work, btree_node_read_work);
bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META;
- bio->bi_bdev = pick.ca->disk_sb.bdev;
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_iter.bi_size = btree_bytes(c);
+ bio->bi_end_io = btree_node_read_endio;
+ bio->bi_private = b;
bch2_bio_map(bio, b->data);
- this_cpu_add(pick.ca->io_done->sectors[READ][BCH_DATA_BTREE],
- bio_sectors(bio));
-
set_btree_node_read_in_flight(b);
- if (sync) {
- submit_bio_wait(bio);
- bio->bi_private = b;
- btree_node_read_work(&rb->work);
+ if (rb->have_ioref) {
+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_BTREE],
+ bio_sectors(bio));
+ bio_set_dev(bio, ca->disk_sb.bdev);
+
+ if (sync) {
+ submit_bio_wait(bio);
+
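+			/* submit_bio_wait() clobbered bi_private: */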
+ bio->bi_private = b;
+ btree_node_read_work(&rb->work);
+ } else {
+ submit_bio(bio);
+ }
} else {
- bio->bi_end_io = btree_node_read_endio;
- bio->bi_private = b;
- submit_bio(bio);
+ bio->bi_status = BLK_STS_REMOVED;
+
+ if (sync)
+ btree_node_read_work(&rb->work);
+ else
+ queue_work(system_unbound_wq, &rb->work);
}
}
void bch2_btree_complete_write(struct bch_fs *c, struct btree *b,
struct btree_write *w)
{
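+	/*
+	 * The low bit of will_make_reachable flags that a closure ref is
+	 * still held on the btree_update that allocated this node; clear
+	 * the bit and drop the ref:
+	 */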
+ unsigned long old, new, v = READ_ONCE(b->will_make_reachable);
+
+ do {
+ old = new = v;
+ if (!(old & 1))
+ break;
+
+ new &= ~1UL;
+ } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old);
+
+ if (old & 1)
+ closure_put(&((struct btree_update *) new)->cl);
+
bch2_journal_pin_drop(&c->journal, &w->journal);
closure_wake_up(&w->wait);
}
}
static void bch2_btree_node_write_error(struct bch_fs *c,
- struct bch_write_bio *wbio)
+ struct btree_write_bio *wbio)
{
- struct btree *b = wbio->bio.bi_private;
- struct closure *cl = wbio->cl;
+ struct btree *b = wbio->wbio.bio.bi_private;
__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
struct bkey_i_extent *new_key;
struct bkey_s_extent e;
__bch2_btree_iter_init(&iter, c, b->btree_id, b->key.k.p,
BTREE_MAX_DEPTH,
- b->level, 0);
+ b->level, BTREE_ITER_NODES);
retry:
ret = bch2_btree_iter_traverse(&iter);
if (ret)
goto err;
/* has node been freed? */
- if (iter.nodes[b->level] != b) {
+ if (iter.l[b->level].b != b) {
/* node has been freed: */
- if (!btree_node_dying(b))
- panic("foo4\n");
+ BUG_ON(!btree_node_dying(b));
goto out;
}
- if (!btree_node_hashed(b))
- panic("foo5\n");
+ BUG_ON(!btree_node_hashed(b));
bkey_copy(&tmp.k, &b->key);
new_key = bkey_i_to_extent(&tmp.k);
e = extent_i_to_s(new_key);
extent_for_each_ptr_backwards(e, ptr)
- if (bch2_dev_list_has_dev(wbio->failed, ptr->dev))
+ if (bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev))
bch2_extent_drop_ptr(e, ptr);
if (!bch2_extent_nr_ptrs(e.c))
goto err;
out:
bch2_btree_iter_unlock(&iter);
- bio_put(&wbio->bio);
+ bio_put(&wbio->wbio.bio);
btree_node_write_done(c, b);
- if (cl)
- closure_put(cl);
return;
err:
set_btree_node_noevict(b);
if (!bio)
break;
- bch2_btree_node_write_error(c, to_wbio(bio));
+ bch2_btree_node_write_error(c,
+ container_of(bio, struct btree_write_bio, wbio.bio));
}
}
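+/*
+ * Write completion, deferred to process context: free the bounce buffer,
+ * then either hand the bio to the write error path (if any replica failed)
+ * or finish the write:
+ */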
+static void btree_node_write_work(struct work_struct *work)
+{
+ struct btree_write_bio *wbio =
+ container_of(work, struct btree_write_bio, work);
+ struct bch_fs *c = wbio->wbio.c;
+ struct btree *b = wbio->wbio.bio.bi_private;
+
+ btree_bounce_free(c,
+ wbio->wbio.order,
+ wbio->wbio.used_mempool,
+ wbio->data);
+
+ if (wbio->wbio.failed.nr) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&c->btree_write_error_lock, flags);
+ bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio);
+ spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+
+ queue_work(c->wq, &c->btree_write_error_work);
+ return;
+ }
+
+ bio_put(&wbio->wbio.bio);
+ btree_node_write_done(c, b);
+}
+
static void btree_node_write_endio(struct bio *bio)
{
- struct btree *b = bio->bi_private;
struct bch_write_bio *wbio = to_wbio(bio);
struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL;
struct bch_write_bio *orig = parent ?: wbio;
- struct closure *cl = !wbio->split ? wbio->cl : NULL;
struct bch_fs *c = wbio->c;
- struct bch_dev *ca = wbio->ca;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev);
unsigned long flags;
- bch2_latency_acct(ca, wbio->submit_time_us, WRITE);
+ if (wbio->have_ioref)
+ bch2_latency_acct(ca, wbio->submit_time, WRITE);
if (bio->bi_status == BLK_STS_REMOVED ||
bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
bch2_meta_write_fault("btree")) {
spin_lock_irqsave(&c->btree_write_error_lock, flags);
- bch2_dev_list_add_dev(&orig->failed, ca->dev_idx);
+ bch2_dev_list_add_dev(&orig->failed, wbio->dev);
spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
}
- if (wbio->have_io_ref)
+ if (wbio->have_ioref)
percpu_ref_put(&ca->io_ref);
if (parent) {
bio_put(bio);
bio_endio(&parent->bio);
- return;
- }
-
- btree_bounce_free(c,
- wbio->order,
- wbio->used_mempool,
- wbio->data);
-
- if (wbio->failed.nr) {
- spin_lock_irqsave(&c->btree_write_error_lock, flags);
- bio_list_add(&c->btree_write_error_list, &wbio->bio);
- spin_unlock_irqrestore(&c->btree_write_error_lock, flags);
+ } else {
+ struct btree_write_bio *wb =
+ container_of(orig, struct btree_write_bio, wbio);
- queue_work(c->wq, &c->btree_write_error_work);
- return;
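+
+		/* defer final completion to process context: */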
+ INIT_WORK(&wb->work, btree_node_write_work);
+ queue_work(system_unbound_wq, &wb->work);
}
-
- bio_put(bio);
- btree_node_write_done(c, b);
- if (cl)
- closure_put(cl);
}
static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
}
void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
- struct closure *parent,
enum six_lock_type lock_type_held)
{
- struct bch_write_bio *wbio;
+ struct btree_write_bio *wbio;
struct bset_tree *t;
struct bset *i;
struct btree_node *bn = NULL;
unsigned long old, new;
void *data;
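+
+	/* likely a debug hook: while set, all btree node writes are held off */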
+ if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags))
+ return;
+
/*
* We may only have a read lock on the btree node - the dirty bit is our
* "lock" against racing with other threads that may be trying to start
new ^= (1 << BTREE_NODE_write_idx);
} while (cmpxchg_acquire(&b->flags, old, new) != old);
+ BUG_ON(btree_node_fake(b));
BUG_ON(!list_empty(&b->write_blocked));
- BUG_ON((b->will_make_reachable != NULL) != !b->written);
+ BUG_ON((b->will_make_reachable != 0) != !b->written);
BUG_ON(b->written >= c->opts.btree_node_size);
+ BUG_ON(b->written & (c->opts.block_size - 1));
BUG_ON(bset_written(b, btree_bset_last(b)));
BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c));
BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format)));
- if (lock_type_held == SIX_LOCK_intent) {
- six_lock_write(&b->lock);
+ /*
+ * We can't block on six_lock_write() here; another thread might be
+ * trying to get a journal reservation with read locks held, and getting
+ * a journal reservation might be blocked on flushing the journal and
+ * doing btree writes:
+ */
+ if (lock_type_held == SIX_LOCK_intent &&
+ six_trylock_write(&b->lock)) {
__bch2_compact_whiteouts(c, b, COMPACT_WRITTEN);
six_unlock_write(&b->lock);
} else {
trace_btree_write(b, bytes_to_write, sectors_to_write);
- wbio = wbio_init(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write));
- wbio->cl = parent;
- wbio->failed.nr = 0;
- wbio->order = order;
- wbio->used_mempool = used_mempool;
- wbio->data = data;
- wbio->bio.bi_opf = REQ_OP_WRITE|REQ_META|REQ_FUA;
- wbio->bio.bi_iter.bi_size = sectors_to_write << 9;
- wbio->bio.bi_end_io = btree_node_write_endio;
- wbio->bio.bi_private = b;
+ wbio = container_of(bio_alloc_bioset(GFP_NOIO, 1 << order, &c->btree_bio),
+ struct btree_write_bio, wbio.bio);
+ wbio_init(&wbio->wbio.bio);
+ wbio->data = data;
+ wbio->wbio.order = order;
+ wbio->wbio.used_mempool = used_mempool;
+ wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META|REQ_FUA;
+ wbio->wbio.bio.bi_iter.bi_size = sectors_to_write << 9;
+ wbio->wbio.bio.bi_end_io = btree_node_write_endio;
+ wbio->wbio.bio.bi_private = b;
- if (parent)
- closure_get(parent);
-
- bch2_bio_map(&wbio->bio, data);
+ bch2_bio_map(&wbio->wbio.bio, data);
/*
* If we're appending to a leaf node, we don't technically need FUA -
b->written += sectors_to_write;
- bch2_submit_wbio_replicas(wbio, c, BCH_DATA_BTREE, &k.key);
+ bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_BTREE, &k.key);
return;
err:
set_btree_node_noevict(b);
clear_btree_node_just_written(b);
/*
- * Note: immediately after write, bset_unwritten()/bset_written() don't
- * work - the amount of data we had to write after compaction might have
- * been smaller than the offset of the last bset.
+ * Note: immediately after write, bset_written() doesn't work - the
+ * amount of data we had to write after compaction might have been
+ * smaller than the offset of the last bset.
*
* However, we know that all bsets have been written here, as long as
* we're still holding the write lock:
bne = want_new_bset(c, b);
if (bne)
- bch2_bset_init_next(b, &bne->keys);
+ bch2_bset_init_next(c, b, bne);
bch2_btree_build_aux_trees(b);
* Use this one if the node is intent locked:
*/
void bch2_btree_node_write(struct bch_fs *c, struct btree *b,
- struct closure *parent,
enum six_lock_type lock_type_held)
{
BUG_ON(lock_type_held == SIX_LOCK_write);
if (lock_type_held == SIX_LOCK_intent ||
- six_trylock_convert(&b->lock, SIX_LOCK_read,
- SIX_LOCK_intent)) {
- __bch2_btree_node_write(c, b, parent, SIX_LOCK_intent);
+ six_lock_tryupgrade(&b->lock)) {
+ __bch2_btree_node_write(c, b, SIX_LOCK_intent);
/* don't cycle lock unnecessarily: */
- if (btree_node_just_written(b)) {
- six_lock_write(&b->lock);
+ if (btree_node_just_written(b) &&
+ six_trylock_write(&b->lock)) {
bch2_btree_post_write_cleanup(c, b);
six_unlock_write(&b->lock);
}
if (lock_type_held == SIX_LOCK_read)
six_lock_downgrade(&b->lock);
} else {
- __bch2_btree_node_write(c, b, parent, SIX_LOCK_read);
+ __bch2_btree_node_write(c, b, SIX_LOCK_read);
}
}
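+
+/*
+ * Wait for all in-flight reads or writes (per @flag) to complete: scan the
+ * btree node cache, sleeping on a flagged node's bit; the scan restarts from
+ * the top since we drop the RCU read lock to sleep:
+ */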
+static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag)
+{
+ struct bucket_table *tbl;
+ struct rhash_head *pos;
+ struct btree *b;
+ unsigned i;
+restart:
+ rcu_read_lock();
+ for_each_cached_btree(b, c, tbl, i, pos)
+ if (test_bit(flag, &b->flags)) {
+ rcu_read_unlock();
+ wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE);
+ goto restart;
+ }
+ rcu_read_unlock();
+}
+
+void bch2_btree_flush_all_reads(struct bch_fs *c)
+{
+ __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight);
+}
+
+void bch2_btree_flush_all_writes(struct bch_fs *c)
+{
+ __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight);
+}
+
void bch2_btree_verify_flushed(struct bch_fs *c)
{
struct bucket_table *tbl;
unsigned i;
rcu_read_lock();
- tbl = rht_dereference_rcu(c->btree_cache.table.tbl,
- &c->btree_cache.table);
+ for_each_cached_btree(b, c, tbl, i, pos) {
+ unsigned long flags = READ_ONCE(b->flags);
- for (i = 0; i < tbl->size; i++)
- rht_for_each_entry_rcu(b, pos, tbl, i, hash)
- BUG_ON(btree_node_dirty(b));
+ BUG_ON((flags & (1 << BTREE_NODE_dirty)) ||
+ (flags & (1 << BTREE_NODE_write_in_flight)));
+ }
rcu_read_unlock();
}
+
+ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
+{
+ char *out = buf, *end = buf + PAGE_SIZE;
+ struct bucket_table *tbl;
+ struct rhash_head *pos;
+ struct btree *b;
+ unsigned i;
+
+ rcu_read_lock();
+ for_each_cached_btree(b, c, tbl, i, pos) {
+ unsigned long flags = READ_ONCE(b->flags);
+ unsigned idx = (flags & (1 << BTREE_NODE_write_idx)) != 0;
+
+		if (/* !(flags & (1 << BTREE_NODE_dirty)) && */
+ !b->writes[0].wait.list.first &&
+ !b->writes[1].wait.list.first &&
+ !(b->will_make_reachable & 1))
+ continue;
+
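+		/*
+		 * d dirty, l level, w sectors written, b blocked on write,
+		 * r will_make_reachable:pending-ref-bit, c/p waiters on the
+		 * current/previous write:
+		 */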
+ out += scnprintf(out, end - out, "%p d %u l %u w %u b %u r %u:%lu c %u p %u\n",
+ b,
+ (flags & (1 << BTREE_NODE_dirty)) != 0,
+ b->level,
+ b->written,
+ !list_empty_careful(&b->write_blocked),
+ b->will_make_reachable != 0,
+ b->will_make_reachable & 1,
+ b->writes[ idx].wait.list.first != NULL,
+ b->writes[!idx].wait.list.first != NULL);
+ }
+ rcu_read_unlock();
+
+ return out - buf;
+}