X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fbtree_io.c;h=decbbaace1eef03e98a143325a296fceafdd30a8;hb=5ef62f56ab50c5799f713e3a42f5c7ad7e8283d3;hp=90f67ccd5e64c72cdde3cdf81106e7da9e6951f9;hpb=ae43a58d97fc00e31770142da832fb8a249808eb;p=bcachefs-tools-debian diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index 90f67cc..decbbaa 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -18,9 +18,9 @@ #include "journal_reclaim.h" #include "journal_seq_blacklist.h" #include "super-io.h" +#include "trace.h" #include -#include void bch2_btree_node_io_unlock(struct btree *b) { @@ -33,7 +33,7 @@ void bch2_btree_node_io_unlock(struct btree *b) void bch2_btree_node_io_lock(struct btree *b) { - BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); + bch2_assert_btree_nodes_not_locked(); wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, TASK_UNINTERRUPTIBLE); @@ -53,7 +53,7 @@ void __bch2_btree_node_wait_on_write(struct btree *b) void bch2_btree_node_wait_on_read(struct btree *b) { - BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); + bch2_assert_btree_nodes_not_locked(); wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, TASK_UNINTERRUPTIBLE); @@ -61,7 +61,7 @@ void bch2_btree_node_wait_on_read(struct btree *b) void bch2_btree_node_wait_on_write(struct btree *b) { - BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); + bch2_assert_btree_nodes_not_locked(); wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, TASK_UNINTERRUPTIBLE); @@ -77,13 +77,13 @@ static void verify_no_dups(struct btree *b, if (start == end) return; - for (p = start, k = bkey_next(start); + for (p = start, k = bkey_p_next(start); k != end; - p = k, k = bkey_next(k)) { + p = k, k = bkey_p_next(k)) { struct bkey l = bkey_unpack_key(b, p); struct bkey r = bkey_unpack_key(b, k); - BUG_ON(bpos_cmp(l.p, bkey_start_pos(&r)) >= 0); + BUG_ON(bpos_ge(l.p, bkey_start_pos(&r))); } #endif } @@ -92,7 +92,7 @@ static void set_needs_whiteout(struct bset *i, int v) { struct bkey_packed *k; - for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) + for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) k->needs_whiteout = v; } @@ -175,7 +175,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) for (k = unwritten_whiteouts_start(c, b); k != unwritten_whiteouts_end(c, b); - k = bkey_next(k)) + k = bkey_p_next(k)) *--ptrs = k; sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs); @@ -184,7 +184,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) while (ptrs != ptrs_end) { bkey_copy(k, *ptrs); - k = bkey_next(k); + k = bkey_p_next(k); ptrs++; } @@ -256,11 +256,11 @@ static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) out = i->start; for (k = start; k != end; k = n) { - n = bkey_next(k); + n = bkey_p_next(k); if (!bkey_deleted(k)) { bkey_copy(out, k); - out = bkey_next(out); + out = bkey_p_next(out); } else { BUG_ON(k->needs_whiteout); } @@ -450,6 +450,24 @@ void bch2_btree_build_aux_trees(struct btree *b) t == bset_tree_last(b)); } +/* + * If we have MAX_BSETS (3) bsets, should we sort them all down to just one? + * + * The first bset is going to be of similar order to the size of the node, the + * last bset is bounded by btree_write_set_buffer(), which is set to keep the + * memmove on insert from being too expensive: the middle bset should, ideally, + * be the geometric mean of the first and the last. + * + * Returns true if the middle bset is greater than that geometric mean: + */ +static inline bool should_compact_all(struct bch_fs *c, struct btree *b) +{ + unsigned mid_u64s_bits = + (ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2; + + return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits; +} + /* * @bch_btree_init_next - initialize a new (unwritten) bset that can then be * inserted into @@ -467,19 +485,14 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) EBUG_ON(!(b->c.lock.state.seq & 1)); BUG_ON(bset_written(b, bset(b, &b->set[1]))); + BUG_ON(btree_node_just_written(b)); if (b->nsets == MAX_BSETS && - !btree_node_write_in_flight(b)) { - unsigned log_u64s[] = { - ilog2(bset_u64s(&b->set[0])), - ilog2(bset_u64s(&b->set[1])), - ilog2(bset_u64s(&b->set[2])), - }; - - if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) { - bch2_btree_node_write(c, b, SIX_LOCK_write, 0); - reinit_iter = true; - } + !btree_node_write_in_flight(b) && + should_compact_all(c, b)) { + bch2_btree_node_write(c, b, SIX_LOCK_write, + BTREE_WRITE_init_next_bset); + reinit_iter = true; } if (b->nsets == MAX_BSETS && @@ -513,11 +526,10 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, struct btree *b, struct bset *i, unsigned offset, int write) { - prt_printf(out, bch2_log_msg(c, "")); - if (!write) - prt_str(out, "error validating btree node "); - else - prt_str(out, "corrupt btree node before write "); + prt_printf(out, bch2_log_msg(c, "%s"), + write == READ + ? "error validating btree node " + : "corrupt btree node before write "); if (ca) prt_printf(out, "on %s ", ca->name); prt_printf(out, "at btree "); @@ -530,63 +542,96 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, } enum btree_err_type { + /* + * We can repair this locally, and we're after the checksum check so + * there's no need to try another replica: + */ BTREE_ERR_FIXABLE, + /* + * We can repair this if we have to, but we should try reading another + * replica if we can: + */ BTREE_ERR_WANT_RETRY, + /* + * Read another replica if we have one, otherwise consider the whole + * node bad: + */ BTREE_ERR_MUST_RETRY, - BTREE_ERR_FATAL, + BTREE_ERR_BAD_NODE, + BTREE_ERR_INCOMPATIBLE, }; enum btree_validate_ret { BTREE_RETRY_READ = 64, }; +static int __btree_err(enum btree_err_type type, + struct bch_fs *c, + struct bch_dev *ca, + struct btree *b, + struct bset *i, + int write, + bool have_retry, + const char *fmt, ...) +{ + struct printbuf out = PRINTBUF; + va_list args; + int ret = -BCH_ERR_fsck_fix; + + btree_err_msg(&out, c, ca, b, i, b->written, write); + + va_start(args, fmt); + prt_vprintf(&out, fmt, args); + va_end(args); + + if (write == WRITE) { + bch2_print_string_as_lines(KERN_ERR, out.buf); + ret = c->opts.errors == BCH_ON_ERROR_continue + ? 0 + : -BCH_ERR_fsck_errors_not_fixed; + goto out; + } + + if (!have_retry && type == BTREE_ERR_WANT_RETRY) + type = BTREE_ERR_FIXABLE; + if (!have_retry && type == BTREE_ERR_MUST_RETRY) + type = BTREE_ERR_BAD_NODE; + + switch (type) { + case BTREE_ERR_FIXABLE: + mustfix_fsck_err(c, "%s", out.buf); + ret = -BCH_ERR_fsck_fix; + break; + case BTREE_ERR_WANT_RETRY: + case BTREE_ERR_MUST_RETRY: + bch2_print_string_as_lines(KERN_ERR, out.buf); + ret = BTREE_RETRY_READ; + break; + case BTREE_ERR_BAD_NODE: + bch2_print_string_as_lines(KERN_ERR, out.buf); + bch2_topology_error(c); + ret = -BCH_ERR_need_topology_repair; + break; + case BTREE_ERR_INCOMPATIBLE: + bch2_print_string_as_lines(KERN_ERR, out.buf); + ret = -BCH_ERR_fsck_errors_not_fixed; + break; + default: + BUG(); + } +out: +fsck_err: + printbuf_exit(&out); + return ret; +} + #define btree_err(type, c, ca, b, i, msg, ...) \ ({ \ - __label__ out; \ - struct printbuf out = PRINTBUF; \ - \ - btree_err_msg(&out, c, ca, b, i, b->written, write); \ - prt_printf(&out, msg, ##__VA_ARGS__); \ + int _ret = __btree_err(type, c, ca, b, i, write, have_retry, msg, ##__VA_ARGS__);\ \ - if (type == BTREE_ERR_FIXABLE && \ - write == READ && \ - !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ - mustfix_fsck_err(c, "%s", out.buf); \ - goto out; \ - } \ - \ - bch2_print_string_as_lines(KERN_ERR, out.buf); \ - \ - switch (write) { \ - case READ: \ - switch (type) { \ - case BTREE_ERR_FIXABLE: \ - ret = -BCH_ERR_fsck_errors_not_fixed; \ - goto fsck_err; \ - case BTREE_ERR_WANT_RETRY: \ - if (have_retry) { \ - ret = BTREE_RETRY_READ; \ - goto fsck_err; \ - } \ - break; \ - case BTREE_ERR_MUST_RETRY: \ - ret = BTREE_RETRY_READ; \ - goto fsck_err; \ - case BTREE_ERR_FATAL: \ - ret = -BCH_ERR_fsck_errors_not_fixed; \ - goto fsck_err; \ - } \ - break; \ - case WRITE: \ - if (bch2_fs_inconsistent(c)) { \ - ret = -BCH_ERR_fsck_errors_not_fixed; \ - goto fsck_err; \ - } \ - break; \ - } \ -out: \ - printbuf_exit(&out); \ - true; \ + if (_ret != -BCH_ERR_fsck_fix) \ + goto fsck_err; \ + *saw_error = true; \ }) #define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false) @@ -595,6 +640,7 @@ out: \ * When btree topology repair changes the start or end of a node, that might * mean we have to drop keys that are no longer inside the node: */ +__cold void bch2_btree_node_drop_keys_outside_node(struct btree *b) { struct bset_tree *t; @@ -606,7 +652,7 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) struct bset *i = bset(b, t); struct bkey_packed *k; - for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) + for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0) break; @@ -619,7 +665,7 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) set_btree_bset_end(b, t); } - for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) + for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0) break; @@ -637,15 +683,15 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) bch2_btree_build_aux_trees(b); for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { - BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); - BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); + BUG_ON(bpos_lt(k.k->p, b->data->min_key)); + BUG_ON(bpos_gt(k.k->p, b->data->max_key)); } } static int validate_bset(struct bch_fs *c, struct bch_dev *ca, struct btree *b, struct bset *i, unsigned offset, unsigned sectors, - int write, bool have_retry) + int write, bool have_retry, bool *saw_error) { unsigned version = le16_to_cpu(i->version); const char *err; @@ -656,7 +702,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, btree_err_on((version != BCH_BSET_VERSION_OLD && version < bcachefs_metadata_version_min) || version >= bcachefs_metadata_version_max, - BTREE_ERR_FATAL, c, ca, b, i, + BTREE_ERR_INCOMPATIBLE, c, ca, b, i, "unsupported bset version"); if (btree_err_on(version < c->sb.version_min, @@ -680,7 +726,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, } btree_err_on(BSET_SEPARATE_WHITEOUTS(i), - BTREE_ERR_FATAL, c, ca, b, i, + BTREE_ERR_INCOMPATIBLE, c, ca, b, i, "BSET_SEPARATE_WHITEOUTS no longer supported"); if (btree_err_on(offset + sectors > btree_sectors(c), @@ -736,7 +782,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, b->data->max_key = b->key.k.p; } - btree_err_on(bpos_cmp(b->data->min_key, bp->min_key), + btree_err_on(!bpos_eq(b->data->min_key, bp->min_key), BTREE_ERR_MUST_RETRY, c, ca, b, NULL, "incorrect min_key: got %s should be %s", (printbuf_reset(&buf1), @@ -745,7 +791,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf)); } - btree_err_on(bpos_cmp(bn->max_key, b->key.k.p), + btree_err_on(!bpos_eq(bn->max_key, b->key.k.p), BTREE_ERR_MUST_RETRY, c, ca, b, i, "incorrect max key %s", (printbuf_reset(&buf1), @@ -757,7 +803,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, err = bch2_bkey_format_validate(&bn->format); btree_err_on(err, - BTREE_ERR_FATAL, c, ca, b, i, + BTREE_ERR_BAD_NODE, c, ca, b, i, "invalid bkey format: %s", err); compat_bformat(b->c.level, b->c.btree_id, version, @@ -782,7 +828,8 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b, } static int validate_bset_keys(struct bch_fs *c, struct btree *b, - struct bset *i, int write, bool have_retry) + struct bset *i, int write, + bool have_retry, bool *saw_error) { unsigned version = le16_to_cpu(i->version); struct bkey_packed *k, *prev = NULL; @@ -796,7 +843,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, struct bkey_s u; struct bkey tmp; - if (btree_err_on(bkey_next(k) > vstruct_last(i), + if (btree_err_on(bkey_p_next(k) > vstruct_last(i), BTREE_ERR_FIXABLE, c, NULL, b, i, "key extends past end of bset")) { i->u64s = cpu_to_le16((u64 *) k - i->_data); @@ -807,7 +854,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, BTREE_ERR_FIXABLE, c, NULL, b, i, "invalid bkey format %u", k->format)) { i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_next(k), + memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); continue; } @@ -831,7 +878,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_next(k), + memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); continue; } @@ -854,14 +901,14 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf)) { i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_next(k), + memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); continue; } } prev = k; - k = bkey_next(k); + k = bkey_p_next(k); } fsck_err: printbuf_exit(&buf); @@ -869,7 +916,7 @@ fsck_err: } int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, - struct btree *b, bool have_retry) + struct btree *b, bool have_retry, bool *saw_error) { struct btree_node_entry *bne; struct sort_iter *iter; @@ -884,7 +931,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, unsigned blacklisted_written, nonblacklisted_written = 0; unsigned ptr_written = btree_ptr_sectors_written(&b->key); struct printbuf buf = PRINTBUF; - int ret, retry_read = 0, write = READ; + int ret = 0, retry_read = 0, write = READ; b->version_ondisk = U16_MAX; /* We might get called multiple times on read retry: */ @@ -945,7 +992,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), - BTREE_ERR_FATAL, c, NULL, b, NULL, + BTREE_ERR_INCOMPATIBLE, c, NULL, b, NULL, "btree node does not have NEW_EXTENT_OVERWRITE set"); sectors = vstruct_sectors(b->data, c->block_bits); @@ -980,14 +1027,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, le16_to_cpu(i->version)); ret = validate_bset(c, ca, b, i, b->written, sectors, - READ, have_retry); + READ, have_retry, saw_error); if (ret) goto fsck_err; if (!b->written) btree_node_set_format(b, b->data->format); - ret = validate_bset_keys(c, b, i, READ, have_retry); + ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error); if (ret) goto fsck_err; @@ -1092,7 +1139,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_keys_account_key_drop(&b->nr, 0, k); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); - memmove_u64s_down(k, bkey_next(k), + memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); set_btree_bset_end(b, b->set); continue; @@ -1104,7 +1151,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bp.v->mem_ptr = 0; } - k = bkey_next(k); + k = bkey_p_next(k); } bch2_bset_build_aux_tree(b, b->set, false); @@ -1127,12 +1174,10 @@ out: printbuf_exit(&buf); return retry_read; fsck_err: - if (ret == BTREE_RETRY_READ) { + if (ret == BTREE_RETRY_READ) retry_read = 1; - } else { - bch2_inconsistent_error(c); + else set_btree_node_read_error(b); - } goto out; } @@ -1182,7 +1227,7 @@ start: &failed, &rb->pick) > 0; if (!bio->bi_status && - !bch2_btree_node_read_done(c, ca, b, can_retry)) { + !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) { if (retry) bch_info(c, "retry success"); break; @@ -1201,8 +1246,16 @@ start: bio_put(&rb->bio); printbuf_exit(&buf); - if (saw_error && !btree_node_read_error(b)) + if (saw_error && !btree_node_read_error(b)) { + struct printbuf buf = PRINTBUF; + + bch2_bpos_to_text(&buf, b->key.k.p); + bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error", + __func__, bch2_btree_ids[b->c.btree_id], b->c.level, buf.buf); + printbuf_exit(&buf); + bch2_btree_node_rewrite_async(c, b); + } clear_btree_node_read_in_flight(b); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); @@ -1288,6 +1341,7 @@ static void btree_node_read_all_replicas_done(struct closure *cl) unsigned i, written = 0, written2 = 0; __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2 ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; + bool _saw_error = false, *saw_error = &_saw_error; for (i = 0; i < ra->nr; i++) { struct btree_node *bn = ra->buf[i]; @@ -1374,13 +1428,15 @@ fsck_err: if (best >= 0) { memcpy(b->data, ra->buf[best], btree_bytes(c)); - ret = bch2_btree_node_read_done(c, NULL, b, false); + ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error); } else { ret = -1; } if (ret) set_btree_node_read_error(b); + else if (*saw_error) + bch2_btree_node_rewrite_async(c, b); for (i = 0; i < ra->nr; i++) { mempool_free(ra->buf[i], &c->btree_bounce_pool); @@ -1427,7 +1483,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool ra = kzalloc(sizeof(*ra), GFP_NOFS); if (!ra) - return -ENOMEM; + return -BCH_ERR_ENOMEM_btree_node_read_all_replicas; closure_init(&ra->cl, NULL); ra->c = c; @@ -1560,9 +1616,10 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, } } -int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, - const struct bkey_i *k, unsigned level) +static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, + const struct bkey_i *k, unsigned level) { + struct bch_fs *c = trans->c; struct closure cl; struct btree *b; int ret; @@ -1574,7 +1631,7 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, closure_sync(&cl); } while (ret); - b = bch2_btree_node_mem_alloc(c, level != 0); + b = bch2_btree_node_mem_alloc(trans, level != 0); bch2_btree_cache_cannibalize_unlock(c); BUG_ON(IS_ERR(b)); @@ -1605,6 +1662,13 @@ err: return ret; } +int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, + const struct bkey_i *k, unsigned level) +{ + return bch2_trans_run(c, __bch2_btree_root_read(&trans, id, k, level)); + +} + void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, struct btree_write *w) { @@ -1628,6 +1692,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) { struct btree_write *w = btree_prev_write(b); unsigned long old, new, v; + unsigned type = 0; bch2_btree_complete_write(c, b, w); @@ -1646,6 +1711,9 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) new |= (1U << BTREE_NODE_write_in_flight_inner); new |= (1U << BTREE_NODE_just_written); new ^= (1U << BTREE_NODE_write_idx); + + type = new & BTREE_WRITE_TYPE_MASK; + new &= ~BTREE_WRITE_TYPE_MASK; } else { new &= ~(1U << BTREE_NODE_write_in_flight); new &= ~(1U << BTREE_NODE_write_in_flight_inner); @@ -1653,7 +1721,7 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) } while ((v = cmpxchg(&b->flags, old, new)) != old); if (new & (1U << BTREE_NODE_write_in_flight)) - __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED); + __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type); else wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); } @@ -1678,7 +1746,7 @@ static void btree_node_write_work(struct work_struct *work) struct bch_fs *c = wbio->wbio.c; struct btree *b = wbio->wbio.bio.bi_private; struct bch_extent_ptr *ptr; - int ret; + int ret = 0; btree_bounce_free(c, wbio->data_bytes, @@ -1708,7 +1776,8 @@ out: return; err: set_btree_node_noevict(b); - bch2_fs_fatal_error(c, "fatal error writing btree node"); + if (!bch2_err_matches(ret, EROFS)) + bch2_fs_fatal_error(c, "fatal error writing btree node"); goto out; } @@ -1753,6 +1822,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, struct bset *i, unsigned sectors) { struct printbuf buf = PRINTBUF; + bool saw_error; int ret; ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), @@ -1764,8 +1834,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, if (ret) return ret; - ret = validate_bset_keys(c, b, i, WRITE, false) ?: - validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false); + ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?: + validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error); if (ret) { bch2_inconsistent_error(c); dump_stack(); @@ -1778,14 +1848,15 @@ static void btree_write_submit(struct work_struct *work) { struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work); struct bch_extent_ptr *ptr; - __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; + BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; bkey_copy(&tmp.k, &wbio->key); bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr) ptr->offset += wbio->sector_offset; - bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &tmp.k); + bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, + &tmp.k, false); } void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) @@ -1802,6 +1873,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) bool used_mempool; unsigned long old, new; bool validate_before_checksum = false; + enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK; void *data; int ret; @@ -1837,6 +1909,10 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) if (old & (1 << BTREE_NODE_write_in_flight)) return; + if (flags & BTREE_WRITE_ONLY_IF_NEED) + type = new & BTREE_WRITE_TYPE_MASK; + new &= ~BTREE_WRITE_TYPE_MASK; + new &= ~(1 << BTREE_NODE_dirty); new &= ~(1 << BTREE_NODE_need_write); new |= (1 << BTREE_NODE_write_in_flight); @@ -1848,6 +1924,8 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) if (new & (1U << BTREE_NODE_need_write)) return; do_write: + BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0)); + atomic_dec(&c->btree_cache.dirty); BUG_ON(btree_node_fake(b)); @@ -2022,8 +2100,8 @@ do_write: bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written = cpu_to_le16(b->written); - atomic64_inc(&c->btree_writes_nr); - atomic64_add(sectors_to_write, &c->btree_writes_sectors); + atomic64_inc(&c->btree_write_stats[type].nr); + atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes); INIT_WORK(&wbio->work, btree_write_submit); queue_work(c->io_complete_wq, &wbio->work); @@ -2151,3 +2229,33 @@ bool bch2_btree_flush_all_writes(struct bch_fs *c) { return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); } + +const char * const bch2_btree_write_types[] = { +#define x(t, n) [n] = #t, + BCH_BTREE_WRITE_TYPES() + NULL +}; + +void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c) +{ + printbuf_tabstop_push(out, 20); + printbuf_tabstop_push(out, 10); + + prt_tab(out); + prt_str(out, "nr"); + prt_tab(out); + prt_str(out, "size"); + prt_newline(out); + + for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) { + u64 nr = atomic64_read(&c->btree_write_stats[i].nr); + u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes); + + prt_printf(out, "%s:", bch2_btree_write_types[i]); + prt_tab(out); + prt_u64(out, nr); + prt_tab(out); + prt_human_readable_u64(out, nr ? div64_u64(bytes, nr) : 0); + prt_newline(out); + } +}