X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Fbtree_io.c;h=c354dd1aefb9e365f5772ca2fae5c8cb4a72111a;hb=07ec713e056a73337bc0f53f5910f5179537b2c2;hp=bb8cee15d51fa3d3dce78fa474b57a98b427f233;hpb=6d8881705872edbc2c8fe539312aa40eaa8f281d;p=bcachefs-tools-debian diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index bb8cee1..c354dd1 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -1,20 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "bkey_methods.h" +#include "bkey_sort.h" #include "btree_cache.h" -#include "btree_update.h" #include "btree_io.h" #include "btree_iter.h" #include "btree_locking.h" +#include "btree_update.h" +#include "btree_update_interior.h" #include "buckets.h" #include "checksum.h" #include "debug.h" #include "error.h" #include "extents.h" #include "io.h" -#include "journal.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" #include "super-io.h" +#include #include static void verify_no_dups(struct btree *b, @@ -22,313 +27,168 @@ static void verify_no_dups(struct btree *b, struct bkey_packed *end) { #ifdef CONFIG_BCACHEFS_DEBUG - struct bkey_packed *k; + struct bkey_packed *k, *p; - for (k = start; k != end && bkey_next(k) != end; k = bkey_next(k)) { - struct bkey l = bkey_unpack_key(b, k); - struct bkey r = bkey_unpack_key(b, bkey_next(k)); + if (start == end) + return; - BUG_ON(btree_node_is_extents(b) - ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0 - : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); - //BUG_ON(bkey_cmp_packed(&b->format, k, bkey_next(k)) >= 0); + for (p = start, k = bkey_next(start); + k != end; + p = k, k = bkey_next(k)) { + struct bkey l = bkey_unpack_key(b, p); + struct bkey r = bkey_unpack_key(b, k); + + BUG_ON(bpos_cmp(l.p, bkey_start_pos(&r)) >= 0); } #endif } -static void clear_needs_whiteout(struct bset *i) -{ - struct bkey_packed *k; - - for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) - k->needs_whiteout = false; -} - -static void set_needs_whiteout(struct bset *i) +static void set_needs_whiteout(struct bset *i, int v) { struct bkey_packed *k; for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) - k->needs_whiteout = true; + k->needs_whiteout = v; } -static void btree_bounce_free(struct bch_fs *c, unsigned order, +static void btree_bounce_free(struct bch_fs *c, size_t size, bool used_mempool, void *p) { if (used_mempool) - mempool_free(virt_to_page(p), &c->btree_bounce_pool); + mempool_free(p, &c->btree_bounce_pool); else - free_pages((unsigned long) p, order); + vpfree(p, size); } -static void *btree_bounce_alloc(struct bch_fs *c, unsigned order, +static void *btree_bounce_alloc(struct bch_fs *c, size_t size, bool *used_mempool) { + unsigned flags = memalloc_nofs_save(); void *p; - BUG_ON(1 << order > btree_pages(c)); + BUG_ON(size > btree_bytes(c)); *used_mempool = false; - p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order); - if (p) - return p; - - *used_mempool = true; - return page_address(mempool_alloc(&c->btree_bounce_pool, GFP_NOIO)); -} - -typedef int (*sort_cmp_fn)(struct btree *, - struct bkey_packed *, - struct bkey_packed *); - -struct sort_iter { - struct btree *b; - unsigned used; - - struct sort_iter_set { - struct bkey_packed *k, *end; - } data[MAX_BSETS + 1]; -}; - -static void sort_iter_init(struct sort_iter *iter, struct btree *b) -{ - memset(iter, 0, sizeof(*iter)); - iter->b = b; -} - -static inline void __sort_iter_sift(struct sort_iter *iter, - unsigned from, - sort_cmp_fn cmp) -{ - unsigned i; - - for (i = from; 
- i + 1 < iter->used && - cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0; - i++) - swap(iter->data[i], iter->data[i + 1]); -} - -static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp) -{ - - __sort_iter_sift(iter, 0, cmp); -} - -static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) -{ - unsigned i = iter->used; - - while (i--) - __sort_iter_sift(iter, i, cmp); -} - -static void sort_iter_add(struct sort_iter *iter, - struct bkey_packed *k, - struct bkey_packed *end) -{ - BUG_ON(iter->used >= ARRAY_SIZE(iter->data)); - - if (k != end) - iter->data[iter->used++] = (struct sort_iter_set) { k, end }; -} - -static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) -{ - return iter->used ? iter->data->k : NULL; -} - -static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) -{ - iter->data->k = bkey_next(iter->data->k); - - BUG_ON(iter->data->k > iter->data->end); - - if (iter->data->k == iter->data->end) - memmove(&iter->data[0], - &iter->data[1], - sizeof(iter->data[0]) * --iter->used); - else - sort_iter_sift(iter, cmp); -} - -static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, - sort_cmp_fn cmp) -{ - struct bkey_packed *ret = sort_iter_peek(iter); - - if (ret) - sort_iter_advance(iter, cmp); - - return ret; + p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); + if (!p) { + *used_mempool = true; + p = mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); + } + memalloc_nofs_restore(flags); + return p; } -static inline int sort_key_whiteouts_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) +static void sort_bkey_ptrs(const struct btree *bt, + struct bkey_packed **ptrs, unsigned nr) { - return bkey_cmp_packed(b, l, r); -} + unsigned n = nr, a = nr / 2, b, c, d; -static unsigned sort_key_whiteouts(struct bkey_packed *dst, - struct sort_iter *iter) -{ - struct bkey_packed *in, *out = dst; + if (!a) + return; - sort_iter_sort(iter, sort_key_whiteouts_cmp); + /* Heap sort: see lib/sort.c: */ + while (1) { + if (a) + a--; + else if (--n) + swap(ptrs[0], ptrs[n]); + else + break; - while ((in = sort_iter_next(iter, sort_key_whiteouts_cmp))) { - bkey_copy(out, in); - out = bkey_next(out); + for (b = a; c = 2 * b + 1, (d = c + 1) < n;) + b = bch2_bkey_cmp_packed(bt, + ptrs[c], + ptrs[d]) >= 0 ? 
c : d; + if (d == n) + b = c; + + while (b != a && + bch2_bkey_cmp_packed(bt, + ptrs[a], + ptrs[b]) >= 0) + b = (b - 1) / 2; + c = b; + while (b != a) { + b = (b - 1) / 2; + swap(ptrs[b], ptrs[c]); + } } - - return (u64 *) out - (u64 *) dst; -} - -static inline int sort_extent_whiteouts_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - struct bkey ul = bkey_unpack_key(b, l); - struct bkey ur = bkey_unpack_key(b, r); - - return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur)); } -static unsigned sort_extent_whiteouts(struct bkey_packed *dst, - struct sort_iter *iter) +static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) { - const struct bkey_format *f = &iter->b->format; - struct bkey_packed *in, *out = dst; - struct bkey_i l, r; - bool prev = false, l_packed = false; - u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE); - u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET); - u64 new_size; - - max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX); - - sort_iter_sort(iter, sort_extent_whiteouts_cmp); - - while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) { - EBUG_ON(bkeyp_val_u64s(f, in)); - EBUG_ON(in->type != KEY_TYPE_DISCARD); - - r.k = bkey_unpack_key(iter->b, in); - - if (prev && - bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) { - if (bkey_cmp(l.k.p, r.k.p) >= 0) - continue; + struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k; + bool used_mempool = false; + size_t bytes = b->whiteout_u64s * sizeof(u64); - new_size = l_packed - ? min(max_packed_size, max_packed_offset - - bkey_start_offset(&l.k)) - : KEY_SIZE_MAX; + if (!b->whiteout_u64s) + return; - new_size = min(new_size, r.k.p.offset - - bkey_start_offset(&l.k)); + new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); - BUG_ON(new_size < l.k.size); + ptrs = ptrs_end = ((void *) new_whiteouts + bytes); - bch2_key_resize(&l.k, new_size); + for (k = unwritten_whiteouts_start(c, b); + k != unwritten_whiteouts_end(c, b); + k = bkey_next(k)) + *--ptrs = k; - if (bkey_cmp(l.k.p, r.k.p) >= 0) - continue; + sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs); - bch2_cut_front(l.k.p, &r); - } - - if (prev) { - if (!bch2_bkey_pack(out, &l, f)) { - BUG_ON(l_packed); - bkey_copy(out, &l); - } - out = bkey_next(out); - } + k = new_whiteouts; - l = r; - prev = true; - l_packed = bkey_packed(in); + while (ptrs != ptrs_end) { + bkey_copy(k, *ptrs); + k = bkey_next(k); + ptrs++; } - if (prev) { - if (!bch2_bkey_pack(out, &l, f)) { - BUG_ON(l_packed); - bkey_copy(out, &l); - } - out = bkey_next(out); - } + verify_no_dups(b, new_whiteouts, + (void *) ((u64 *) new_whiteouts + b->whiteout_u64s)); + + memcpy_u64s(unwritten_whiteouts_start(c, b), + new_whiteouts, b->whiteout_u64s); - return (u64 *) out - (u64 *) dst; + btree_bounce_free(c, bytes, used_mempool, new_whiteouts); } -static unsigned should_compact_bset(struct btree *b, struct bset_tree *t, - bool compacting, - enum compact_mode mode) +static bool should_compact_bset(struct btree *b, struct bset_tree *t, + bool compacting, enum compact_mode mode) { - unsigned live_u64s = b->nr.bset_u64s[t - b->set]; - unsigned bset_u64s = le16_to_cpu(bset(b, t)->u64s); - - if (live_u64s == bset_u64s) - return 0; + if (!bset_dead_u64s(b, t)) + return false; - if (mode == COMPACT_LAZY) { - if (live_u64s * 4 < bset_u64s * 3 || - (compacting && bset_unwritten(b, bset(b, t)))) - return bset_u64s - live_u64s; - } else { - if (bset_written(b, bset(b, t))) - return bset_u64s - live_u64s; + switch (mode) { + case COMPACT_LAZY: + return 
should_compact_bset_lazy(b, t) || + (compacting && !bset_written(b, bset(b, t))); + case COMPACT_ALL: + return true; + default: + BUG(); } - - return 0; } -bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, - enum compact_mode mode) +static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) { - const struct bkey_format *f = &b->format; struct bset_tree *t; - struct bkey_packed *whiteouts = NULL; - struct bkey_packed *u_start, *u_pos; - struct sort_iter sort_iter; - unsigned order, whiteout_u64s = 0, u64s; - bool used_mempool, compacting = false; - - for_each_bset(b, t) - whiteout_u64s += should_compact_bset(b, t, - whiteout_u64s != 0, mode); - - if (!whiteout_u64s) - return false; - - sort_iter_init(&sort_iter, b); - - whiteout_u64s += b->whiteout_u64s; - order = get_order(whiteout_u64s * sizeof(u64)); - - whiteouts = btree_bounce_alloc(c, order, &used_mempool); - u_start = u_pos = whiteouts; - - memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b), - b->whiteout_u64s); - u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64); - - sort_iter_add(&sort_iter, u_start, u_pos); + bool ret = false; for_each_bset(b, t) { struct bset *i = bset(b, t); struct bkey_packed *k, *n, *out, *start, *end; struct btree_node_entry *src = NULL, *dst = NULL; - if (t != b->set && bset_unwritten(b, i)) { + if (t != b->set && !bset_written(b, i)) { src = container_of(i, struct btree_node_entry, keys); dst = max(write_block(b), - (void *) btree_bkey_last(b, t -1)); + (void *) btree_bkey_last(b, t - 1)); } - if (!should_compact_bset(b, t, compacting, mode)) { + if (src != dst) + ret = true; + + if (!should_compact_bset(b, t, ret, mode)) { if (src != dst) { memmove(dst, src, sizeof(*src) + le16_to_cpu(src->keys.u64s) * @@ -339,10 +199,8 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, continue; } - compacting = true; - u_start = u_pos; - start = i->start; - end = vstruct_last(i); + start = btree_bkey_first(b, t); + end = btree_bkey_last(b, t); if (src != dst) { memmove(dst, src, sizeof(*src)); @@ -355,199 +213,34 @@ bool __bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, for (k = start; k != end; k = n) { n = bkey_next(k); - if (bkey_deleted(k) && btree_node_is_extents(b)) - continue; - - if (bkey_whiteout(k) && !k->needs_whiteout) - continue; - - if (bkey_whiteout(k)) { - unreserve_whiteout(b, t, k); - memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k)); - set_bkeyp_val_u64s(f, u_pos, 0); - u_pos = bkey_next(u_pos); - } else if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) { - bkey_copy(out, k); - out = bkey_next(out); - } - } - - sort_iter_add(&sort_iter, u_start, u_pos); - - if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) { - i->u64s = cpu_to_le16((u64 *) out - i->_data); - set_btree_bset_end(b, t); - bch2_bset_set_no_aux_tree(b, t); - } - } - - b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts; - - BUG_ON((void *) unwritten_whiteouts_start(c, b) < - (void *) btree_bkey_last(b, bset_tree_last(b))); - - u64s = btree_node_is_extents(b) - ? 
sort_extent_whiteouts(unwritten_whiteouts_start(c, b), - &sort_iter) - : sort_key_whiteouts(unwritten_whiteouts_start(c, b), - &sort_iter); - - BUG_ON(u64s > b->whiteout_u64s); - BUG_ON(u64s != b->whiteout_u64s && !btree_node_is_extents(b)); - BUG_ON(u_pos != whiteouts && !u64s); - - if (u64s != b->whiteout_u64s) { - void *src = unwritten_whiteouts_start(c, b); - - b->whiteout_u64s = u64s; - memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s); - } - - verify_no_dups(b, - unwritten_whiteouts_start(c, b), - unwritten_whiteouts_end(c, b)); - - btree_bounce_free(c, order, used_mempool, whiteouts); - - if (mode != COMPACT_WRITTEN_NO_WRITE_LOCK) - bch2_btree_build_aux_trees(b); - - bch_btree_keys_u64s_remaining(c, b); - bch2_verify_btree_nr_keys(b); - - return true; -} - -static bool bch2_drop_whiteouts(struct btree *b) -{ - struct bset_tree *t; - bool ret = false; - - for_each_bset(b, t) { - struct bset *i = bset(b, t); - struct bkey_packed *k, *n, *out, *start, *end; - - if (!should_compact_bset(b, t, true, true)) - continue; - - start = btree_bkey_first(b, t); - end = btree_bkey_last(b, t); - - if (bset_unwritten(b, i) && - t != b->set) { - struct bset *dst = - max_t(struct bset *, write_block(b), - (void *) btree_bkey_last(b, t -1)); - - memmove(dst, i, sizeof(struct bset)); - i = dst; - set_btree_bset(b, t, i); - } - - out = i->start; - - for (k = start; k != end; k = n) { - n = bkey_next(k); - - if (!bkey_whiteout(k)) { + if (!bkey_deleted(k)) { bkey_copy(out, k); out = bkey_next(out); + } else { + BUG_ON(k->needs_whiteout); } } i->u64s = cpu_to_le16((u64 *) out - i->_data); + set_btree_bset_end(b, t); bch2_bset_set_no_aux_tree(b, t); ret = true; } bch2_verify_btree_nr_keys(b); - return ret; -} - -static inline int sort_keys_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - return bkey_cmp_packed(b, l, r) ?: - (int) bkey_whiteout(r) - (int) bkey_whiteout(l) ?: - (int) l->needs_whiteout - (int) r->needs_whiteout; -} - -static unsigned sort_keys(struct bkey_packed *dst, - struct sort_iter *iter, - bool filter_whiteouts) -{ - const struct bkey_format *f = &iter->b->format; - struct bkey_packed *in, *next, *out = dst; - - sort_iter_sort(iter, sort_keys_cmp); - - while ((in = sort_iter_next(iter, sort_keys_cmp))) { - if (bkey_whiteout(in) && - (filter_whiteouts || !in->needs_whiteout)) - continue; - - if (bkey_whiteout(in) && - (next = sort_iter_peek(iter)) && - !bkey_cmp_packed(iter->b, in, next)) { - BUG_ON(in->needs_whiteout && - next->needs_whiteout); - /* - * XXX racy, called with read lock from write path - * - * leads to spurious BUG_ON() in bkey_unpack_key() in - * debug mode - */ - next->needs_whiteout |= in->needs_whiteout; - continue; - } - - if (bkey_whiteout(in)) { - memcpy_u64s(out, in, bkeyp_key_u64s(f, in)); - set_bkeyp_val_u64s(f, out, 0); - } else { - bkey_copy(out, in); - } - out = bkey_next(out); - } - - return (u64 *) out - (u64 *) dst; -} + bch2_btree_build_aux_trees(b); -static inline int sort_extents_cmp(struct btree *b, - struct bkey_packed *l, - struct bkey_packed *r) -{ - return bkey_cmp_packed(b, l, r) ?: - (int) bkey_deleted(l) - (int) bkey_deleted(r); + return ret; } -static unsigned sort_extents(struct bkey_packed *dst, - struct sort_iter *iter, - bool filter_whiteouts) +bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, + enum compact_mode mode) { - struct bkey_packed *in, *out = dst; - - sort_iter_sort(iter, sort_extents_cmp); - - while ((in = sort_iter_next(iter, sort_extents_cmp))) { - if (bkey_deleted(in)) - 
continue; - - if (bkey_whiteout(in) && - (filter_whiteouts || !in->needs_whiteout)) - continue; - - bkey_copy(out, in); - out = bkey_next(out); - } - - return (u64 *) out - (u64 *) dst; + return bch2_drop_whiteouts(b, mode); } static void btree_node_sort(struct bch_fs *c, struct btree *b, - struct btree_iter *iter, unsigned start_idx, unsigned end_idx, bool filter_whiteouts) @@ -557,8 +250,8 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, struct bset_tree *t; struct bset *start_bset = bset(b, &b->set[start_idx]); bool used_mempool = false; - u64 start_time; - unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1; + u64 start_time, seq = 0; + unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1; bool sorting_entire_node = start_idx == 0 && end_idx == b->nsets; @@ -573,40 +266,33 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, btree_bkey_last(b, t)); } - order = sorting_entire_node - ? btree_page_order(c) - : get_order(__vstruct_bytes(struct btree_node, u64s)); + bytes = sorting_entire_node + ? btree_bytes(c) + : __vstruct_bytes(struct btree_node, u64s); - out = btree_bounce_alloc(c, order, &used_mempool); + out = btree_bounce_alloc(c, bytes, &used_mempool); start_time = local_clock(); - if (btree_node_is_extents(b)) - filter_whiteouts = bset_written(b, start_bset); - - u64s = btree_node_is_extents(b) - ? sort_extents(out->keys.start, &sort_iter, filter_whiteouts) - : sort_keys(out->keys.start, &sort_iter, filter_whiteouts); + u64s = bch2_sort_keys(out->keys.start, &sort_iter, filter_whiteouts); out->keys.u64s = cpu_to_le16(u64s); - BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order)); + BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes); if (sorting_entire_node) - bch2_time_stats_update(&c->btree_sort_time, start_time); + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], + start_time); /* Make sure we preserve bset journal_seq: */ - for (t = b->set + start_idx + 1; - t < b->set + end_idx; - t++) - start_bset->journal_seq = - max(start_bset->journal_seq, - bset(b, t)->journal_seq); + for (t = b->set + start_idx; t < b->set + end_idx; t++) + seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq)); + start_bset->journal_seq = cpu_to_le64(seq); if (sorting_entire_node) { unsigned u64s = le16_to_cpu(out->keys.u64s); - BUG_ON(order != btree_page_order(c)); + BUG_ON(bytes != btree_bytes(c)); /* * Our temporary buffer is the same size as the btree node's @@ -641,106 +327,11 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b, set_btree_bset_end(b, &b->set[start_idx]); bch2_bset_set_no_aux_tree(b, &b->set[start_idx]); - btree_bounce_free(c, order, used_mempool, out); + btree_bounce_free(c, bytes, used_mempool, out); bch2_verify_btree_nr_keys(b); } -/* Sort + repack in a new format: */ -static struct btree_nr_keys sort_repack(struct bset *dst, - struct btree *src, - struct btree_node_iter *src_iter, - struct bkey_format *out_f, - bool filter_whiteouts) -{ - struct bkey_format *in_f = &src->format; - struct bkey_packed *in, *out = vstruct_last(dst); - struct btree_nr_keys nr; - - memset(&nr, 0, sizeof(nr)); - - while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { - if (filter_whiteouts && bkey_whiteout(in)) - continue; - - if (bch2_bkey_transform(out_f, out, bkey_packed(in) - ? 
in_f : &bch2_bkey_format_current, in)) - out->format = KEY_FORMAT_LOCAL_BTREE; - else - bch2_bkey_unpack(src, (void *) out, in); - - btree_keys_account_key_add(&nr, 0, out); - out = bkey_next(out); - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - return nr; -} - -/* Sort, repack, and merge: */ -static struct btree_nr_keys sort_repack_merge(struct bch_fs *c, - struct bset *dst, - struct btree *src, - struct btree_node_iter *iter, - struct bkey_format *out_f, - bool filter_whiteouts, - key_filter_fn filter, - key_merge_fn merge) -{ - struct bkey_packed *k, *prev = NULL, *out; - struct btree_nr_keys nr; - BKEY_PADDED(k) tmp; - - memset(&nr, 0, sizeof(nr)); - - while ((k = bch2_btree_node_iter_next_all(iter, src))) { - if (filter_whiteouts && bkey_whiteout(k)) - continue; - - /* - * The filter might modify pointers, so we have to unpack the - * key and values to &tmp.k: - */ - bch2_bkey_unpack(src, &tmp.k, k); - - if (filter && filter(c, src, bkey_i_to_s(&tmp.k))) - continue; - - /* prev is always unpacked, for key merging: */ - - if (prev && - merge && - merge(c, src, (void *) prev, &tmp.k) == BCH_MERGE_MERGE) - continue; - - /* - * the current key becomes the new prev: advance prev, then - * copy the current key - but first pack prev (in place): - */ - if (prev) { - bch2_bkey_pack(prev, (void *) prev, out_f); - - btree_keys_account_key_add(&nr, 0, prev); - prev = bkey_next(prev); - } else { - prev = vstruct_last(dst); - } - - bkey_copy(prev, &tmp.k); - } - - if (prev) { - bch2_bkey_pack(prev, (void *) prev, out_f); - btree_keys_account_key_add(&nr, 0, prev); - out = bkey_next(prev); - } else { - out = vstruct_last(dst); - } - - dst->u64s = cpu_to_le16((u64 *) out - dst->_data); - return nr; -} - void bch2_btree_sort_into(struct bch_fs *c, struct btree *dst, struct btree *src) @@ -753,24 +344,21 @@ void bch2_btree_sort_into(struct bch_fs *c, bch2_bset_set_no_aux_tree(dst, dst->set); - bch2_btree_node_iter_init_from_start(&src_iter, src, - btree_node_is_extents(src)); + bch2_btree_node_iter_init_from_start(&src_iter, src); - if (btree_node_ops(src)->key_normalize || - btree_node_ops(src)->key_merge) - nr = sort_repack_merge(c, btree_bset_first(dst), + if (btree_node_is_extents(src)) + nr = bch2_sort_repack_merge(c, btree_bset_first(dst), src, &src_iter, &dst->format, - true, - btree_node_ops(src)->key_normalize, - btree_node_ops(src)->key_merge); + true); else - nr = sort_repack(btree_bset_first(dst), + nr = bch2_sort_repack(btree_bset_first(dst), src, &src_iter, &dst->format, true); - bch2_time_stats_update(&c->btree_sort_time, start_time); + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], + start_time); set_btree_bset_end(dst, dst->set); @@ -788,8 +376,7 @@ void bch2_btree_sort_into(struct bch_fs *c, * We're about to add another bset to the btree node, so if there's currently * too many bsets - sort some of them together: */ -static bool btree_node_compact(struct bch_fs *c, struct btree *b, - struct btree_iter *iter) +static bool btree_node_compact(struct bch_fs *c, struct btree *b) { unsigned unwritten_idx; bool ret = false; @@ -797,17 +384,17 @@ static bool btree_node_compact(struct bch_fs *c, struct btree *b, for (unwritten_idx = 0; unwritten_idx < b->nsets; unwritten_idx++) - if (bset_unwritten(b, bset(b, &b->set[unwritten_idx]))) + if (!bset_written(b, bset(b, &b->set[unwritten_idx]))) break; if (b->nsets - unwritten_idx > 1) { - btree_node_sort(c, b, iter, unwritten_idx, + btree_node_sort(c, b, unwritten_idx, b->nsets, false); ret = true; } if (unwritten_idx > 
1) { - btree_node_sort(c, b, iter, 0, unwritten_idx, false); + btree_node_sort(c, b, 0, unwritten_idx, false); ret = true; } @@ -820,7 +407,7 @@ void bch2_btree_build_aux_trees(struct btree *b) for_each_bset(b, t) bch2_bset_build_aux_tree(b, t, - bset_unwritten(b, bset(b, t)) && + !bset_written(b, bset(b, t)) && t == bset_tree_last(b)); } @@ -834,122 +421,356 @@ void bch2_btree_build_aux_trees(struct btree *b) * Returns true if we sorted (i.e. invalidated iterators */ void bch2_btree_init_next(struct bch_fs *c, struct btree *b, - struct btree_iter *iter) + struct btree_iter *iter) { struct btree_node_entry *bne; - bool did_sort; + bool reinit_iter = false; + + EBUG_ON(!(b->c.lock.state.seq & 1)); + EBUG_ON(iter && iter->l[b->c.level].b != b); + BUG_ON(bset_written(b, bset(b, &b->set[1]))); + + if (b->nsets == MAX_BSETS) { + unsigned log_u64s[] = { + ilog2(bset_u64s(&b->set[0])), + ilog2(bset_u64s(&b->set[1])), + ilog2(bset_u64s(&b->set[2])), + }; + + if (log_u64s[1] >= (log_u64s[0] + log_u64s[2]) / 2) { + bch2_btree_node_write(c, b, SIX_LOCK_write); + reinit_iter = true; + } + } - EBUG_ON(!(b->lock.state.seq & 1)); - EBUG_ON(iter && iter->nodes[b->level] != b); + if (b->nsets == MAX_BSETS && + btree_node_compact(c, b)) + reinit_iter = true; - did_sort = btree_node_compact(c, b, iter); + BUG_ON(b->nsets >= MAX_BSETS); bne = want_new_bset(c, b); if (bne) - bch2_bset_init_next(b, &bne->keys); + bch2_bset_init_next(c, b, bne); bch2_btree_build_aux_trees(b); - if (iter && did_sort) + if (iter && reinit_iter) bch2_btree_iter_reinit_node(iter, b); } -static struct nonce btree_nonce(struct btree *b, - struct bset *i, - unsigned offset) +static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, + struct btree *b) { - return (struct nonce) {{ - [0] = cpu_to_le32(offset), - [1] = ((__le32 *) &i->seq)[0], - [2] = ((__le32 *) &i->seq)[1], - [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, - }}; + pr_buf(out, "%s level %u/%u\n ", + bch2_btree_ids[b->c.btree_id], + b->c.level, + c->btree_roots[b->c.btree_id].level); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); } -static void bset_encrypt(struct bch_fs *c, struct bset *i, struct nonce nonce) +static void btree_err_msg(struct printbuf *out, struct bch_fs *c, + struct bch_dev *ca, + struct btree *b, struct bset *i, + unsigned offset, int write) { - bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, - vstruct_end(i) - (void *) i->_data); + pr_buf(out, "error validating btree node "); + if (write) + pr_buf(out, "before write "); + if (ca) + pr_buf(out, "on %s ", ca->name); + pr_buf(out, "at btree "); + btree_pos_to_text(out, c, b); + + pr_buf(out, "\n node offset %u", b->written); + if (i) + pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s)); } -#define btree_node_error(b, c, ptr, fmt, ...) \ - bch2_fs_inconsistent(c, \ - "btree node error at btree %u level %u/%u bucket %zu block %u u64s %u: " fmt,\ - (b)->btree_id, (b)->level, btree_node_root(c, b) \ - ? btree_node_root(c, b)->level : -1, \ - PTR_BUCKET_NR(ca, ptr), (b)->written, \ - le16_to_cpu((i)->u64s), ##__VA_ARGS__) - -static const char *validate_bset(struct bch_fs *c, struct btree *b, - struct bch_dev *ca, - const struct bch_extent_ptr *ptr, - struct bset *i, unsigned sectors, - unsigned *whiteout_u64s) +enum btree_err_type { + BTREE_ERR_FIXABLE, + BTREE_ERR_WANT_RETRY, + BTREE_ERR_MUST_RETRY, + BTREE_ERR_FATAL, +}; + +enum btree_validate_ret { + BTREE_RETRY_READ = 64, +}; + +#define btree_err(type, c, ca, b, i, msg, ...) 
\ +({ \ + __label__ out; \ + char _buf[300]; \ + char *_buf2 = _buf; \ + struct printbuf out = PBUF(_buf); \ + \ + _buf2 = kmalloc(4096, GFP_ATOMIC); \ + if (_buf2) \ + out = _PBUF(_buf2, 4986); \ + \ + btree_err_msg(&out, c, ca, b, i, b->written, write); \ + pr_buf(&out, ": " msg, ##__VA_ARGS__); \ + \ + if (type == BTREE_ERR_FIXABLE && \ + write == READ && \ + !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ + mustfix_fsck_err(c, "%s", _buf2); \ + goto out; \ + } \ + \ + switch (write) { \ + case READ: \ + bch_err(c, "%s", _buf2); \ + \ + switch (type) { \ + case BTREE_ERR_FIXABLE: \ + ret = BCH_FSCK_ERRORS_NOT_FIXED; \ + goto fsck_err; \ + case BTREE_ERR_WANT_RETRY: \ + if (have_retry) { \ + ret = BTREE_RETRY_READ; \ + goto fsck_err; \ + } \ + break; \ + case BTREE_ERR_MUST_RETRY: \ + ret = BTREE_RETRY_READ; \ + goto fsck_err; \ + case BTREE_ERR_FATAL: \ + ret = BCH_FSCK_ERRORS_NOT_FIXED; \ + goto fsck_err; \ + } \ + break; \ + case WRITE: \ + bch_err(c, "corrupt metadata before write: %s", _buf2); \ + \ + if (bch2_fs_inconsistent(c)) { \ + ret = BCH_FSCK_ERRORS_NOT_FIXED; \ + goto fsck_err; \ + } \ + break; \ + } \ +out: \ + if (_buf2 != _buf) \ + kfree(_buf2); \ + true; \ +}) + +#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false) + +/* + * When btree topology repair changes the start or end of a node, that might + * mean we have to drop keys that are no longer inside the node: + */ +void bch2_btree_node_drop_keys_outside_node(struct btree *b) { - struct bkey_packed *k, *prev = NULL; - struct bpos prev_pos = POS_MIN; - bool seen_non_whiteout = false; + struct bset_tree *t; + struct bkey_s_c k; + struct bkey unpacked; + struct btree_node_iter iter; + + for_each_bset(b, t) { + struct bset *i = bset(b, t); + struct bkey_packed *k; + + for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) + if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0) + break; + + if (k != i->start) { + unsigned shift = (u64 *) k - (u64 *) i->start; + + memmove_u64s_down(i->start, k, + (u64 *) vstruct_end(i) - (u64 *) k); + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift); + set_btree_bset_end(b, t); + bch2_bset_set_no_aux_tree(b, t); + } - if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION) - return "unsupported bset version"; + for (k = i->start; k != vstruct_last(i); k = bkey_next(k)) + if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0) + break; - if (b->written + sectors > c->sb.btree_node_size) - return "bset past end of btree node"; + if (k != vstruct_last(i)) { + i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start); + set_btree_bset_end(b, t); + bch2_bset_set_no_aux_tree(b, t); + } + } - if (i != &b->data->keys && !i->u64s) - btree_node_error(b, c, ptr, "empty set"); + bch2_btree_build_aux_trees(b); - if (!BSET_SEPARATE_WHITEOUTS(i)) { - seen_non_whiteout = true; - whiteout_u64s = 0; + for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { + BUG_ON(bpos_cmp(k.k->p, b->data->min_key) < 0); + BUG_ON(bpos_cmp(k.k->p, b->data->max_key) > 0); } +} - for (k = i->start; - k != vstruct_last(i);) { - struct bkey_s_c u; - struct bkey tmp; - const char *invalid; +static int validate_bset(struct bch_fs *c, struct bch_dev *ca, + struct btree *b, struct bset *i, + unsigned sectors, int write, bool have_retry) +{ + unsigned version = le16_to_cpu(i->version); + const char *err; + char buf1[100]; + char buf2[100]; + int ret = 0; + + btree_err_on((version != BCH_BSET_VERSION_OLD && + version < bcachefs_metadata_version_min) || + version >= bcachefs_metadata_version_max, + 
BTREE_ERR_FATAL, c, ca, b, i, + "unsupported bset version"); + + if (btree_err_on(version < c->sb.version_min, + BTREE_ERR_FIXABLE, c, NULL, b, i, + "bset version %u older than superblock version_min %u", + version, c->sb.version_min)) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->version_min = cpu_to_le16(version); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } - if (!k->u64s) { - btree_node_error(b, c, ptr, - "KEY_U64s 0: %zu bytes of metadata lost", - vstruct_end(i) - (void *) k); + if (btree_err_on(version > c->sb.version, + BTREE_ERR_FIXABLE, c, NULL, b, i, + "bset version %u newer than superblock version %u", + version, c->sb.version)) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->version = cpu_to_le16(version); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } - i->u64s = cpu_to_le16((u64 *) k - i->_data); - break; + btree_err_on(BSET_SEPARATE_WHITEOUTS(i), + BTREE_ERR_FATAL, c, ca, b, i, + "BSET_SEPARATE_WHITEOUTS no longer supported"); + + if (btree_err_on(b->written + sectors > c->opts.btree_node_size, + BTREE_ERR_FIXABLE, c, ca, b, i, + "bset past end of btree node")) { + i->u64s = 0; + return 0; + } + + btree_err_on(b->written && !i->u64s, + BTREE_ERR_FIXABLE, c, ca, b, i, + "empty bset"); + + if (!b->written) { + struct btree_node *bn = + container_of(i, struct btree_node, keys); + /* These indicate that we read the wrong btree node: */ + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bch_btree_ptr_v2 *bp = + &bkey_i_to_btree_ptr_v2(&b->key)->v; + + /* XXX endianness */ + btree_err_on(bp->seq != bn->keys.seq, + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + "incorrect sequence number (wrong btree node)"); + } + + btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, + BTREE_ERR_MUST_RETRY, c, ca, b, i, + "incorrect btree id"); + + btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, + BTREE_ERR_MUST_RETRY, c, ca, b, i, + "incorrect level"); + + if (!write) + compat_btree_node(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, bn); + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bch_btree_ptr_v2 *bp = + &bkey_i_to_btree_ptr_v2(&b->key)->v; + + if (BTREE_PTR_RANGE_UPDATED(bp)) { + b->data->min_key = bp->min_key; + b->data->max_key = b->key.k.p; + } + + btree_err_on(bpos_cmp(b->data->min_key, bp->min_key), + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + "incorrect min_key: got %s should be %s", + (bch2_bpos_to_text(&PBUF(buf1), bn->min_key), buf1), + (bch2_bpos_to_text(&PBUF(buf2), bp->min_key), buf2)); } - if (bkey_next(k) > vstruct_last(i)) { - btree_node_error(b, c, ptr, - "key extends past end of bset"); + btree_err_on(bpos_cmp(bn->max_key, b->key.k.p), + BTREE_ERR_MUST_RETRY, c, ca, b, i, + "incorrect max key %s", + (bch2_bpos_to_text(&PBUF(buf1), bn->max_key), buf1)); + + if (write) + compat_btree_node(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, bn); + + err = bch2_bkey_format_validate(&bn->format); + btree_err_on(err, + BTREE_ERR_FATAL, c, ca, b, i, + "invalid bkey format: %s", err); + + compat_bformat(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, + &bn->format); + } +fsck_err: + return ret; +} + +static int validate_bset_keys(struct bch_fs *c, struct btree *b, + struct bset *i, unsigned *whiteout_u64s, + int write, bool have_retry) +{ + unsigned version = le16_to_cpu(i->version); + struct bkey_packed *k, *prev = NULL; + bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && + BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); + int ret = 0; + + for (k = i->start; + k != 
vstruct_last(i);) { + struct bkey_s u; + struct bkey tmp; + const char *invalid; + if (btree_err_on(bkey_next(k) > vstruct_last(i), + BTREE_ERR_FIXABLE, c, NULL, b, i, + "key extends past end of bset")) { i->u64s = cpu_to_le16((u64 *) k - i->_data); break; } - if (k->format > KEY_FORMAT_CURRENT) { - btree_node_error(b, c, ptr, - "invalid bkey format %u", k->format); - + if (btree_err_on(k->format > KEY_FORMAT_CURRENT, + BTREE_ERR_FIXABLE, c, NULL, b, i, + "invalid bkey format %u", k->format)) { i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_next(k), (u64 *) vstruct_end(i) - (u64 *) k); continue; } - if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) - bch2_bkey_swab(btree_node_type(b), &b->format, k); + /* XXX: validate k->u64s */ + if (!write) + bch2_bkey_compat(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, + &b->format, k); - u = bkey_disassemble(b, k, &tmp); + u = __bkey_disassemble(b, k, &tmp); - invalid = bch2_btree_bkey_invalid(c, b, u); + invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b)) ?: + (!updated_range ? bch2_bkey_in_btree_node(b, u.s_c) : NULL) ?: + (write ? bch2_bkey_val_invalid(c, u.s_c) : NULL); if (invalid) { char buf[160]; - bch2_bkey_val_to_text(c, btree_node_type(b), - buf, sizeof(buf), u); - btree_node_error(b, c, ptr, - "invalid bkey %s: %s", buf, invalid); + bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); + btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, + "invalid bkey: %s\n%s", invalid, buf); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); memmove_u64s_down(k, bkey_next(k), @@ -957,263 +778,680 @@ static const char *validate_bset(struct bch_fs *c, struct btree *b, continue; } - /* - * with the separate whiteouts thing (used for extents), the - * second set of keys actually can have whiteouts too, so we - * can't solely go off bkey_whiteout()... 
- */ + if (write) + bch2_bkey_compat(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, + &b->format, k); + + if (prev && bkey_iter_cmp(b, prev, k) > 0) { + char buf1[80]; + char buf2[80]; + struct bkey up = bkey_unpack_key(b, prev); - if (!seen_non_whiteout && - (!bkey_whiteout(k) || - (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0))) { - *whiteout_u64s = k->_data - i->_data; - seen_non_whiteout = true; - } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) { - btree_node_error(b, c, ptr, - "keys out of order: %llu:%llu > %llu:%llu", - prev_pos.inode, - prev_pos.offset, - u.k->p.inode, - bkey_start_offset(u.k)); - /* XXX: repair this */ + bch2_bkey_to_text(&PBUF(buf1), &up); + bch2_bkey_to_text(&PBUF(buf2), u.k); + + bch2_dump_bset(c, b, i, 0); + + if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, + "keys out of order: %s > %s", + buf1, buf2)) { + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); + continue; + } } - prev_pos = u.k->p; prev = k; k = bkey_next(k); } - - SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); - return NULL; -} - -static bool extent_contains_ptr(struct bkey_s_c_extent e, - struct bch_extent_ptr match) -{ - const struct bch_extent_ptr *ptr; - - extent_for_each_ptr(e, ptr) - if (!memcmp(ptr, &match, sizeof(*ptr))) - return true; - - return false; +fsck_err: + return ret; } -void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, - struct bch_dev *ca, - const struct bch_extent_ptr *ptr) +int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + struct btree *b, bool have_retry) { struct btree_node_entry *bne; - struct bset *i = &b->data->keys; - struct btree_node_iter *iter; + struct sort_iter *iter; struct btree_node *sorted; - bool used_mempool; + struct bkey_packed *k; + struct bch_extent_ptr *ptr; + struct bset *i; + bool used_mempool, blacklisted; + bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && + BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); unsigned u64s; - const char *err; - struct bch_csum csum; - struct nonce nonce; - int ret; + unsigned nonblacklisted_written = 0; + int ret, retry_read = 0, write = READ; + + b->version_ondisk = U16_MAX; iter = mempool_alloc(&c->fill_iter, GFP_NOIO); - __bch2_btree_node_iter_init(iter, btree_node_is_extents(b)); + sort_iter_init(iter, b); + iter->size = (btree_blocks(c) + 1) * 2; - err = "dynamic fault"; if (bch2_meta_read_fault("btree")) - goto err; + btree_err(BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + "dynamic fault"); + + btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + "bad magic"); + + btree_err_on(!b->data->keys.seq, + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + "bad btree header"); + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bch_btree_ptr_v2 *bp = + &bkey_i_to_btree_ptr_v2(&b->key)->v; - while (b->written < c->sb.btree_node_size) { + btree_err_on(b->data->keys.seq != bp->seq, + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + "got wrong btree node (seq %llx want %llx)", + b->data->keys.seq, bp->seq); + } + + while (b->written < c->opts.btree_node_size) { unsigned sectors, whiteout_u64s = 0; + struct nonce nonce; + struct bch_csum csum; + bool first = !b->written; if (!b->written) { i = &b->data->keys; - err = "bad magic"; - if (le64_to_cpu(b->data->magic) != bset_magic(c)) - goto err; + btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), + BTREE_ERR_WANT_RETRY, c, ca, b, i, + "unknown checksum type %llu", + 
BSET_CSUM_TYPE(i)); - err = "bad btree header"; - if (!b->data->keys.seq) - goto err; + nonce = btree_nonce(i, b->written << 9); + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); - err = "unknown checksum type"; - if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) - goto err; + btree_err_on(bch2_crc_cmp(csum, b->data->csum), + BTREE_ERR_WANT_RETRY, c, ca, b, i, + "invalid checksum"); - /* XXX: retry checksum errors */ + bset_encrypt(c, i, b->written << 9); - nonce = btree_nonce(b, i, b->written << 9); - csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); + btree_err_on(btree_node_is_extents(b) && + !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), + BTREE_ERR_FATAL, c, NULL, b, NULL, + "btree node does not have NEW_EXTENT_OVERWRITE set"); - err = "bad checksum"; - if (bch2_crc_cmp(csum, b->data->csum)) - goto err; + sectors = vstruct_sectors(b->data, c->block_bits); + } else { + bne = write_block(b); + i = &bne->keys; - bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, - &b->data->flags, - (void *) &b->data->keys - - (void *) &b->data->flags); - nonce = nonce_add(nonce, - round_up((void *) &b->data->keys - - (void *) &b->data->flags, - CHACHA20_BLOCK_SIZE)); - bset_encrypt(c, i, nonce); + if (i->seq != b->data->keys.seq) + break; - sectors = vstruct_sectors(b->data, c->block_bits); + btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), + BTREE_ERR_WANT_RETRY, c, ca, b, i, + "unknown checksum type %llu", + BSET_CSUM_TYPE(i)); + + nonce = btree_nonce(i, b->written << 9); + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); - if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) { - u64 *p = (u64 *) &b->data->ptr; + btree_err_on(bch2_crc_cmp(csum, bne->csum), + BTREE_ERR_WANT_RETRY, c, ca, b, i, + "invalid checksum"); - *p = swab64(*p); - bch2_bpos_swab(&b->data->min_key); - bch2_bpos_swab(&b->data->max_key); - } + bset_encrypt(c, i, b->written << 9); + + sectors = vstruct_sectors(bne, c->block_bits); + } + + b->version_ondisk = min(b->version_ondisk, + le16_to_cpu(i->version)); + + ret = validate_bset(c, ca, b, i, sectors, + READ, have_retry); + if (ret) + goto fsck_err; + + if (!b->written) + btree_node_set_format(b, b->data->format); + + ret = validate_bset_keys(c, b, i, &whiteout_u64s, + READ, have_retry); + if (ret) + goto fsck_err; + + SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); + + b->written += sectors; + + blacklisted = bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(i->journal_seq), + true); + + btree_err_on(blacklisted && first, + BTREE_ERR_FIXABLE, c, ca, b, i, + "first btree node bset has blacklisted journal seq"); + if (blacklisted && !first) + continue; + + sort_iter_add(iter, i->start, + vstruct_idx(i, whiteout_u64s)); + + sort_iter_add(iter, + vstruct_idx(i, whiteout_u64s), + vstruct_last(i)); + + nonblacklisted_written = b->written; + } + + for (bne = write_block(b); + bset_byte_offset(b, bne) < btree_bytes(c); + bne = (void *) bne + block_bytes(c)) + btree_err_on(bne->keys.seq == b->data->keys.seq && + !bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), + true), + BTREE_ERR_WANT_RETRY, c, ca, b, NULL, + "found bset signature after last bset"); + + /* + * Blacklisted bsets are those that were written after the most recent + * (flush) journal write. 
Since there wasn't a flush, they may not have + * made it to all devices - which means we shouldn't write new bsets + * after them, as that could leave a gap and then reads from that device + * wouldn't find all the bsets in that btree node - which means it's + * important that we start writing new bsets after the most recent _non_ + * blacklisted bset: + */ + b->written = nonblacklisted_written; + + sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); + sorted->keys.u64s = 0; + + set_btree_bset(b, b->set, &b->data->keys); + + b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter); + + u64s = le16_to_cpu(sorted->keys.u64s); + *sorted = *b->data; + sorted->keys.u64s = cpu_to_le16(u64s); + swap(sorted, b->data); + set_btree_bset(b, b->set, &b->data->keys); + b->nsets = 1; + + BUG_ON(b->nr.live_u64s != u64s); + + btree_bounce_free(c, btree_bytes(c), used_mempool, sorted); + + if (updated_range) + bch2_btree_node_drop_keys_outside_node(b); + + i = &b->data->keys; + for (k = i->start; k != vstruct_last(i);) { + struct bkey tmp; + struct bkey_s u = __bkey_disassemble(b, k, &tmp); + const char *invalid = bch2_bkey_val_invalid(c, u.s_c); + + if (invalid || + (bch2_inject_invalid_keys && + !bversion_cmp(u.k->version, MAX_VERSION))) { + char buf[160]; + + bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); + btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, + "invalid bkey %s: %s", buf, invalid); + + btree_keys_account_key_drop(&b->nr, 0, k); + + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); + set_btree_bset_end(b, b->set); + continue; + } + + if (u.k->type == KEY_TYPE_btree_ptr_v2) { + struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u); + + bp.v->mem_ptr = 0; + } + + k = bkey_next(k); + } + + bch2_bset_build_aux_tree(b, b->set, false); + + set_needs_whiteout(btree_bset_first(b), true); + + btree_node_reset_sib_u64s(b); + + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + if (ca->mi.state != BCH_MEMBER_STATE_rw) + set_btree_node_need_rewrite(b); + } +out: + mempool_free(iter, &c->fill_iter); + return retry_read; +fsck_err: + if (ret == BTREE_RETRY_READ) { + retry_read = 1; + } else { + bch2_inconsistent_error(c); + set_btree_node_read_error(b); + } + goto out; +} + +static void btree_node_read_work(struct work_struct *work) +{ + struct btree_read_bio *rb = + container_of(work, struct btree_read_bio, work); + struct bch_fs *c = rb->c; + struct btree *b = rb->b; + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + struct bio *bio = &rb->bio; + struct bch_io_failures failed = { .nr = 0 }; + char buf[200]; + struct printbuf out; + bool saw_error = false; + bool can_retry; + + goto start; + while (1) { + bch_info(c, "retrying read"); + ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + rb->have_ioref = bch2_dev_get_ioref(ca, READ); + bio_reset(bio); + bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; + bio->bi_iter.bi_sector = rb->pick.ptr.offset; + bio->bi_iter.bi_size = btree_bytes(c); + + if (rb->have_ioref) { + bio_set_dev(bio, ca->disk_sb.bdev); + submit_bio_wait(bio); + } else { + bio->bi_status = BLK_STS_REMOVED; + } +start: + out = PBUF(buf); + btree_pos_to_text(&out, c, b); + bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s", + bch2_blk_status_to_str(bio->bi_status), buf); + if (rb->have_ioref) + percpu_ref_put(&ca->io_ref); + rb->have_ioref = false; + + bch2_mark_io_failure(&failed, &rb->pick); + + 
can_retry = bch2_bkey_pick_read_device(c, + bkey_i_to_s_c(&b->key), + &failed, &rb->pick) > 0; + + if (!bio->bi_status && + !bch2_btree_node_read_done(c, ca, b, can_retry)) + break; + + saw_error = true; + + if (!can_retry) { + set_btree_node_read_error(b); + break; + } + } + + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], + rb->start_time); + bio_put(&rb->bio); + + if (saw_error && !btree_node_read_error(b)) + bch2_btree_node_rewrite_async(c, b); + + clear_btree_node_read_in_flight(b); + wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); +} + +static void btree_node_read_endio(struct bio *bio) +{ + struct btree_read_bio *rb = + container_of(bio, struct btree_read_bio, bio); + struct bch_fs *c = rb->c; + + if (rb->have_ioref) { + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + bch2_latency_acct(ca, rb->start_time, READ); + } + + queue_work(c->io_complete_wq, &rb->work); +} + +struct btree_node_read_all { + struct closure cl; + struct bch_fs *c; + struct btree *b; + unsigned nr; + void *buf[BCH_REPLICAS_MAX]; + struct bio *bio[BCH_REPLICAS_MAX]; + int err[BCH_REPLICAS_MAX]; +}; + +static unsigned btree_node_sectors_written(struct bch_fs *c, void *data) +{ + struct btree_node *bn = data; + struct btree_node_entry *bne; + unsigned offset = 0; + + if (le64_to_cpu(bn->magic) != bset_magic(c)) + return 0; + + while (offset < c->opts.btree_node_size) { + if (!offset) { + offset += vstruct_sectors(bn, c->block_bits); + } else { + bne = data + (offset << 9); + if (bne->keys.seq != bn->keys.seq) + break; + offset += vstruct_sectors(bne, c->block_bits); + } + } - err = "incorrect btree id"; - if (BTREE_NODE_ID(b->data) != b->btree_id) - goto err; + return offset; +} - err = "incorrect level"; - if (BTREE_NODE_LEVEL(b->data) != b->level) - goto err; +static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data) +{ + struct btree_node *bn = data; + struct btree_node_entry *bne; - err = "incorrect max key"; - if (bkey_cmp(b->data->max_key, b->key.k.p)) - goto err; + if (!offset) + return false; - err = "incorrect backpointer"; - if (!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key), - b->data->ptr)) - goto err; + while (offset < c->opts.btree_node_size) { + bne = data + (offset << 9); + if (bne->keys.seq == bn->keys.seq) + return true; + offset++; + } - err = bch2_bkey_format_validate(&b->data->format); - if (err) - goto err; + return false; + return offset; +} - set_btree_bset(b, b->set, &b->data->keys); +static void btree_node_read_all_replicas_done(struct closure *cl) +{ + struct btree_node_read_all *ra = + container_of(cl, struct btree_node_read_all, cl); + struct bch_fs *c = ra->c; + struct btree *b = ra->b; + bool dump_bset_maps = false; + bool have_retry = false; + int ret = 0, best = -1, write = READ; + unsigned i, written, written2; + __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2 + ? 
bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; + + for (i = 0; i < ra->nr; i++) { + struct btree_node *bn = ra->buf[i]; + + if (ra->err[i]) + continue; - btree_node_set_format(b, b->data->format); - } else { - bne = write_block(b); - i = &bne->keys; + if (le64_to_cpu(bn->magic) != bset_magic(c) || + (seq && seq != bn->keys.seq)) + continue; - if (i->seq != b->data->keys.seq) - break; + if (best < 0) { + best = i; + written = btree_node_sectors_written(c, bn); + continue; + } - err = "unknown checksum type"; - if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) - goto err; + written2 = btree_node_sectors_written(c, ra->buf[i]); + if (btree_err_on(written2 != written, BTREE_ERR_FIXABLE, c, NULL, b, NULL, + "btree node sectors written mismatch: %u != %u", + written, written2) || + btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), + BTREE_ERR_FIXABLE, c, NULL, b, NULL, + "found bset signature after last bset") || + btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), + BTREE_ERR_FIXABLE, c, NULL, b, NULL, + "btree node replicas content mismatch")) + dump_bset_maps = true; + + if (written2 > written) { + written = written2; + best = i; + } + } +fsck_err: + if (dump_bset_maps) { + for (i = 0; i < ra->nr; i++) { + char buf[200]; + struct printbuf out = PBUF(buf); + struct btree_node *bn = ra->buf[i]; + struct btree_node_entry *bne = NULL; + unsigned offset = 0, sectors; + bool gap = false; + + if (ra->err[i]) + continue; - nonce = btree_nonce(b, i, b->written << 9); - csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + while (offset < c->opts.btree_node_size) { + if (!offset) { + sectors = vstruct_sectors(bn, c->block_bits); + } else { + bne = ra->buf[i] + (offset << 9); + if (bne->keys.seq != bn->keys.seq) + break; + sectors = vstruct_sectors(bne, c->block_bits); + } - err = "bad checksum"; - if (memcmp(&csum, &bne->csum, sizeof(csum))) - goto err; + pr_buf(&out, " %u-%u", offset, offset + sectors); + if (bne && bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), false)) + pr_buf(&out, "*"); + offset += sectors; + } - bset_encrypt(c, i, nonce); + while (offset < c->opts.btree_node_size) { + bne = ra->buf[i] + (offset << 9); + if (bne->keys.seq == bn->keys.seq) { + if (!gap) + pr_buf(&out, " GAP"); + gap = true; + + sectors = vstruct_sectors(bne, c->block_bits); + pr_buf(&out, " %u-%u", offset, offset + sectors); + if (bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), false)) + pr_buf(&out, "*"); + } + offset++; + } - sectors = vstruct_sectors(bne, c->block_bits); + bch_err(c, "replica %u:%s", i, buf); } + } - err = validate_bset(c, b, ca, ptr, i, sectors, &whiteout_u64s); - if (err) - goto err; + if (best >= 0) { + memcpy(b->data, ra->buf[best], btree_bytes(c)); + ret = bch2_btree_node_read_done(c, NULL, b, false); + } else { + ret = -1; + } - b->written += sectors; + if (ret) + set_btree_node_read_error(b); - err = "insufficient memory"; - ret = bch2_journal_seq_should_ignore(c, le64_to_cpu(i->journal_seq), b); - if (ret < 0) - goto err; + for (i = 0; i < ra->nr; i++) { + mempool_free(ra->buf[i], &c->btree_bounce_pool); + bio_put(ra->bio[i]); + } - if (ret) - continue; + closure_debug_destroy(&ra->cl); + kfree(ra); - __bch2_btree_node_iter_push(iter, b, - i->start, - vstruct_idx(i, whiteout_u64s)); + clear_btree_node_read_in_flight(b); + wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); +} - __bch2_btree_node_iter_push(iter, b, - vstruct_idx(i, whiteout_u64s), - vstruct_last(i)); +static void 
btree_node_read_all_replicas_endio(struct bio *bio) +{ + struct btree_read_bio *rb = + container_of(bio, struct btree_read_bio, bio); + struct bch_fs *c = rb->c; + struct btree_node_read_all *ra = rb->ra; + + if (rb->have_ioref) { + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + bch2_latency_acct(ca, rb->start_time, READ); } - err = "corrupted btree"; - for (bne = write_block(b); - bset_byte_offset(b, bne) < btree_bytes(c); - bne = (void *) bne + block_bytes(c)) - if (bne->keys.seq == b->data->keys.seq) - goto err; + ra->err[rb->idx] = bio->bi_status; + closure_put(&ra->cl); +} - sorted = btree_bounce_alloc(c, ilog2(btree_pages(c)), &used_mempool); - sorted->keys.u64s = 0; +/* + * XXX This allocates multiple times from the same mempools, and can deadlock + * under sufficient memory pressure (but is only a debug path) + */ +static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync) +{ + struct bkey_s_c k = bkey_i_to_s_c(&b->key); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded pick; + struct btree_node_read_all *ra; + unsigned i; - b->nr = btree_node_is_extents(b) - ? bch2_extent_sort_fix_overlapping(c, &sorted->keys, b, iter) - : bch2_key_sort_fix_overlapping(&sorted->keys, b, iter); + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; - u64s = le16_to_cpu(sorted->keys.u64s); - *sorted = *b->data; - sorted->keys.u64s = cpu_to_le16(u64s); - swap(sorted, b->data); - set_btree_bset(b, b->set, &b->data->keys); - b->nsets = 1; + closure_init(&ra->cl, NULL); + ra->c = c; + ra->b = b; + ra->nr = bch2_bkey_nr_ptrs(k); - BUG_ON(b->nr.live_u64s != u64s); + for (i = 0; i < ra->nr; i++) { + ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); + ra->bio[i] = bio_alloc_bioset(GFP_NOFS, buf_pages(ra->buf[i], + btree_bytes(c)), + &c->btree_bio); + } - btree_bounce_free(c, ilog2(btree_pages(c)), used_mempool, sorted); + i = 0; + bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); + struct btree_read_bio *rb = + container_of(ra->bio[i], struct btree_read_bio, bio); + rb->c = c; + rb->b = b; + rb->ra = ra; + rb->start_time = local_clock(); + rb->have_ioref = bch2_dev_get_ioref(ca, READ); + rb->idx = i; + rb->pick = pick; + rb->bio.bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; + rb->bio.bi_iter.bi_sector = pick.ptr.offset; + rb->bio.bi_end_io = btree_node_read_all_replicas_endio; + bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c)); + + if (rb->have_ioref) { + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], + bio_sectors(&rb->bio)); + bio_set_dev(&rb->bio, ca->disk_sb.bdev); + + closure_get(&ra->cl); + submit_bio(&rb->bio); + } else { + ra->err[i] = BLK_STS_REMOVED; + } - bch2_bset_build_aux_tree(b, b->set, false); + i++; + } - set_needs_whiteout(btree_bset_first(b)); + if (sync) { + closure_sync(&ra->cl); + btree_node_read_all_replicas_done(&ra->cl); + } else { + continue_at(&ra->cl, btree_node_read_all_replicas_done, + c->io_complete_wq); + } - btree_node_reset_sib_u64s(b); -out: - mempool_free(iter, &c->fill_iter); - return; -err: - set_btree_node_read_error(b); - btree_node_error(b, c, ptr, "%s", err); - goto out; + return 0; } -void bch2_btree_node_read(struct bch_fs *c, struct btree *b) +void bch2_btree_node_read(struct bch_fs *c, struct btree *b, + bool sync) { - uint64_t start_time = local_clock(); + struct extent_ptr_decoded pick; + struct btree_read_bio *rb; + struct bch_dev *ca; struct bio *bio; - 
struct extent_pick_ptr pick; + char buf[200]; + int ret; + btree_pos_to_text(&PBUF(buf), c, b); trace_btree_read(c, b); - pick = bch2_btree_pick_ptr(c, b); - if (bch2_fs_fatal_err_on(!pick.ca, c, - "btree node read error: no device to read from")) { + set_btree_node_read_in_flight(b); + + if (bch2_verify_all_btree_replicas && + !btree_node_read_all_replicas(c, b, sync)) + return; + + ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), + NULL, &pick); + if (bch2_fs_fatal_err_on(ret <= 0, c, + "btree node read error: no device to read from\n" + " at %s", buf)) { set_btree_node_read_error(b); return; } - bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio); - bio->bi_bdev = pick.ca->disk_sb.bdev; + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + + bio = bio_alloc_bioset(GFP_NOIO, buf_pages(b->data, + btree_bytes(c)), + &c->btree_bio); + rb = container_of(bio, struct btree_read_bio, bio); + rb->c = c; + rb->b = b; + rb->ra = NULL; + rb->start_time = local_clock(); + rb->have_ioref = bch2_dev_get_ioref(ca, READ); + rb->pick = pick; + INIT_WORK(&rb->work, btree_node_read_work); + bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; bio->bi_iter.bi_sector = pick.ptr.offset; - bio->bi_iter.bi_size = btree_bytes(c); - bio_set_op_attrs(bio, REQ_OP_READ, REQ_META|READ_SYNC); - bch2_bio_map(bio, b->data); + bio->bi_end_io = btree_node_read_endio; + bch2_bio_map(bio, b->data, btree_bytes(c)); - submit_bio_wait(bio); + if (rb->have_ioref) { + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], + bio_sectors(bio)); + bio_set_dev(bio, ca->disk_sb.bdev); - if (bch2_dev_fatal_io_err_on(bio->bi_error, - pick.ca, "IO error reading bucket %zu", - PTR_BUCKET_NR(pick.ca, &pick.ptr)) || - bch2_meta_read_fault("btree")) { - set_btree_node_read_error(b); - goto out; - } + if (sync) { + submit_bio_wait(bio); - bch2_btree_node_read_done(c, b, pick.ca, &pick.ptr); - bch2_time_stats_update(&c->btree_read_time, start_time); -out: - bio_put(bio); - percpu_ref_put(&pick.ca->io_ref); + btree_node_read_work(&rb->work); + } else { + submit_bio(bio); + } + } else { + bio->bi_status = BLK_STS_REMOVED; + + if (sync) + btree_node_read_work(&rb->work); + else + queue_work(c->io_complete_wq, &rb->work); + } } int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, @@ -1226,111 +1464,248 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, closure_init_stack(&cl); do { - ret = bch2_btree_node_cannibalize_lock(c, &cl); + ret = bch2_btree_cache_cannibalize_lock(c, &cl); closure_sync(&cl); } while (ret); b = bch2_btree_node_mem_alloc(c); - bch2_btree_node_cannibalize_unlock(c); + bch2_btree_cache_cannibalize_unlock(c); BUG_ON(IS_ERR(b)); bkey_copy(&b->key, k); - BUG_ON(bch2_btree_node_hash_insert(c, b, level, id)); + BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id)); - bch2_btree_node_read(c, b); - six_unlock_write(&b->lock); + bch2_btree_node_read(c, b, true); if (btree_node_read_error(b)) { - six_unlock_intent(&b->lock); - return -EIO; + bch2_btree_node_hash_remove(&c->btree_cache, b); + + mutex_lock(&c->btree_cache.lock); + list_move(&b->list, &c->btree_cache.freeable); + mutex_unlock(&c->btree_cache.lock); + + ret = -EIO; + goto err; } - bch2_btree_set_root_initial(c, b, NULL); - six_unlock_intent(&b->lock); + bch2_btree_set_root_for_read(c, b); +err: + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); - return 0; + return ret; } void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, struct btree_write *w) { + unsigned long old, new, v = 
READ_ONCE(b->will_make_reachable); + + do { + old = new = v; + if (!(old & 1)) + break; + + new &= ~1UL; + } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old); + + if (old & 1) + closure_put(&((struct btree_update *) new)->cl); + bch2_journal_pin_drop(&c->journal, &w->journal); - closure_wake_up(&w->wait); } static void btree_node_write_done(struct bch_fs *c, struct btree *b) { struct btree_write *w = btree_prev_write(b); - /* - * Before calling bch2_btree_complete_write() - if the write errored, we - * have to halt new journal writes before they see this btree node - * write as completed: - */ - if (btree_node_write_error(b)) - bch2_journal_halt(&c->journal); - bch2_btree_complete_write(c, b, w); btree_node_io_unlock(b); } +static void bch2_btree_node_write_error(struct bch_fs *c, + struct btree_write_bio *wbio) +{ + struct btree *b = wbio->wbio.bio.bi_private; + struct bkey_buf k; + struct bch_extent_ptr *ptr; + struct btree_trans trans; + struct btree_iter *iter; + int ret; + + bch2_bkey_buf_init(&k); + bch2_trans_init(&trans, c, 0, 0); + + iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p, + BTREE_MAX_DEPTH, b->c.level, 0); +retry: + ret = bch2_btree_iter_traverse(iter); + if (ret) + goto err; + + /* has node been freed? */ + if (iter->l[b->c.level].b != b) { + /* node has been freed: */ + BUG_ON(!btree_node_dying(b)); + goto out; + } + + BUG_ON(!btree_node_hashed(b)); + + bch2_bkey_buf_copy(&k, c, &b->key); + + bch2_bkey_drop_ptrs(bkey_i_to_s(k.k), ptr, + bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); + + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(k.k))) + goto err; + + ret = bch2_btree_node_update_key(c, iter, b, k.k); + if (ret == -EINTR) + goto retry; + if (ret) + goto err; +out: + bch2_trans_iter_put(&trans, iter); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&k, c); + bio_put(&wbio->wbio.bio); + btree_node_write_done(c, b); + return; +err: + set_btree_node_noevict(b); + bch2_fs_fatal_error(c, "fatal error writing btree node"); + goto out; +} + +void bch2_btree_write_error_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, + btree_write_error_work); + struct bio *bio; + + while (1) { + spin_lock_irq(&c->btree_write_error_lock); + bio = bio_list_pop(&c->btree_write_error_list); + spin_unlock_irq(&c->btree_write_error_lock); + + if (!bio) + break; + + bch2_btree_node_write_error(c, + container_of(bio, struct btree_write_bio, wbio.bio)); + } +} + +static void btree_node_write_work(struct work_struct *work) +{ + struct btree_write_bio *wbio = + container_of(work, struct btree_write_bio, work); + struct bch_fs *c = wbio->wbio.c; + struct btree *b = wbio->wbio.bio.bi_private; + + btree_bounce_free(c, + wbio->bytes, + wbio->wbio.used_mempool, + wbio->data); + + if (wbio->wbio.failed.nr) { + unsigned long flags; + + spin_lock_irqsave(&c->btree_write_error_lock, flags); + bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio); + spin_unlock_irqrestore(&c->btree_write_error_lock, flags); + + queue_work(c->btree_error_wq, &c->btree_write_error_work); + return; + } + + bio_put(&wbio->wbio.bio); + btree_node_write_done(c, b); +} + static void btree_node_write_endio(struct bio *bio) { - struct btree *b = bio->bi_private; - struct bch_write_bio *wbio = to_wbio(bio); - struct bch_fs *c = wbio->c; - struct bio *orig = wbio->split ? wbio->orig : NULL; - struct closure *cl = !wbio->split ? 
wbio->cl : NULL; - struct bch_dev *ca = wbio->ca; - - if (bch2_dev_fatal_io_err_on(bio->bi_error, ca, "btree write") || - bch2_meta_write_fault("btree")) - set_btree_node_write_error(b); - - if (wbio->bounce) - btree_bounce_free(c, - wbio->order, - wbio->used_mempool, - page_address(bio->bi_io_vec[0].bv_page)); - - if (wbio->put_bio) - bio_put(bio); + struct bch_write_bio *wbio = to_wbio(bio); + struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; + struct bch_write_bio *orig = parent ?: wbio; + struct bch_fs *c = wbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); + unsigned long flags; + + if (wbio->have_ioref) + bch2_latency_acct(ca, wbio->submit_time, WRITE); + + if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s", + bch2_blk_status_to_str(bio->bi_status)) || + bch2_meta_write_fault("btree")) { + spin_lock_irqsave(&c->btree_write_error_lock, flags); + bch2_dev_list_add_dev(&orig->failed, wbio->dev); + spin_unlock_irqrestore(&c->btree_write_error_lock, flags); + } - if (orig) { - bio_endio(orig); + if (wbio->have_ioref) + percpu_ref_put(&ca->io_ref); + + if (parent) { + bio_put(bio); + bio_endio(&parent->bio); } else { - btree_node_write_done(c, b); - if (cl) - closure_put(cl); + struct btree_write_bio *wb = + container_of(orig, struct btree_write_bio, wbio); + + INIT_WORK(&wb->work, btree_node_write_work); + queue_work(c->io_complete_wq, &wb->work); } +} - if (ca) - percpu_ref_put(&ca->io_ref); +static int validate_bset_for_write(struct bch_fs *c, struct btree *b, + struct bset *i, unsigned sectors) +{ + unsigned whiteout_u64s = 0; + int ret; + + if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_btree)) + return -1; + + ret = validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false) ?: + validate_bset(c, NULL, b, i, sectors, WRITE, false); + if (ret) { + bch2_inconsistent_error(c); + dump_stack(); + } + + return ret; } -void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, - struct closure *parent, - enum six_lock_type lock_type_held, - int idx_to_write) +static void btree_write_submit(struct work_struct *work) { - struct bio *bio; - struct bch_write_bio *wbio; + struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work); + + bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &wbio->key); +} + +void __bch2_btree_node_write(struct bch_fs *c, struct btree *b) +{ + struct btree_write_bio *wbio; struct bset_tree *t; struct bset *i; struct btree_node *bn = NULL; struct btree_node_entry *bne = NULL; - BKEY_PADDED(key) k; - struct bkey_s_extent e; struct bch_extent_ptr *ptr; struct sort_iter sort_iter; struct nonce nonce; - unsigned bytes_to_write, sectors_to_write, order, bytes, u64s; + unsigned bytes_to_write, sectors_to_write, bytes, u64s; u64 seq = 0; bool used_mempool; unsigned long old, new; + bool validate_before_checksum = false; void *data; + if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) + return; + /* * We may only have a read lock on the btree node - the dirty bit is our * "lock" against racing with other threads that may be trying to start @@ -1344,39 +1719,40 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, if (!(old & (1 << BTREE_NODE_dirty))) return; - if (idx_to_write >= 0 && - idx_to_write != !!(old & (1 << BTREE_NODE_write_idx))) + if (!btree_node_may_write(b)) + return; + + if (old & (1 << BTREE_NODE_never_write)) return; if (old & (1 << BTREE_NODE_write_in_flight)) { - wait_on_bit_io(&b->flags, - BTREE_NODE_write_in_flight, - TASK_UNINTERRUPTIBLE); + 
/* + * XXX waiting on btree writes with btree locks held - + * this can deadlock, and we hit the write error path + */ + btree_node_wait_on_io(b); continue; } new &= ~(1 << BTREE_NODE_dirty); + new &= ~(1 << BTREE_NODE_need_write); new |= (1 << BTREE_NODE_write_in_flight); new |= (1 << BTREE_NODE_just_written); new ^= (1 << BTREE_NODE_write_idx); } while (cmpxchg_acquire(&b->flags, old, new) != old); - BUG_ON(!list_empty(&b->write_blocked)); + atomic_dec(&c->btree_cache.dirty); + + BUG_ON(btree_node_fake(b)); + BUG_ON((b->will_make_reachable != 0) != !b->written); - BUG_ON(b->written >= c->sb.btree_node_size); + BUG_ON(b->written >= c->opts.btree_node_size); + BUG_ON(b->written & (c->opts.block_size - 1)); BUG_ON(bset_written(b, btree_bset_last(b))); BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); - if (lock_type_held == SIX_LOCK_intent) { - six_lock_write(&b->lock); - __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN); - six_unlock_write(&b->lock); - } else { - __bch2_compact_whiteouts(c, b, COMPACT_WRITTEN_NO_WRITE_LOCK); - } - - BUG_ON(b->uncompacted_whiteout_u64s); + bch2_sort_whiteouts(c, b); sort_iter_init(&sort_iter, b); @@ -1399,8 +1775,15 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, seq = max(seq, le64_to_cpu(i->journal_seq)); } - order = get_order(bytes); - data = btree_bounce_alloc(c, order, &used_mempool); + BUG_ON(b->written && !seq); + + /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */ + bytes += 8; + + /* buffer must be a multiple of the block size */ + bytes = round_up(bytes, block_bytes(c)); + + data = btree_bounce_alloc(c, bytes, &used_mempool); if (!b->written) { bn = data; @@ -1415,71 +1798,62 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, i->journal_seq = cpu_to_le64(seq); i->u64s = 0; - if (!btree_node_is_extents(b)) { - sort_iter_add(&sort_iter, - unwritten_whiteouts_start(c, b), - unwritten_whiteouts_end(c, b)); - SET_BSET_SEPARATE_WHITEOUTS(i, false); - } else { - memcpy_u64s(i->start, - unwritten_whiteouts_start(c, b), - b->whiteout_u64s); - i->u64s = cpu_to_le16(b->whiteout_u64s); - SET_BSET_SEPARATE_WHITEOUTS(i, true); - } + sort_iter_add(&sort_iter, + unwritten_whiteouts_start(c, b), + unwritten_whiteouts_end(c, b)); + SET_BSET_SEPARATE_WHITEOUTS(i, false); b->whiteout_u64s = 0; - u64s = btree_node_is_extents(b) - ? sort_extents(vstruct_last(i), &sort_iter, false) - : sort_keys(i->start, &sort_iter, false); + u64s = bch2_sort_keys(i->start, &sort_iter, false); le16_add_cpu(&i->u64s, u64s); - clear_needs_whiteout(i); + set_needs_whiteout(i, false); - if (b->written && !i->u64s) { - /* Nothing to write: */ - btree_bounce_free(c, order, used_mempool, data); - btree_node_write_done(c, b); - return; - } + /* do we have data to write? */ + if (b->written && !i->u64s) + goto nowrite; + + bytes_to_write = vstruct_end(i) - data; + sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; + memset(data + bytes_to_write, 0, + (sectors_to_write << 9) - bytes_to_write); + + BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size); BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); BUG_ON(i->seq != b->data->keys.seq); - i->version = cpu_to_le16(BCACHE_BSET_VERSION); + i->version = c->sb.version < bcachefs_metadata_version_new_versioning + ? 
cpu_to_le16(BCH_BSET_VERSION_OLD) + : cpu_to_le16(c->sb.version); SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c)); - nonce = btree_nonce(b, i, b->written << 9); - - if (bn) { - bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, - &bn->flags, - (void *) &b->data->keys - - (void *) &b->data->flags); - nonce = nonce_add(nonce, - round_up((void *) &b->data->keys - - (void *) &b->data->flags, - CHACHA20_BLOCK_SIZE)); - bset_encrypt(c, i, nonce); + if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))) + validate_before_checksum = true; - nonce = btree_nonce(b, i, b->written << 9); - bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn); - } else { - bset_encrypt(c, i, nonce); + /* validate_bset will be modifying: */ + if (le16_to_cpu(i->version) < bcachefs_metadata_version_current) + validate_before_checksum = true; - bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); - } + /* if we're going to be encrypting, check metadata validity first: */ + if (validate_before_checksum && + validate_bset_for_write(c, b, i, sectors_to_write)) + goto err; - bytes_to_write = vstruct_end(i) - data; - sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; + bset_encrypt(c, i, b->written << 9); - memset(data + bytes_to_write, 0, - (sectors_to_write << 9) - bytes_to_write); + nonce = btree_nonce(i, b->written << 9); - BUG_ON(b->written + sectors_to_write > c->sb.btree_node_size); + if (bn) + bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn); + else + bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); - trace_btree_write(b, bytes_to_write, sectors_to_write); + /* if we're not encrypting, check metadata after checksumming: */ + if (!validate_before_checksum && + validate_bset_for_write(c, b, i, sectors_to_write)) + goto err; /* * We handle btree write errors by immediately halting the journal - @@ -1491,36 +1865,34 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, * reflect that those writes were done and the data flushed from the * journal: * + * Also on journal error, the pending write may have updates that were + * never journalled (interior nodes, see btree_update_nodes_written()) - + * it's critical that we don't do the write in that case otherwise we + * will have updates visible that weren't in the journal: + * * Make sure to update b->written so bch2_btree_init_next() doesn't * break: */ if (bch2_journal_error(&c->journal) || - c->opts.nochanges) { - set_btree_node_noevict(b); - b->written += sectors_to_write; - - btree_bounce_free(c, order, used_mempool, data); - btree_node_write_done(c, b); - return; - } - - bio = bio_alloc_bioset(GFP_NOIO, 1 << order, &c->bio_write); - - wbio = to_wbio(bio); - wbio->cl = parent; - wbio->bounce = true; - wbio->put_bio = true; - wbio->order = order; - wbio->used_mempool = used_mempool; - bio->bi_iter.bi_size = sectors_to_write << 9; - bio->bi_end_io = btree_node_write_endio; - bio->bi_private = b; - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META|WRITE_SYNC|REQ_FUA); + c->opts.nochanges) + goto err; - if (parent) - closure_get(parent); + trace_btree_write(b, bytes_to_write, sectors_to_write); - bch2_bio_map(bio, data); + wbio = container_of(bio_alloc_bioset(GFP_NOIO, + buf_pages(data, sectors_to_write << 9), + &c->btree_bio), + struct btree_write_bio, wbio.bio); + wbio_init(&wbio->wbio.bio); + wbio->data = data; + wbio->bytes = bytes; + wbio->wbio.c = c; + wbio->wbio.used_mempool = used_mempool; + wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META; + wbio->wbio.bio.bi_end_io = btree_node_write_endio; + wbio->wbio.bio.bi_private = b; + 
+ bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); /* * If we're appending to a leaf node, we don't technically need FUA - @@ -1537,19 +1909,25 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, * just make all btree node writes FUA to keep things sane. */ - bkey_copy(&k.key, &b->key); - e = bkey_i_to_s_extent(&k.key); + bkey_copy(&wbio->key, &b->key); - extent_for_each_ptr(e, ptr) + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&wbio->key)), ptr) ptr->offset += b->written; - extent_for_each_ptr(e, ptr) - atomic64_add(sectors_to_write, - &c->devs[ptr->dev]->btree_sectors_written); - b->written += sectors_to_write; - bch2_submit_wbio_replicas(wbio, c, &k.key); + atomic64_inc(&c->btree_writes_nr); + atomic64_add(sectors_to_write, &c->btree_writes_sectors); + + INIT_WORK(&wbio->work, btree_write_submit); + queue_work(c->io_complete_wq, &wbio->work); + return; +err: + set_btree_node_noevict(b); + b->written += sectors_to_write; +nowrite: + btree_bounce_free(c, bytes, used_mempool, data); + btree_node_write_done(c, b); } /* @@ -1565,14 +1943,13 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) return false; BUG_ON(b->whiteout_u64s); - BUG_ON(b->uncompacted_whiteout_u64s); clear_btree_node_just_written(b); /* - * Note: immediately after write, bset_unwritten()/bset_written() don't - * work - the amount of data we had to write after compaction might have - * been smaller than the offset of the last bset. + * Note: immediately after write, bset_written() doesn't work - the + * amount of data we had to write after compaction might have been + * smaller than the offset of the last bset. * * However, we know that all bsets have been written here, as long as * we're still holding the write lock: @@ -1583,14 +1960,14 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) * single bset: */ if (b->nsets > 1) { - btree_node_sort(c, b, NULL, 0, b->nsets, true); + btree_node_sort(c, b, 0, b->nsets, true); invalidated_iter = true; } else { - invalidated_iter = bch2_drop_whiteouts(b); + invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL); } for_each_bset(b, t) - set_needs_whiteout(bset(b, t)); + set_needs_whiteout(bset(b, t), true); bch2_btree_verify(c, b); @@ -1602,7 +1979,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) bne = want_new_bset(c, b); if (bne) - bch2_bset_init_next(b, &bne->keys); + bch2_bset_init_next(c, b, bne); bch2_btree_build_aux_trees(b); @@ -1613,113 +1990,81 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) * Use this one if the node is intent locked: */ void bch2_btree_node_write(struct bch_fs *c, struct btree *b, - struct closure *parent, - enum six_lock_type lock_type_held, - int idx_to_write) + enum six_lock_type lock_type_held) { - BUG_ON(lock_type_held == SIX_LOCK_write); - if (lock_type_held == SIX_LOCK_intent || - six_trylock_convert(&b->lock, SIX_LOCK_read, - SIX_LOCK_intent)) { - __bch2_btree_node_write(c, b, parent, SIX_LOCK_intent, idx_to_write); - - six_lock_write(&b->lock); - bch2_btree_post_write_cleanup(c, b); - six_unlock_write(&b->lock); + (lock_type_held == SIX_LOCK_read && + six_lock_tryupgrade(&b->c.lock))) { + __bch2_btree_node_write(c, b); + + /* don't cycle lock unnecessarily: */ + if (btree_node_just_written(b) && + six_trylock_write(&b->c.lock)) { + bch2_btree_post_write_cleanup(c, b); + six_unlock_write(&b->c.lock); + } if (lock_type_held == SIX_LOCK_read) - six_lock_downgrade(&b->lock); + six_lock_downgrade(&b->c.lock); } else { - 
__bch2_btree_node_write(c, b, parent, SIX_LOCK_read, idx_to_write); + __bch2_btree_node_write(c, b); + if (lock_type_held == SIX_LOCK_write && + btree_node_just_written(b)) + bch2_btree_post_write_cleanup(c, b); } } -static void bch2_btree_node_write_dirty(struct bch_fs *c, struct btree *b, - struct closure *parent) -{ - six_lock_read(&b->lock); - BUG_ON(b->level); - - bch2_btree_node_write(c, b, parent, SIX_LOCK_read, -1); - six_unlock_read(&b->lock); -} - -/* - * Write all dirty btree nodes to disk, including roots - */ -void bch2_btree_flush(struct bch_fs *c) +static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) { - struct closure cl; - struct btree *b; struct bucket_table *tbl; struct rhash_head *pos; - bool dropped_lock; + struct btree *b; unsigned i; - - closure_init_stack(&cl); - - rcu_read_lock(); - - do { - dropped_lock = false; - i = 0; restart: - tbl = rht_dereference_rcu(c->btree_cache_table.tbl, - &c->btree_cache_table); - - for (; i < tbl->size; i++) - rht_for_each_entry_rcu(b, pos, tbl, i, hash) - /* - * XXX - locking for b->level, when called from - * bch2_journal_move() - */ - if (!b->level && btree_node_dirty(b)) { - rcu_read_unlock(); - bch2_btree_node_write_dirty(c, b, &cl); - dropped_lock = true; - rcu_read_lock(); - goto restart; - } - } while (dropped_lock); + rcu_read_lock(); + for_each_cached_btree(b, c, tbl, i, pos) + if (test_bit(flag, &b->flags)) { + rcu_read_unlock(); + wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); + goto restart; + } rcu_read_unlock(); +} - closure_sync(&cl); +void bch2_btree_flush_all_reads(struct bch_fs *c) +{ + __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight); } -/** - * bch_btree_node_flush_journal - flush any journal entries that contain keys - * from this node - * - * The bset's journal sequence number is used for preserving ordering of index - * updates across unclean shutdowns - it's used to ignore bsets newer than the - * most recent journal entry. - * - * But when rewriting btree nodes we compact all the bsets in a btree node - and - * if we compacted a bset that should be ignored with bsets we do need, that - * would be bad. So to avoid that, prior to making the new node visible ensure - * that the journal has been flushed so that all the bsets we compacted should - * be visible. 
- */ -void bch2_btree_node_flush_journal_entries(struct bch_fs *c, - struct btree *b, - struct closure *cl) +void bch2_btree_flush_all_writes(struct bch_fs *c) { - int i = b->nsets; + __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); +} - /* - * Journal sequence numbers in the different bsets will always be in - * ascending order, we only need to flush the highest - except that the - * most recent bset might not have a journal sequence number yet, so we - * need to loop: - */ - while (i--) { - u64 seq = le64_to_cpu(bset(b, &b->set[i])->journal_seq); +void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct bucket_table *tbl; + struct rhash_head *pos; + struct btree *b; + unsigned i; - if (seq) { - bch2_journal_flush_seq_async(&c->journal, seq, cl); - break; - } + rcu_read_lock(); + for_each_cached_btree(b, c, tbl, i, pos) { + unsigned long flags = READ_ONCE(b->flags); + + if (!(flags & (1 << BTREE_NODE_dirty))) + continue; + + pr_buf(out, "%p d %u n %u l %u w %u b %u r %u:%lu\n", + b, + (flags & (1 << BTREE_NODE_dirty)) != 0, + (flags & (1 << BTREE_NODE_need_write)) != 0, + b->c.level, + b->written, + !list_empty_careful(&b->write_blocked), + b->will_make_reachable != 0, + b->will_make_reachable & 1); } + rcu_read_unlock(); }
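
The write path in this diff claims a node for writing by flipping flag bits in a single cmpxchg loop rather than taking the write lock (see the "the dirty bit is our lock" comment in __bch2_btree_node_write). Below is a minimal, self-contained sketch of that claim-by-cmpxchg pattern, not part of the diff itself: it uses C11 atomics in place of the kernel's cmpxchg_acquire(), the flag names are hypothetical stand-ins for BTREE_NODE_dirty / BTREE_NODE_write_in_flight, and the loser simply gives up instead of waiting on the in-flight write as the real code does.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NODE_DIRTY		(1UL << 0)
#define NODE_WRITE_IN_FLIGHT	(1UL << 1)

struct node {
	_Atomic unsigned long flags;
};

/*
 * Try to become the writer: atomically clear NODE_DIRTY and set
 * NODE_WRITE_IN_FLIGHT in one compare-and-swap.  Returns true if this
 * caller won the race.  (The real code re-loops and waits when a write
 * is already in flight; here we bail out to keep the sketch short.)
 */
static bool claim_write(struct node *n)
{
	unsigned long old = atomic_load(&n->flags);
	unsigned long new;

	do {
		if (!(old & NODE_DIRTY))
			return false;	/* nothing to write */
		if (old & NODE_WRITE_IN_FLIGHT)
			return false;	/* another thread owns the write */

		new = (old & ~NODE_DIRTY) | NODE_WRITE_IN_FLIGHT;
	} while (!atomic_compare_exchange_weak(&n->flags, &old, new));

	return true;
}

int main(void)
{
	struct node n = { .flags = NODE_DIRTY };

	printf("first claim:  %d\n", claim_write(&n));	/* 1 - we own the write */
	printf("second claim: %d\n", claim_write(&n));	/* 0 - already in flight */
	return 0;
}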