From 38f22164a9a3f2f8e33af8e0cc3ce4f17ef99cde Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Tue, 13 Jun 2017 17:06:05 -0800
Subject: [PATCH] Update bcachefs sources to 6a25f7a00d bcachefs: fix ioctl code

---
 .bcachefs_revision                            |    2 +-
 Makefile                                      |    3 +-
 cmd_migrate.c                                 |    1 +
 include/linux/bitops.h                        |    8 +
 include/linux/wait.h                          |    2 +
 include/trace/events/bcachefs.h               |    2 +-
 libbcachefs/alloc.c                           |   35 +-
 libbcachefs/bcachefs.h                        |    6 +-
 libbcachefs/bkey_methods.c                    |   15 +-
 libbcachefs/btree_cache.c                     |   69 +-
 libbcachefs/btree_cache.h                     |   13 +-
 libbcachefs/btree_gc.c                        |  103 +-
 libbcachefs/btree_io.c                        |  210 +-
 libbcachefs/btree_io.h                        |    6 +-
 libbcachefs/btree_iter.c                      |   30 +-
 libbcachefs/btree_locking.h                   |    1 +
 libbcachefs/btree_types.h                     |   13 +-
 libbcachefs/btree_update.h                    |  323 +--
 ...btree_update.c => btree_update_interior.c} | 2004 ++++++-----------
 libbcachefs/btree_update_interior.h           |  312 +++
 libbcachefs/btree_update_leaf.c               |  660 ++++++
 libbcachefs/buckets.c                         |  212 +-
 libbcachefs/buckets.h                         |   47 +-
 libbcachefs/buckets_types.h                   |   24 +-
 libbcachefs/chardev.c                         |    4 +-
 libbcachefs/debug.c                           |    2 +-
 libbcachefs/error.c                           |    6 +-
 libbcachefs/error.h                           |   49 +-
 libbcachefs/extents.c                         |  115 +-
 libbcachefs/extents.h                         |   38 +-
 libbcachefs/fs-io.c                           |   49 +-
 libbcachefs/inode.c                           |    7 +-
 libbcachefs/io.c                              |  809 +++----
 libbcachefs/io.h                              |   79 +-
 libbcachefs/io_types.h                        |   43 +-
 libbcachefs/journal.c                         |  186 +-
 libbcachefs/journal_types.h                   |    2 +
 libbcachefs/keylist.c                         |   11 +
 libbcachefs/keylist.h                         |    6 +
 libbcachefs/migrate.c                         |    2 +-
 libbcachefs/move.c                            |  103 +-
 libbcachefs/move.h                            |    2 +-
 libbcachefs/opts.c                            |    7 +-
 libbcachefs/super-io.c                        |   95 +-
 libbcachefs/super-io.h                        |    4 +-
 libbcachefs/super.c                           |   53 +-
 libbcachefs/super.h                           |   12 +
 libbcachefs/super_types.h                     |    4 +
 libbcachefs/sysfs.c                           |   50 +-
 49 files changed, 2980 insertions(+), 2859 deletions(-)
 rename libbcachefs/{btree_update.c => btree_update_interior.c} (55%)
 create mode 100644 libbcachefs/btree_update_interior.h
 create mode 100644 libbcachefs/btree_update_leaf.c

diff --git a/.bcachefs_revision b/.bcachefs_revision
index d2d0c51..7d1a4e6 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-14e9ac5016803fc63c1216608c866bef16b4053e
+6a25f7a00d08c45b35bed3d649c05286ec60f7f6
diff --git a/Makefile b/Makefile
index 4d406cc..227ce20 100644
--- a/Makefile
+++ b/Makefile
@@ -69,7 +69,8 @@ SRCS=bcachefs.c \
 	libbcachefs/btree_gc.c \
 	libbcachefs/btree_io.c \
 	libbcachefs/btree_iter.c \
-	libbcachefs/btree_update.c \
+	libbcachefs/btree_update_interior.c\
+	libbcachefs/btree_update_leaf.c \
 	libbcachefs/buckets.c \
 	libbcachefs/checksum.c \
 	libbcachefs/clock.c \
diff --git a/cmd_migrate.c b/cmd_migrate.c
index 82fa0f1..2e31b9e 100644
--- a/cmd_migrate.c
+++ b/cmd_migrate.c
@@ -24,6 +24,7 @@
 #include
 #include
 #include
+#include "bcachefs.h"
 #include "btree_update.h"
 #include "buckets.h"
 #include "dirent.h"
diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 239574c..dc2927b 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -38,6 +38,14 @@ static inline void set_bit(long nr, volatile unsigned long *addr)
 	__atomic_or_fetch(p, mask, __ATOMIC_RELAXED);
 }
 
+static inline void __clear_bit(int nr, volatile unsigned long *addr)
+{
+	unsigned long mask = BIT_MASK(nr);
+	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+
+	*p &= ~mask;
+}
+
 static inline void clear_bit(long nr, volatile unsigned long *addr)
 {
 	unsigned long mask = BIT_MASK(nr);
diff --git a/include/linux/wait.h b/include/linux/wait.h
index f6f5757..62d15e5 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -90,6 +90,8 @@ do { \
 	__wait_event(wq, condition); \
 } while (0)
 
+#define wait_event_killable(wq, condition) ({wait_event(wq, condition); 0; })
+
 #define __wait_event_timeout(wq, condition, timeout) \
 	___wait_event(wq, ___wait_cond_timeout(condition), \
 		      TASK_UNINTERRUPTIBLE, 0, timeout, \
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index 06cb5ff..e5052b8 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -87,7 +87,7 @@ DECLARE_EVENT_CLASS(bio,
 	),
 
 	TP_fast_assign(
-		__entry->dev = bio->bi_bdev->bd_dev;
+		__entry->dev = bio->bi_bdev ? bio->bi_bdev->bd_dev : 0;
 		__entry->sector = bio->bi_iter.bi_sector;
 		__entry->nr_sector = bio->bi_iter.bi_size >> 9;
 		blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size);
diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c
index 36dc947..953c6b3 100644
--- a/libbcachefs/alloc.c
+++ b/libbcachefs/alloc.c
@@ -146,17 +146,17 @@ static void pd_controllers_update(struct work_struct *work)
 
 			u64 size = (ca->mi.nbuckets -
 				    ca->mi.first_bucket) << bucket_bits;
-			u64 dirty = stats.buckets_dirty << bucket_bits;
+			u64 dirty = stats.buckets[S_DIRTY] << bucket_bits;
 			u64 free = __dev_buckets_free(ca, stats) << bucket_bits;
 			/*
 			 * Bytes of internal fragmentation, which can be
 			 * reclaimed by copy GC
 			 */
-			s64 fragmented = ((stats.buckets_dirty +
+			s64 fragmented = ((stats.buckets[S_DIRTY] +
 					   stats.buckets_cached) <<
 					  bucket_bits) -
 				((stats.sectors[S_DIRTY] +
-				  stats.sectors[S_CACHED] ) << 9);
+				  stats.sectors_cached) << 9);
 
 			fragmented = max(0LL, fragmented);
 
@@ -912,7 +912,7 @@ static int bch2_allocator_thread(void *arg)
 			bucket = fifo_peek(&ca->free_inc);
 			discard_invalidated_bucket(ca, bucket);
 			if (kthread_should_stop())
-				goto out;
+				return 0;
 			--ca->nr_invalidated;
 		}
 
@@ -922,7 +922,7 @@ static int bch2_allocator_thread(void *arg)
 		journal_seq = 0;
 		ret = bch2_invalidate_free_inc(c, ca, &journal_seq);
 		if (ret < 0)
-			goto out;
+			return 0;
 
 		ca->nr_invalidated = ret;
 
@@ -944,7 +944,7 @@ static int bch2_allocator_thread(void *arg)
 		down_read(&c->gc_lock);
 		if (test_bit(BCH_FS_GC_FAILURE, &c->flags)) {
 			up_read(&c->gc_lock);
-			goto out;
+			return 0;
 		}
 
 		while (1) {
@@ -973,7 +973,7 @@ static int bch2_allocator_thread(void *arg)
 
 			if (wait_buckets_available(c, ca)) {
 				up_read(&c->gc_lock);
-				goto out;
+				return 0;
 			}
 		}
 		up_read(&c->gc_lock);
@@ -992,13 +992,6 @@ static int bch2_allocator_thread(void *arg)
 		 * write out the new bucket gens:
 		 */
 	}
-out:
-	/*
-	 * Avoid a race with bch2_usage_update() trying to wake us up after
-	 * we've exited:
-	 */
-	synchronize_rcu();
-
 	return 0;
 }
 /* Allocation */
@@ -1892,18 +1885,20 @@ void bch2_dev_allocator_stop(struct bch_dev *ca)
 	struct task_struct *p = ca->alloc_thread;
 
 	ca->alloc_thread = NULL;
-	smp_wmb();
 
 	/*
 	 * We need an rcu barrier between setting ca->alloc_thread = NULL and
-	 * the thread shutting down to avoid a race with bch2_usage_update() -
-	 * the allocator thread itself does a synchronize_rcu() on exit.
+ * the thread shutting down to avoid bch2_wake_allocator() racing: * * XXX: it would be better to have the rcu barrier be asynchronous * instead of blocking us here */ - if (p) + synchronize_rcu(); + + if (p) { kthread_stop(p); + put_task_struct(p); + } } /* start allocator thread: */ @@ -1917,11 +1912,13 @@ int bch2_dev_allocator_start(struct bch_dev *ca) if (ca->alloc_thread) return 0; - p = kthread_run(bch2_allocator_thread, ca, "bcache_allocator"); + p = kthread_create(bch2_allocator_thread, ca, "bcache_allocator"); if (IS_ERR(p)) return PTR_ERR(p); + get_task_struct(p); ca->alloc_thread = p; + wake_up_process(p); return 0; } diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 4d0fc62..96956e1 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -282,7 +282,6 @@ do { \ #include "alloc_types.h" #include "buckets_types.h" #include "clock_types.h" -#include "io_types.h" #include "journal_types.h" #include "keylist_types.h" #include "move_types.h" @@ -365,6 +364,7 @@ struct bch_dev { char name[BDEVNAME_SIZE]; struct bcache_superblock disk_sb; + int sb_write_error; struct dev_group self; @@ -721,10 +721,6 @@ struct bch_fs { atomic64_t key_version; - struct bio_list read_retry_list; - struct work_struct read_retry_work; - spinlock_t read_retry_lock; - struct bio_list btree_write_error_list; struct work_struct btree_write_error_work; spinlock_t btree_write_error_lock; diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index dbec8b3..2389415 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -27,9 +27,18 @@ const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type, if (k.k->u64s < BKEY_U64s) return "u64s too small"; - if (k.k->size && - (bkey_deleted(k.k) || !ops->is_extents)) - return "nonzero size field"; + if (!ops->is_extents) { + if (k.k->size) + return "nonzero size field"; + } else { + if ((k.k->size == 0) != bkey_deleted(k.k)) + return "bad size field"; + } + + if (ops->is_extents && + !k.k->size && + !bkey_deleted(k.k)) + return "zero size field"; switch (k.k->type) { case KEY_TYPE_DELETED: diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index d619f37..03c77b4 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -539,12 +539,12 @@ err: } /* Slowpath, don't want it inlined into btree_iter_traverse() */ -static noinline struct btree *bch2_btree_node_fill(struct btree_iter *iter, +static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, + struct btree_iter *iter, const struct bkey_i *k, unsigned level, enum six_lock_type lock_type) { - struct bch_fs *c = iter->c; struct btree *b; /* @@ -603,7 +603,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_iter *iter, * The btree node will have either a read or a write lock held, depending on * the @write parameter. 
*/ -struct btree *bch2_btree_node_get(struct btree_iter *iter, +struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, const struct bkey_i *k, unsigned level, enum six_lock_type lock_type) { @@ -613,7 +613,7 @@ struct btree *bch2_btree_node_get(struct btree_iter *iter, BUG_ON(level >= BTREE_MAX_DEPTH); retry: rcu_read_lock(); - b = mca_find(iter->c, k); + b = mca_find(c, k); rcu_read_unlock(); if (unlikely(!b)) { @@ -622,7 +622,7 @@ retry: * else we could read in a btree node from disk that's been * freed: */ - b = bch2_btree_node_fill(iter, k, level, lock_type); + b = bch2_btree_node_fill(c, iter, k, level, lock_type); /* We raced and found the btree node in the cache */ if (!b) @@ -706,10 +706,61 @@ retry: return b; } -void bch2_btree_node_prefetch(struct btree_iter *iter, - const struct bkey_i *k, unsigned level) +struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, + struct btree_iter *iter, + struct btree *b, + enum btree_node_sibling sib) +{ + struct btree *parent; + struct btree_node_iter node_iter; + struct bkey_packed *k; + BKEY_PADDED(k) tmp; + struct btree *ret; + unsigned level = b->level; + + parent = iter->nodes[level + 1]; + if (!parent) + return NULL; + + if (!bch2_btree_node_relock(iter, level + 1)) { + bch2_btree_iter_set_locks_want(iter, level + 2); + return ERR_PTR(-EINTR); + } + + node_iter = iter->node_iters[parent->level]; + + k = bch2_btree_node_iter_peek_all(&node_iter, parent); + BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p)); + + do { + k = sib == btree_prev_sib + ? bch2_btree_node_iter_prev_all(&node_iter, parent) + : (bch2_btree_node_iter_advance(&node_iter, parent), + bch2_btree_node_iter_peek_all(&node_iter, parent)); + if (!k) + return NULL; + } while (bkey_deleted(k)); + + bch2_bkey_unpack(parent, &tmp.k, k); + + ret = bch2_btree_node_get(c, iter, &tmp.k, level, SIX_LOCK_intent); + + if (IS_ERR(ret) && PTR_ERR(ret) == -EINTR) { + btree_node_unlock(iter, level); + ret = bch2_btree_node_get(c, iter, &tmp.k, level, SIX_LOCK_intent); + } + + if (!IS_ERR(ret) && !bch2_btree_node_relock(iter, level)) { + six_unlock_intent(&ret->lock); + ret = ERR_PTR(-EINTR); + } + + return ret; +} + +void bch2_btree_node_prefetch(struct bch_fs *c, const struct bkey_i *k, + unsigned level, enum btree_id btree_id) { - struct bch_fs *c = iter->c; struct btree *b; BUG_ON(level >= BTREE_MAX_DEPTH); @@ -726,7 +777,7 @@ void bch2_btree_node_prefetch(struct btree_iter *iter, return; bkey_copy(&b->key, k); - if (bch2_btree_node_hash_insert(c, b, level, iter->btree_id)) { + if (bch2_btree_node_hash_insert(c, b, level, btree_id)) { /* raced with another fill: */ /* mark as unhashed... 
*/ diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index ea53d2b..3155604 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -21,11 +21,16 @@ int bch2_btree_node_cannibalize_lock(struct bch_fs *, struct closure *); struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); -struct btree *bch2_btree_node_get(struct btree_iter *, const struct bkey_i *, - unsigned, enum six_lock_type); +struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, + const struct bkey_i *, unsigned, + enum six_lock_type); -void bch2_btree_node_prefetch(struct btree_iter *, const struct bkey_i *, - unsigned); +struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, + struct btree *, + enum btree_node_sibling); + +void bch2_btree_node_prefetch(struct bch_fs *, const struct bkey_i *, + unsigned, enum btree_id); void bch2_fs_btree_exit(struct bch_fs *); int bch2_fs_btree_init(struct bch_fs *); diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 376edaf..212bb5f 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -7,7 +7,7 @@ #include "alloc.h" #include "bkey_methods.h" #include "btree_locking.h" -#include "btree_update.h" +#include "btree_update_interior.h" #include "btree_io.h" #include "btree_gc.h" #include "buckets.h" @@ -112,14 +112,14 @@ u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k) * For runtime mark and sweep: */ static u8 bch2_btree_mark_key(struct bch_fs *c, enum bkey_type type, - struct bkey_s_c k) + struct bkey_s_c k, unsigned flags) { switch (type) { case BKEY_TYPE_BTREE: - bch2_gc_mark_key(c, k, c->sb.btree_node_size, true); + bch2_gc_mark_key(c, k, c->sb.btree_node_size, true, flags); return 0; case BKEY_TYPE_EXTENTS: - bch2_gc_mark_key(c, k, k.k->size, false); + bch2_gc_mark_key(c, k, k.k->size, false, flags); return bch2_btree_key_recalc_oldest_gen(c, k); default: BUG(); @@ -151,13 +151,10 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, extent_for_each_ptr(e, ptr) { struct bch_dev *ca = c->devs[ptr->dev]; struct bucket *g = PTR_BUCKET(ca, ptr); - struct bucket_mark new; if (!g->mark.gen_valid) { - bucket_cmpxchg(g, new, ({ - new.gen = ptr->gen; - new.gen_valid = 1; - })); + g->_mark.gen = ptr->gen; + g->_mark.gen_valid = 1; ca->need_alloc_write = true; } @@ -166,10 +163,8 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, type == BKEY_TYPE_BTREE ? 
"btree" : "data", ptr->gen, g->mark.gen)) { - bucket_cmpxchg(g, new, ({ - new.gen = ptr->gen; - new.gen_valid = 1; - })); + g->_mark.gen = ptr->gen; + g->_mark.gen_valid = 1; ca->need_alloc_write = true; set_bit(BCH_FS_FIXED_GENS, &c->flags); } @@ -184,13 +179,14 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, max_t(u64, k.k->version.lo, atomic64_read(&c->key_version))); - bch2_btree_mark_key(c, type, k); + bch2_btree_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC); fsck_err: return ret; } static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b) { + enum bkey_type type = btree_node_type(b); struct btree_node_iter iter; struct bkey unpacked; struct bkey_s_c k; @@ -201,8 +197,7 @@ static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b) btree_node_is_extents(b), &unpacked) { bch2_bkey_debugcheck(c, b, k); - stale = max(stale, bch2_btree_mark_key(c, - btree_node_type(b), k)); + stale = max(stale, bch2_btree_mark_key(c, type, k, 0)); } return stale; @@ -269,7 +264,7 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id) mutex_lock(&c->btree_root_lock); b = c->btree_roots[btree_id].b; - bch2_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key)); + bch2_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0); gc_pos_set(c, gc_pos_btree_root(b->btree_id)); mutex_unlock(&c->btree_root_lock); @@ -379,7 +374,7 @@ static void bch2_mark_metadata(struct bch_fs *c) static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) { struct bch_fs_usage stats = { 0 }; - struct btree_interior_update *as; + struct btree_update *as; struct pending_btree_node_free *d; mutex_lock(&c->btree_interior_update_lock); @@ -387,9 +382,10 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) for_each_pending_btree_node_free(c, as, d) if (d->index_update_done) - __bch2_gc_mark_key(c, bkey_i_to_s_c(&d->key), - c->sb.btree_node_size, true, - &stats); + __bch2_mark_key(c, bkey_i_to_s_c(&d->key), + c->sb.btree_node_size, true, + &stats, 0, + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); /* * Don't apply stats - pending deletes aren't tracked in * bch_alloc_stats: @@ -430,7 +426,6 @@ void bch2_gc_start(struct bch_fs *c) per_cpu_ptr(c->usage_percpu, cpu); memset(p->s, 0, sizeof(p->s)); - p->persistent_reserved = 0; } lg_global_unlock(&c->usage_lock); @@ -551,16 +546,14 @@ static void recalc_packed_keys(struct btree *b) btree_keys_account_key_add(&b->nr, 0, k); } -static void bch2_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES], - struct btree_iter *iter) +static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, + struct btree *old_nodes[GC_MERGE_NODES]) { struct btree *parent = iter->nodes[old_nodes[0]->level + 1]; - struct bch_fs *c = iter->c; unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0; unsigned blocks = btree_blocks(c) * 2 / 3; struct btree *new_nodes[GC_MERGE_NODES]; - struct btree_interior_update *as; - struct btree_reserve *res; + struct btree_update *as; struct keylist keylist; struct bkey_format_state format_state; struct bkey_format new_format; @@ -580,23 +573,6 @@ static void bch2_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES], DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks) return; - res = bch2_btree_reserve_get(c, parent, nr_old_nodes, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE, - NULL); - if (IS_ERR(res)) { - trace_btree_gc_coalesce_fail(c, - BTREE_GC_COALESCE_FAIL_RESERVE_GET); - return; - } - - if (bch2_keylist_realloc(&keylist, NULL, 0, - (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * 
nr_old_nodes)) { - trace_btree_gc_coalesce_fail(c, - BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC); - goto out; - } - /* Find a format that all keys in @old_nodes can pack into */ bch2_bkey_format_init(&format_state); @@ -610,21 +586,38 @@ static void bch2_coalesce_nodes(struct btree *old_nodes[GC_MERGE_NODES], if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) { trace_btree_gc_coalesce_fail(c, BTREE_GC_COALESCE_FAIL_FORMAT_FITS); - goto out; + return; } - trace_btree_gc_coalesce(c, parent, nr_old_nodes); + if (bch2_keylist_realloc(&keylist, NULL, 0, + (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) { + trace_btree_gc_coalesce_fail(c, + BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC); + return; + } + + as = bch2_btree_update_start(c, iter->btree_id, + btree_update_reserve_required(c, parent) + nr_old_nodes, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE, + NULL); + if (IS_ERR(as)) { + trace_btree_gc_coalesce_fail(c, + BTREE_GC_COALESCE_FAIL_RESERVE_GET); + bch2_keylist_free(&keylist, NULL); + return; + } - as = bch2_btree_interior_update_alloc(c); + trace_btree_gc_coalesce(c, parent, nr_old_nodes); for (i = 0; i < nr_old_nodes; i++) - bch2_btree_interior_update_will_free_node(c, as, old_nodes[i]); + bch2_btree_interior_update_will_free_node(as, old_nodes[i]); /* Repack everything with @new_format and sort down to one bset */ for (i = 0; i < nr_old_nodes; i++) new_nodes[i] = - __bch2_btree_node_alloc_replacement(c, old_nodes[i], - new_format, as, res); + __bch2_btree_node_alloc_replacement(as, old_nodes[i], + new_format); /* * Conceptually we concatenate the nodes together and slice them @@ -738,7 +731,7 @@ next: bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key); /* Insert the newly coalesced nodes */ - bch2_btree_insert_node(parent, iter, &keylist, res, as); + bch2_btree_insert_node(as, parent, iter, &keylist); BUG_ON(!bch2_keylist_empty(&keylist)); @@ -751,7 +744,7 @@ next: /* Free the old nodes and update our sliding window */ for (i = 0; i < nr_old_nodes; i++) { - bch2_btree_node_free_inmem(iter, old_nodes[i]); + bch2_btree_node_free_inmem(c, old_nodes[i], iter); six_unlock_intent(&old_nodes[i]->lock); /* @@ -768,9 +761,9 @@ next: six_unlock_intent(&new_nodes[i]->lock); } } -out: + + bch2_btree_update_done(as); bch2_keylist_free(&keylist, NULL); - bch2_btree_reserve_put(c, res); } static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) @@ -814,7 +807,7 @@ static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) } memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0])); - bch2_coalesce_nodes(merge, &iter); + bch2_coalesce_nodes(c, &iter, merge); for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) { lock_seq[i] = merge[i]->lock.state.seq; diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index eeb546e..89724f3 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -2,10 +2,11 @@ #include "bcachefs.h" #include "bkey_methods.h" #include "btree_cache.h" -#include "btree_update.h" #include "btree_io.h" #include "btree_iter.h" #include "btree_locking.h" +#include "btree_update.h" +#include "btree_update_interior.h" #include "buckets.h" #include "checksum.h" #include "debug.h" @@ -872,37 +873,37 @@ static void bset_encrypt(struct bch_fs *c, struct bset *i, struct nonce nonce) vstruct_end(i) - (void *) i->_data); } -#define btree_node_error(c, b, ptr, msg, ...) \ +#define btree_node_error(c, b, msg, ...) 
\ do { \ if (write == READ && \ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ mustfix_fsck_err(c, \ - "btree node read error at btree %u level %u/%u\n"\ - "sector %llu node offset %u bset u64s %u: " msg,\ + "btree node read error at btree %u level %u/%u\n"\ + "pos %llu:%llu node offset %u bset u64s %u: " msg,\ (b)->btree_id, (b)->level, \ (c)->btree_roots[(b)->btree_id].level, \ - (u64) ptr->offset, (b)->written, \ - le16_to_cpu((i)->u64s), ##__VA_ARGS__); \ + (b)->key.k.p.inode, (b)->key.k.p.offset, \ + (b)->written, le16_to_cpu((i)->u64s), \ + ##__VA_ARGS__); \ } else { \ bch_err(c, "%s at btree %u level %u/%u\n" \ - "sector %llu node offset %u bset u64s %u: " msg,\ + "pos %llu:%llu node offset %u bset u64s %u: " msg,\ write == WRITE \ ? "corrupt metadata in btree node write" \ : "btree node error", \ (b)->btree_id, (b)->level, \ (c)->btree_roots[(b)->btree_id].level, \ - (u64) ptr->offset, (b)->written, \ - le16_to_cpu((i)->u64s), ##__VA_ARGS__); \ + (b)->key.k.p.inode, (b)->key.k.p.offset, \ + (b)->written, le16_to_cpu((i)->u64s), \ + ##__VA_ARGS__); \ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ goto fsck_err; \ } \ } while (0) static int validate_bset(struct bch_fs *c, struct btree *b, - const struct bch_extent_ptr *ptr, struct bset *i, unsigned sectors, - unsigned *whiteout_u64s, - int write) + unsigned *whiteout_u64s, int write) { struct bkey_packed *k, *prev = NULL; struct bpos prev_pos = POS_MIN; @@ -910,19 +911,19 @@ static int validate_bset(struct bch_fs *c, struct btree *b, int ret = 0; if (le16_to_cpu(i->version) != BCACHE_BSET_VERSION) { - btree_node_error(c, b, ptr, "unsupported bset version"); + btree_node_error(c, b, "unsupported bset version"); i->u64s = 0; return 0; } if (b->written + sectors > c->sb.btree_node_size) { - btree_node_error(c, b, ptr, "bset past end of btree node"); + btree_node_error(c, b, "bset past end of btree node"); i->u64s = 0; return 0; } if (b->written && !i->u64s) - btree_node_error(c, b, ptr, "empty set"); + btree_node_error(c, b, "empty set"); if (!BSET_SEPARATE_WHITEOUTS(i)) { seen_non_whiteout = true; @@ -936,7 +937,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, const char *invalid; if (!k->u64s) { - btree_node_error(c, b, ptr, + btree_node_error(c, b, "KEY_U64s 0: %zu bytes of metadata lost", vstruct_end(i) - (void *) k); @@ -945,7 +946,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, } if (bkey_next(k) > vstruct_last(i)) { - btree_node_error(c, b, ptr, + btree_node_error(c, b, "key extends past end of bset"); i->u64s = cpu_to_le16((u64 *) k - i->_data); @@ -953,7 +954,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, } if (k->format > KEY_FORMAT_CURRENT) { - btree_node_error(c, b, ptr, + btree_node_error(c, b, "invalid bkey format %u", k->format); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); @@ -973,7 +974,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, bch2_bkey_val_to_text(c, btree_node_type(b), buf, sizeof(buf), u); - btree_node_error(c, b, ptr, + btree_node_error(c, b, "invalid bkey %s: %s", buf, invalid); i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); @@ -994,7 +995,7 @@ static int validate_bset(struct bch_fs *c, struct btree *b, *whiteout_u64s = k->_data - i->_data; seen_non_whiteout = true; } else if (bkey_cmp(prev_pos, bkey_start_pos(u.k)) > 0) { - btree_node_error(c, b, ptr, + btree_node_error(c, b, "keys out of order: %llu:%llu > %llu:%llu", prev_pos.inode, prev_pos.offset, @@ -1013,32 +1014,7 @@ fsck_err: return ret; } -static bool 
extent_contains_ptr(struct bkey_s_c_extent e, - struct bch_extent_ptr match) -{ - const struct bch_extent_ptr *ptr; - - extent_for_each_ptr(e, ptr) - if (!memcmp(ptr, &match, sizeof(*ptr))) - return true; - - return false; -} - -static void bch2_btree_node_read_complete(struct btree_read_bio *rb, - struct btree *b) -{ - struct bch_dev *ca = rb->pick.ca; - - bio_put(&rb->bio); - percpu_ref_put(&ca->io_ref); - clear_btree_node_read_in_flight(b); - wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); -} - -void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, - struct bch_dev *ca, - const struct bch_extent_ptr *ptr) +int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b) { struct btree_node_entry *bne; struct bset *i = &b->data->keys; @@ -1049,7 +1025,7 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, const char *err; struct bch_csum csum; struct nonce nonce; - int ret, write = READ; + int ret, should_retry = 0, write = READ; iter = mempool_alloc(&c->fill_iter, GFP_NOIO); __bch2_btree_node_iter_init(iter, btree_node_is_extents(b)); @@ -1066,24 +1042,22 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, err = "bad magic"; if (le64_to_cpu(b->data->magic) != bset_magic(c)) - goto err; + goto retry_err; err = "bad btree header"; if (!b->data->keys.seq) - goto err; + goto retry_err; err = "unknown checksum type"; if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) - goto err; - - /* XXX: retry checksum errors */ + goto retry_err; nonce = btree_nonce(b, i, b->written << 9); csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); err = "bad checksum"; if (bch2_crc_cmp(csum, b->data->csum)) - goto err; + goto retry_err; bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &b->data->flags, @@ -1116,12 +1090,19 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, err = "incorrect max key"; if (bkey_cmp(b->data->max_key, b->key.k.p)) goto err; - +#if 0 + /* + * not correct anymore, due to btree node write error + * handling + * + * need to add b->data->seq to btree keys and verify + * against that + */ err = "incorrect backpointer"; if (!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key), b->data->ptr)) goto err; - +#endif err = bch2_bkey_format_validate(&b->data->format); if (err) goto err; @@ -1138,22 +1119,21 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, err = "unknown checksum type"; if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) - goto err; + goto retry_err; nonce = btree_nonce(b, i, b->written << 9); csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); err = "bad checksum"; - if (memcmp(&csum, &bne->csum, sizeof(csum))) - goto err; + if (bch2_crc_cmp(csum, bne->csum)) + goto retry_err; bset_encrypt(c, i, nonce); sectors = vstruct_sectors(bne, c->block_bits); } - ret = validate_bset(c, b, ptr, i, sectors, - &whiteout_u64s, READ); + ret = validate_bset(c, b, i, sectors, &whiteout_u64s, READ); if (ret) goto fsck_err; @@ -1208,40 +1188,79 @@ void bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, btree_node_reset_sib_u64s(b); out: mempool_free(iter, &c->fill_iter); - return; + return should_retry; err: - btree_node_error(c, b, ptr, "%s", err); + btree_node_error(c, b, "%s", err); fsck_err: bch2_inconsistent_error(c); set_btree_node_read_error(b); goto out; +retry_err: + should_retry = -1; + goto out; } static void btree_node_read_work(struct work_struct *work) { struct btree_read_bio *rb = container_of(work, struct btree_read_bio, work); + struct bch_fs *c = rb->c; + struct bch_dev *ca = 
rb->pick.ca; + struct btree *b = rb->bio.bi_private; + struct bio *bio = &rb->bio; + struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); + const struct bch_extent_ptr *ptr; + struct bch_devs_mask avoid; + + bch2_dev_io_err_on(bio->bi_error, rb->pick.ca, "btree read"); + percpu_ref_put(&rb->pick.ca->io_ref); + + if (!bio->bi_error && + !bch2_btree_node_read_done(c, b)) + goto out; - bch2_btree_node_read_done(rb->c, rb->bio.bi_private, - rb->pick.ca, &rb->pick.ptr); - bch2_btree_node_read_complete(rb, rb->bio.bi_private); + goto err; +out: + bch2_time_stats_update(&c->btree_read_time, rb->start_time); + bio_put(&rb->bio); + clear_btree_node_read_in_flight(b); + wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); + return; +err: + memset(&avoid, 0, sizeof(avoid)); + __set_bit(ca->dev_idx, avoid.d); + + extent_for_each_ptr(e, ptr) { + memset(&rb->pick, 0, sizeof(rb->pick)); + bch2_get_read_device(c, e.k, ptr, NULL, &avoid, &rb->pick); + + if (!rb->pick.ca) + continue; + + bio_reset(bio); + bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; + bio->bi_bdev = rb->pick.ca->disk_sb.bdev; + bio->bi_iter.bi_sector = rb->pick.ptr.offset; + bio->bi_iter.bi_size = btree_bytes(c); + submit_bio_wait(bio); + + bch2_dev_io_err_on(bio->bi_error, rb->pick.ca, "btree read"); + percpu_ref_put(&rb->pick.ca->io_ref); + + if (!bio->bi_error && + !bch2_btree_node_read_done(c, b)) + goto out; + } + + set_btree_node_read_error(b); + goto out; } static void btree_node_read_endio(struct bio *bio) { - struct btree *b = bio->bi_private; struct btree_read_bio *rb = container_of(bio, struct btree_read_bio, bio); - if (bch2_dev_fatal_io_err_on(bio->bi_error, - rb->pick.ca, "IO error reading bucket %zu", - PTR_BUCKET_NR(rb->pick.ca, &rb->pick.ptr)) || - bch2_meta_read_fault("btree")) { - set_btree_node_read_error(b); - bch2_btree_node_read_complete(rb, rb->bio.bi_private); - return; - } - INIT_WORK(&rb->work, btree_node_read_work); schedule_work(&rb->work); } @@ -1249,7 +1268,6 @@ static void btree_node_read_endio(struct bio *bio) void bch2_btree_node_read(struct bch_fs *c, struct btree *b, bool sync) { - uint64_t start_time = local_clock(); struct extent_pick_ptr pick; struct btree_read_bio *rb; struct bio *bio; @@ -1266,6 +1284,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, bio = bio_alloc_bioset(GFP_NOIO, btree_pages(c), &c->btree_read_bio); rb = container_of(bio, struct btree_read_bio, bio); rb->c = c; + rb->start_time = local_clock(); rb->pick = pick; bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; bio->bi_bdev = pick.ca->disk_sb.bdev; @@ -1277,19 +1296,8 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b, if (sync) { submit_bio_wait(bio); - - if (bch2_dev_fatal_io_err_on(bio->bi_error, - pick.ca, "IO error reading bucket %zu", - PTR_BUCKET_NR(pick.ca, &pick.ptr)) || - bch2_meta_read_fault("btree")) { - set_btree_node_read_error(b); - goto out; - } - - bch2_btree_node_read_done(c, b, pick.ca, &pick.ptr); - bch2_time_stats_update(&c->btree_read_time, start_time); -out: - bch2_btree_node_read_complete(rb, b); + bio->bi_private = b; + btree_node_read_work(&rb->work); } else { bio->bi_end_io = btree_node_read_endio; bio->bi_private = b; @@ -1327,7 +1335,7 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, return -EIO; } - bch2_btree_set_root_initial(c, b, NULL); + bch2_btree_set_root_for_read(c, b); six_unlock_intent(&b->lock); return 0; @@ -1356,7 +1364,15 @@ static void bch2_btree_node_write_error(struct bch_fs *c, __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; struct 
bkey_i_extent *new_key; + six_lock_read(&b->lock); bkey_copy(&tmp.k, &b->key); + six_unlock_read(&b->lock); + + if (!bkey_extent_is_data(&tmp.k.k) || !PTR_HASH(&tmp.k)) { + /* Node has been freed: */ + goto out; + } + new_key = bkey_i_to_extent(&tmp.k); while (wbio->replicas_failed) { @@ -1371,7 +1387,7 @@ static void bch2_btree_node_write_error(struct bch_fs *c, set_btree_node_noevict(b); bch2_fatal_error(c); } - +out: bio_put(&wbio->bio); btree_node_write_done(c, b); if (cl) @@ -1385,9 +1401,9 @@ void bch2_btree_write_error_work(struct work_struct *work) struct bio *bio; while (1) { - spin_lock_irq(&c->read_retry_lock); - bio = bio_list_pop(&c->read_retry_list); - spin_unlock_irq(&c->read_retry_lock); + spin_lock_irq(&c->btree_write_error_lock); + bio = bio_list_pop(&c->btree_write_error_list); + spin_unlock_irq(&c->btree_write_error_lock); if (!bio) break; @@ -1406,7 +1422,7 @@ static void btree_node_write_endio(struct bio *bio) struct bch_fs *c = wbio->c; struct bch_dev *ca = wbio->ca; - if (bch2_dev_nonfatal_io_err_on(bio->bi_error, ca, "btree write") || + if (bch2_dev_io_err_on(bio->bi_error, ca, "btree write") || bch2_meta_write_fault("btree")) set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed); @@ -1428,7 +1444,7 @@ static void btree_node_write_endio(struct bio *bio) unsigned long flags; spin_lock_irqsave(&c->btree_write_error_lock, flags); - bio_list_add(&c->read_retry_list, &wbio->bio); + bio_list_add(&c->btree_write_error_list, &wbio->bio); spin_unlock_irqrestore(&c->btree_write_error_lock, flags); queue_work(c->wq, &c->btree_write_error_work); return; @@ -1450,7 +1466,7 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, extent_for_each_ptr(bkey_i_to_s_c_extent(&b->key), ptr) break; - ret = validate_bset(c, b, ptr, i, sectors, &whiteout_u64s, WRITE); + ret = validate_bset(c, b, i, sectors, &whiteout_u64s, WRITE); if (ret) bch2_inconsistent_error(c); diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index 91263ee..877ada6 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -10,6 +10,7 @@ struct btree_iter; struct btree_read_bio { struct bch_fs *c; + u64 start_time; struct extent_pick_ptr pick; struct work_struct work; struct bio bio; @@ -71,11 +72,10 @@ void bch2_btree_build_aux_trees(struct btree *); void bch2_btree_init_next(struct bch_fs *, struct btree *, struct btree_iter *); -void bch2_btree_node_read_done(struct bch_fs *, struct btree *, - struct bch_dev *, const struct bch_extent_ptr *); +int bch2_btree_node_read_done(struct bch_fs *, struct btree *); void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); int bch2_btree_root_read(struct bch_fs *, enum btree_id, - const struct bkey_i *, unsigned); + const struct bkey_i *, unsigned); void bch2_btree_complete_write(struct bch_fs *, struct btree *, struct btree_write *); diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index 46df99f..8ad0895 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -247,14 +247,12 @@ fail: return false; } -static int __bch2_btree_iter_unlock(struct btree_iter *iter) +static void __bch2_btree_iter_unlock(struct btree_iter *iter) { while (iter->nodes_locked) btree_node_unlock(iter, __ffs(iter->nodes_locked)); iter->flags &= ~BTREE_ITER_UPTODATE; - - return iter->flags & BTREE_ITER_ERROR ? 
-EIO : 0; } int bch2_btree_iter_unlock(struct btree_iter *iter) @@ -263,7 +261,9 @@ int bch2_btree_iter_unlock(struct btree_iter *iter) for_each_linked_btree_iter(iter, linked) __bch2_btree_iter_unlock(linked); - return __bch2_btree_iter_unlock(iter); + __bch2_btree_iter_unlock(iter); + + return iter->flags & BTREE_ITER_ERROR ? -EIO : 0; } /* Btree iterator: */ @@ -617,13 +617,9 @@ bool bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) void bch2_btree_iter_node_drop_linked(struct btree_iter *iter, struct btree *b) { struct btree_iter *linked; - unsigned level = b->level; for_each_linked_btree_iter(iter, linked) - if (linked->nodes[level] == b) { - btree_node_unlock(linked, level); - linked->nodes[level] = BTREE_ITER_NOT_END; - } + bch2_btree_iter_node_drop(linked, b); } void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) @@ -631,9 +627,9 @@ void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) unsigned level = b->level; if (iter->nodes[level] == b) { - BUG_ON(b->lock.state.intent_lock != 1); btree_node_unlock(iter, level); iter->nodes[level] = BTREE_ITER_NOT_END; + iter->flags &= ~BTREE_ITER_UPTODATE; } } @@ -718,7 +714,8 @@ static void btree_iter_prefetch(struct btree_iter *iter) break; bch2_bkey_unpack(b, &tmp.k, k); - bch2_btree_node_prefetch(iter, &tmp.k, iter->level); + bch2_btree_node_prefetch(iter->c, &tmp.k, + iter->level, iter->btree_id); } if (!was_locked) @@ -735,7 +732,7 @@ static inline int btree_iter_down(struct btree_iter *iter) bkey_reassemble(&tmp.k, k); - b = bch2_btree_node_get(iter, &tmp.k, level, lock_type); + b = bch2_btree_node_get(iter->c, iter, &tmp.k, level, lock_type); if (unlikely(IS_ERR(b))) return PTR_ERR(b); @@ -907,6 +904,8 @@ int __must_check bch2_btree_iter_traverse(struct btree_iter *iter) { int ret; + iter->flags &= ~BTREE_ITER_UPTODATE; + if (unlikely(!iter->nodes[iter->level])) return 0; @@ -1064,11 +1063,14 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) struct btree *b = iter->nodes[0]; struct bkey_packed *k = __bch2_btree_node_iter_peek_all(&iter->node_iters[0], b); - - return (struct bkey_s_c) { + struct bkey_s_c ret = { .k = &iter->k, .v = bkeyp_val(&b->format, k) }; + + if (debug_check_bkeys(iter->c)) + bch2_bkey_debugcheck(iter->c, b, ret); + return ret; } while (1) { diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index 0945ea8..86c1954 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -10,6 +10,7 @@ */ #include "btree_iter.h" +#include "btree_io.h" #include "six.h" /* matches six lock types */ diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 7085feb..d3ba28b 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -11,7 +11,7 @@ #include "six.h" struct open_bucket; -struct btree_interior_update; +struct btree_update; #define MAX_BSETS 3U @@ -105,7 +105,7 @@ struct btree { * node to point to them: we update the parent in memory immediately, * but then we must wait until the children have been written out before * the update to the parent can be written - this is a list of the - * btree_interior_updates that are blocking this node from being + * btree_updates that are blocking this node from being * written: */ struct list_head write_blocked; @@ -116,7 +116,7 @@ struct btree { * another write - because that write also won't yet be reachable and * marking it as completed before it's reachable would be incorrect: */ - struct btree_interior_update *will_make_reachable; + struct 
btree_update *will_make_reachable; struct open_bucket *ob; @@ -265,7 +265,7 @@ static inline bool btree_node_is_extents(struct btree *b) struct btree_root { struct btree *b; - struct btree_interior_update *as; + struct btree_update *as; /* On disk root - see async splits: */ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); @@ -312,6 +312,11 @@ enum btree_gc_coalesce_fail_reason { BTREE_GC_COALESCE_FAIL_FORMAT_FITS, }; +enum btree_node_sibling { + btree_prev_sib, + btree_next_sib, +}; + typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *, struct btree *, struct btree_node_iter *); diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index 086077f..584f0f5 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -1,310 +1,24 @@ -#ifndef _BCACHE_BTREE_INSERT_H -#define _BCACHE_BTREE_INSERT_H +#ifndef _BCACHE_BTREE_UPDATE_H +#define _BCACHE_BTREE_UPDATE_H -#include "btree_cache.h" #include "btree_iter.h" -#include "buckets.h" #include "journal.h" -#include "vstructs.h" struct bch_fs; -struct bkey_format_state; -struct bkey_format; struct btree; +struct btree_insert; -static inline void btree_node_reset_sib_u64s(struct btree *b) -{ - b->sib_u64s[0] = b->nr.live_u64s; - b->sib_u64s[1] = b->nr.live_u64s; -} - -struct btree_reserve { - struct disk_reservation disk_res; - unsigned nr; - struct btree *b[BTREE_RESERVE_MAX]; -}; - -void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *); -bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *, - struct bkey_format *); - -/* Btree node freeing/allocation: */ - -/* - * Tracks a btree node that has been (or is about to be) freed in memory, but - * has _not_ yet been freed on disk (because the write that makes the new - * node(s) visible and frees the old hasn't completed yet) - */ -struct pending_btree_node_free { - bool index_update_done; - - __le64 seq; - enum btree_id btree_id; - unsigned level; - __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); -}; - -/* - * Tracks an in progress split/rewrite of a btree node and the update to the - * parent node: - * - * When we split/rewrite a node, we do all the updates in memory without - * waiting for any writes to complete - we allocate the new node(s) and update - * the parent node, possibly recursively up to the root. - * - * The end result is that we have one or more new nodes being written - - * possibly several, if there were multiple splits - and then a write (updating - * an interior node) which will make all these new nodes visible. - * - * Additionally, as we split/rewrite nodes we free the old nodes - but the old - * nodes can't be freed (their space on disk can't be reclaimed) until the - * update to the interior node that makes the new node visible completes - - * until then, the old nodes are still reachable on disk. - * - */ -struct btree_interior_update { - struct closure cl; - struct bch_fs *c; - - struct list_head list; - - /* What kind of update are we doing? */ - enum { - BTREE_INTERIOR_NO_UPDATE, - BTREE_INTERIOR_UPDATING_NODE, - BTREE_INTERIOR_UPDATING_ROOT, - BTREE_INTERIOR_UPDATING_AS, - } mode; - - unsigned flags; - struct btree_reserve *reserve; - - /* - * BTREE_INTERIOR_UPDATING_NODE: - * The update that made the new nodes visible was a regular update to an - * existing interior node - @b. 
We can't write out the update to @b - * until the new nodes we created are finished writing, so we block @b - * from writing by putting this btree_interior update on the - * @b->write_blocked list with @write_blocked_list: - */ - struct btree *b; - struct list_head write_blocked_list; - - /* - * BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now - * we're now blocking another btree_interior_update - * @parent_as - btree_interior_update that's waiting on our nodes to finish - * writing, before it can make new nodes visible on disk - * @wait - list of child btree_interior_updates that are waiting on this - * btree_interior_update to make all the new nodes visible before they can free - * their old btree nodes - */ - struct btree_interior_update *parent_as; - struct closure_waitlist wait; - - /* - * We may be freeing nodes that were dirty, and thus had journal entries - * pinned: we need to transfer the oldest of those pins to the - * btree_interior_update operation, and release it when the new node(s) - * are all persistent and reachable: - */ - struct journal_entry_pin journal; - - u64 journal_seq; - - /* - * Nodes being freed: - * Protected by c->btree_node_pending_free_lock - */ - struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES]; - unsigned nr_pending; - - /* New nodes, that will be made reachable by this update: */ - struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES]; - unsigned nr_new_nodes; - - /* Only here to reduce stack usage on recursive splits: */ - struct keylist parent_keys; - /* - * Enough room for btree_split's keys without realloc - btree node - * pointers never have crc/compression info, so we only need to acount - * for the pointers for three keys - */ - u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; -}; - -#define BTREE_INTERIOR_UPDATE_MUST_REWRITE (1 << 0) - -#define for_each_pending_btree_node_free(c, as, p) \ - list_for_each_entry(as, &c->btree_interior_update_list, list) \ - for (p = as->pending; p < as->pending + as->nr_pending; p++) - -void bch2_btree_node_free_inmem(struct btree_iter *, struct btree *); -void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *); -void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *); - -struct btree *__bch2_btree_node_alloc_replacement(struct bch_fs *, - struct btree *, - struct bkey_format, - struct btree_interior_update *, - struct btree_reserve *); - -struct btree_interior_update * -bch2_btree_interior_update_alloc(struct bch_fs *); - -void bch2_btree_interior_update_will_free_node(struct bch_fs *, - struct btree_interior_update *, - struct btree *); - -void bch2_btree_set_root_initial(struct bch_fs *, struct btree *, - struct btree_reserve *); - -void bch2_btree_reserve_put(struct bch_fs *, struct btree_reserve *); -struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *, - struct btree *, unsigned, - unsigned, struct closure *); - -int bch2_btree_root_alloc(struct bch_fs *, enum btree_id, struct closure *); - -/* Inserting into a given leaf node (last stage of insert): */ - +void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *, + struct btree_iter *); bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, - struct btree_node_iter *, struct bkey_i *); + struct btree_node_iter *, struct bkey_i *); void bch2_btree_journal_key(struct btree_insert *trans, struct btree_iter *, - struct bkey_i *); - -static inline void *btree_data_end(struct bch_fs *c, struct btree *b) -{ - return (void *) b->data + btree_bytes(c); -} - 
-static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c, - struct btree *b) -{ - return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s); -} - -static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c, - struct btree *b) -{ - return btree_data_end(c, b); -} - -static inline void *write_block(struct btree *b) -{ - return (void *) b->data + (b->written << 9); -} - -static inline bool bset_written(struct btree *b, struct bset *i) -{ - return (void *) i < write_block(b); -} - -static inline bool bset_unwritten(struct btree *b, struct bset *i) -{ - return (void *) i > write_block(b); -} - -static inline unsigned bset_end_sector(struct bch_fs *c, struct btree *b, - struct bset *i) -{ - return round_up(bset_byte_offset(b, vstruct_end(i)), - block_bytes(c)) >> 9; -} - -static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, - struct btree *b) -{ - struct bset *i = btree_bset_last(b); - unsigned used = bset_byte_offset(b, vstruct_end(i)) / sizeof(u64) + - b->whiteout_u64s + - b->uncompacted_whiteout_u64s; - unsigned total = c->sb.btree_node_size << 6; - - EBUG_ON(used > total); - - if (bset_written(b, i)) - return 0; - - return total - used; -} - -static inline unsigned btree_write_set_buffer(struct btree *b) -{ - /* - * Could buffer up larger amounts of keys for btrees with larger keys, - * pending benchmarking: - */ - return 4 << 10; -} - -static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, - struct btree *b) -{ - struct bset *i = btree_bset_last(b); - unsigned offset = max_t(unsigned, b->written << 9, - bset_byte_offset(b, vstruct_end(i))); - ssize_t n = (ssize_t) btree_bytes(c) - (ssize_t) - (offset + sizeof(struct btree_node_entry) + - b->whiteout_u64s * sizeof(u64) + - b->uncompacted_whiteout_u64s * sizeof(u64)); - - EBUG_ON(offset > btree_bytes(c)); - - if ((unlikely(bset_written(b, i)) && n > 0) || - (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) && - n > btree_write_set_buffer(b))) - return (void *) b->data + offset; - - return NULL; -} - -/* - * write lock must be held on @b (else the dirty bset that we were going to - * insert into could be written out from under us) - */ -static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, - struct btree *b, unsigned u64s) -{ - if (btree_node_is_extents(b)) { - /* The insert key might split an existing key - * (bch2_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case: - */ - u64s += BKEY_EXTENT_U64s_MAX; - } - - return u64s <= bch_btree_keys_u64s_remaining(c, b); -} - -static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t, - struct bkey_packed *k) -{ - if (bset_written(b, bset(b, t))) { - EBUG_ON(b->uncompacted_whiteout_u64s < - bkeyp_key_u64s(&b->format, k)); - b->uncompacted_whiteout_u64s -= - bkeyp_key_u64s(&b->format, k); - } -} - -static inline void reserve_whiteout(struct btree *b, struct bset_tree *t, - struct bkey_packed *k) -{ - if (bset_written(b, bset(b, t))) { - BUG_ON(!k->needs_whiteout); - b->uncompacted_whiteout_u64s += - bkeyp_key_u64s(&b->format, k); - } -} - -void bch2_btree_insert_node(struct btree *, struct btree_iter *, - struct keylist *, struct btree_reserve *, - struct btree_interior_update *as); + struct bkey_i *); /* Normal update interface: */ struct btree_insert { - struct bch_fs *c; + struct bch_fs *c; struct disk_reservation *disk_res; struct journal_res journal_res; u64 *journal_seq; @@ -403,25 +117,6 @@ int bch2_btree_insert_list_at(struct btree_iter *, struct keylist *, struct disk_reservation 
*, struct extent_insert_hook *, u64 *, unsigned); -static inline bool journal_res_insert_fits(struct btree_insert *trans, - struct btree_insert_entry *insert) -{ - unsigned u64s = 0; - struct btree_insert_entry *i; - - /* - * If we didn't get a journal reservation, we're in journal replay and - * we're not journalling updates: - */ - if (!trans->journal_res.ref) - return true; - - for (i = insert; i < trans->entries + trans->nr; i++) - u64s += jset_u64s(i->k->k.u64s + i->extra_res); - - return u64s <= trans->journal_res.u64s; -} - int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, struct disk_reservation *, struct extent_insert_hook *, u64 *, int flags); @@ -438,5 +133,5 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, int bch2_btree_node_update_key(struct bch_fs *, struct btree *, struct bkey_i_extent *); -#endif /* _BCACHE_BTREE_INSERT_H */ +#endif /* _BCACHE_BTREE_UPDATE_H */ diff --git a/libbcachefs/btree_update.c b/libbcachefs/btree_update_interior.c similarity index 55% rename from libbcachefs/btree_update.c rename to libbcachefs/btree_update_interior.c index c7b2018..350e2f9 100644 --- a/libbcachefs/btree_update.c +++ b/libbcachefs/btree_update_interior.c @@ -5,6 +5,7 @@ #include "btree_cache.h" #include "btree_gc.h" #include "btree_update.h" +#include "btree_update_interior.h" #include "btree_io.h" #include "btree_iter.h" #include "btree_locking.h" @@ -15,17 +16,54 @@ #include "super-io.h" #include -#include #include -static void btree_interior_update_updated_root(struct bch_fs *, - struct btree_interior_update *, - enum btree_id); -static void btree_interior_update_will_make_reachable(struct bch_fs *, - struct btree_interior_update *, - struct btree *); -static void btree_interior_update_drop_new_node(struct bch_fs *, - struct btree *); +static void btree_node_will_make_reachable(struct btree_update *, + struct btree *); +static void btree_update_drop_new_node(struct bch_fs *, struct btree *); +static void bch2_btree_set_root_ondisk(struct bch_fs *, struct btree *); + +/* Debug code: */ + +static void btree_node_interior_verify(struct btree *b) +{ + struct btree_node_iter iter; + struct bkey_packed *k; + + BUG_ON(!b->level); + + bch2_btree_node_iter_init(&iter, b, b->key.k.p, false, false); +#if 1 + BUG_ON(!(k = bch2_btree_node_iter_peek(&iter, b)) || + bkey_cmp_left_packed(b, k, &b->key.k.p)); + + BUG_ON((bch2_btree_node_iter_advance(&iter, b), + !bch2_btree_node_iter_end(&iter))); +#else + const char *msg; + + msg = "not found"; + k = bch2_btree_node_iter_peek(&iter, b); + if (!k) + goto err; + + msg = "isn't what it should be"; + if (bkey_cmp_left_packed(b, k, &b->key.k.p)) + goto err; + + bch2_btree_node_iter_advance(&iter, b); + + msg = "isn't last key"; + if (!bch2_btree_node_iter_end(&iter)) + goto err; + return; +err: + bch2_dump_btree_node(b); + printk(KERN_ERR "last key %llu:%llu %s\n", b->key.k.p.inode, + b->key.k.p.offset, msg); + BUG(); +#endif +} /* Calculate ideal packed bkey format for new btree nodes: */ @@ -81,7 +119,7 @@ static size_t btree_node_u64s_with_format(struct btree *b, * the re-packed keys would fit inside the node itself. 
*/ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, - struct bkey_format *new_f) + struct bkey_format *new_f) { size_t u64s = btree_node_u64s_with_format(b, new_f); @@ -90,38 +128,60 @@ bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, /* Btree node freeing/allocation: */ +static bool btree_key_matches(struct bch_fs *c, + struct bkey_s_c_extent l, + struct bkey_s_c_extent r) +{ + const struct bch_extent_ptr *ptr1, *ptr2; + + extent_for_each_ptr(l, ptr1) + extent_for_each_ptr(r, ptr2) + if (ptr1->dev == ptr2->dev && + ptr1->gen == ptr2->gen && + ptr1->offset == ptr2->offset) + return true; + + return false; +} + /* * We're doing the index update that makes @b unreachable, update stuff to * reflect that: * - * Must be called _before_ btree_interior_update_updated_root() or - * btree_interior_update_updated_btree: + * Must be called _before_ btree_update_updated_root() or + * btree_update_updated_node: */ -static void bch2_btree_node_free_index(struct bch_fs *c, struct btree *b, - enum btree_id id, struct bkey_s_c k, - struct bch_fs_usage *stats) +static void bch2_btree_node_free_index(struct btree_update *as, struct btree *b, + struct bkey_s_c k, + struct bch_fs_usage *stats) { - struct btree_interior_update *as; + struct bch_fs *c = as->c; struct pending_btree_node_free *d; + unsigned replicas; + /* + * btree_update lock is only needed here to avoid racing with + * gc: + */ mutex_lock(&c->btree_interior_update_lock); - for_each_pending_btree_node_free(c, as, d) + for (d = as->pending; d < as->pending + as->nr_pending; d++) if (!bkey_cmp(k.k->p, d->key.k.p) && - bkey_val_bytes(k.k) == bkey_val_bytes(&d->key.k) && - !memcmp(k.v, &d->key.v, bkey_val_bytes(k.k))) + btree_key_matches(c, bkey_s_c_to_extent(k), + bkey_i_to_s_c_extent(&d->key))) goto found; - BUG(); found: + BUG_ON(d->index_update_done); d->index_update_done = true; /* * Btree nodes are accounted as freed in bch_alloc_stats when they're * freed from the index: */ - stats->s[S_COMPRESSED][S_META] -= c->sb.btree_node_size; - stats->s[S_UNCOMPRESSED][S_META] -= c->sb.btree_node_size; + replicas = bch2_extent_nr_dirty_ptrs(k); + if (replicas) + stats->s[replicas - 1].data[S_META] -= c->sb.btree_node_size; /* * We're dropping @k from the btree, but it's still live until the @@ -150,7 +210,7 @@ found: bch2_mark_key(c, bkey_i_to_s_c(&d->key), -c->sb.btree_node_size, true, b ? 
gc_pos_btree_node(b) - : gc_pos_btree_root(id), + : gc_pos_btree_root(as->btree_id), &tmp, 0); /* * Don't apply tmp - pending deletes aren't tracked in @@ -196,7 +256,7 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) { struct open_bucket *ob = b->ob; - btree_interior_update_drop_new_node(c, b); + btree_update_drop_new_node(c, b); b->ob = NULL; @@ -207,17 +267,18 @@ void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) bch2_open_bucket_put(c, ob); } -void bch2_btree_node_free_inmem(struct btree_iter *iter, struct btree *b) +void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, + struct btree_iter *iter) { bch2_btree_iter_node_drop_linked(iter, b); - __btree_node_free(iter->c, b, iter); + __btree_node_free(c, b, iter); bch2_btree_iter_node_drop(iter, b); } static void bch2_btree_node_free_ondisk(struct bch_fs *c, - struct pending_btree_node_free *pending) + struct pending_btree_node_free *pending) { struct bch_fs_usage stats = { 0 }; @@ -304,18 +365,16 @@ mem_alloc: return b; } -static struct btree *bch2_btree_node_alloc(struct bch_fs *c, - unsigned level, enum btree_id id, - struct btree_interior_update *as, - struct btree_reserve *reserve) +static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level) { + struct bch_fs *c = as->c; struct btree *b; - BUG_ON(!reserve->nr); + BUG_ON(!as->reserve->nr); - b = reserve->b[--reserve->nr]; + b = as->reserve->b[--as->reserve->nr]; - BUG_ON(bch2_btree_node_hash_insert(c, b, level, id)); + BUG_ON(bch2_btree_node_hash_insert(c, b, level, as->btree_id)); set_btree_node_accessed(b); set_btree_node_dirty(b); @@ -324,27 +383,25 @@ static struct btree *bch2_btree_node_alloc(struct bch_fs *c, memset(&b->nr, 0, sizeof(b->nr)); b->data->magic = cpu_to_le64(bset_magic(c)); b->data->flags = 0; - SET_BTREE_NODE_ID(b->data, id); + SET_BTREE_NODE_ID(b->data, as->btree_id); SET_BTREE_NODE_LEVEL(b->data, level); b->data->ptr = bkey_i_to_extent(&b->key)->v.start->ptr; bch2_btree_build_aux_trees(b); - btree_interior_update_will_make_reachable(c, as, b); + btree_node_will_make_reachable(as, b); trace_btree_node_alloc(c, b); return b; } -struct btree *__bch2_btree_node_alloc_replacement(struct bch_fs *c, +struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, struct btree *b, - struct bkey_format format, - struct btree_interior_update *as, - struct btree_reserve *reserve) + struct bkey_format format) { struct btree *n; - n = bch2_btree_node_alloc(c, b->level, b->btree_id, as, reserve); + n = bch2_btree_node_alloc(as, b->level); n->data->min_key = b->data->min_key; n->data->max_key = b->data->max_key; @@ -352,7 +409,7 @@ struct btree *__bch2_btree_node_alloc_replacement(struct bch_fs *c, btree_node_set_format(n, format); - bch2_btree_sort_into(c, n, b); + bch2_btree_sort_into(as->c, n, b); btree_node_reset_sib_u64s(n); @@ -360,10 +417,8 @@ struct btree *__bch2_btree_node_alloc_replacement(struct bch_fs *c, return n; } -static struct btree *bch2_btree_node_alloc_replacement(struct bch_fs *c, - struct btree *b, - struct btree_interior_update *as, - struct btree_reserve *reserve) +static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, + struct btree *b) { struct bkey_format new_f = bch2_btree_calc_format(b); @@ -371,127 +426,15 @@ static struct btree *bch2_btree_node_alloc_replacement(struct bch_fs *c, * The keys might expand with the new format - if they wouldn't fit in * the btree node anymore, use the old format for now: */ - if 
(!bch2_btree_node_format_fits(c, b, &new_f)) + if (!bch2_btree_node_format_fits(as->c, b, &new_f)) new_f = b->format; - return __bch2_btree_node_alloc_replacement(c, b, new_f, as, reserve); -} - -static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b, - struct btree_reserve *btree_reserve) -{ - struct btree *old = btree_node_root(c, b); - - /* Root nodes cannot be reaped */ - mutex_lock(&c->btree_cache_lock); - list_del_init(&b->list); - mutex_unlock(&c->btree_cache_lock); - - mutex_lock(&c->btree_root_lock); - btree_node_root(c, b) = b; - mutex_unlock(&c->btree_root_lock); - - if (btree_reserve) { - /* - * New allocation (we're not being called because we're in - * bch2_btree_root_read()) - do marking while holding - * btree_root_lock: - */ - struct bch_fs_usage stats = { 0 }; - - bch2_mark_key(c, bkey_i_to_s_c(&b->key), - c->sb.btree_node_size, true, - gc_pos_btree_root(b->btree_id), - &stats, 0); - - if (old) - bch2_btree_node_free_index(c, NULL, old->btree_id, - bkey_i_to_s_c(&old->key), - &stats); - bch2_fs_usage_apply(c, &stats, &btree_reserve->disk_res, - gc_pos_btree_root(b->btree_id)); - } - - bch2_recalc_btree_reserve(c); -} - -static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b) -{ - struct btree_root *r = &c->btree_roots[b->btree_id]; - - mutex_lock(&c->btree_root_lock); - - BUG_ON(b != r->b); - bkey_copy(&r->key, &b->key); - r->level = b->level; - r->alive = true; - - mutex_unlock(&c->btree_root_lock); -} - -/* - * Only for filesystem bringup, when first reading the btree roots or allocating - * btree roots when initializing a new filesystem: - */ -void bch2_btree_set_root_initial(struct bch_fs *c, struct btree *b, - struct btree_reserve *btree_reserve) -{ - BUG_ON(btree_node_root(c, b)); - - bch2_btree_set_root_inmem(c, b, btree_reserve); - bch2_btree_set_root_ondisk(c, b); -} - -/** - * bch_btree_set_root - update the root in memory and on disk - * - * To ensure forward progress, the current task must not be holding any - * btree node write locks. However, you must hold an intent lock on the - * old root. - * - * Note: This allocates a journal entry but doesn't add any keys to - * it. All the btree roots are part of every journal write, so there - * is nothing new to be done. This just guarantees that there is a - * journal write. - */ -static void bch2_btree_set_root(struct btree_iter *iter, struct btree *b, - struct btree_interior_update *as, - struct btree_reserve *btree_reserve) -{ - struct bch_fs *c = iter->c; - struct btree *old; - - trace_btree_set_root(c, b); - BUG_ON(!b->written); - - old = btree_node_root(c, b); - - /* - * Ensure no one is using the old root while we switch to the - * new root: - */ - bch2_btree_node_lock_write(old, iter); - - bch2_btree_set_root_inmem(c, b, btree_reserve); - - btree_interior_update_updated_root(c, as, iter->btree_id); - - /* - * Unlock old root after new root is visible: - * - * The new root isn't persistent, but that's ok: we still have - * an intent lock on the new root, and any updates that would - * depend on the new root would have to update the new root. 
- */ - bch2_btree_node_unlock_write(old, iter); + return __bch2_btree_node_alloc_replacement(as, b, new_f); } -static struct btree *__btree_root_alloc(struct bch_fs *c, unsigned level, - enum btree_id id, - struct btree_interior_update *as, - struct btree_reserve *reserve) +static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) { - struct btree *b = bch2_btree_node_alloc(c, level, id, as, reserve); + struct btree *b = bch2_btree_node_alloc(as, level); b->data->min_key = POS_MIN; b->data->max_key = POS_MAX; @@ -506,7 +449,7 @@ static struct btree *__btree_root_alloc(struct bch_fs *c, unsigned level, return b; } -void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reserve) +static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reserve) { bch2_disk_reservation_put(c, &reserve->disk_res); @@ -540,10 +483,10 @@ void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reserve) mempool_free(reserve, &c->btree_reserve_pool); } -static struct btree_reserve *__bch2_btree_reserve_get(struct bch_fs *c, - unsigned nr_nodes, - unsigned flags, - struct closure *cl) +static struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c, + unsigned nr_nodes, + unsigned flags, + struct closure *cl) { struct btree_reserve *reserve; struct btree *b; @@ -609,443 +552,134 @@ err_free: return ERR_PTR(ret); } -struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *c, - struct btree *b, - unsigned extra_nodes, - unsigned flags, - struct closure *cl) +/* Asynchronous interior node update machinery */ + +static void bch2_btree_update_free(struct btree_update *as) { - unsigned depth = btree_node_root(c, b)->level - b->level; - unsigned nr_nodes = btree_reserve_required_nodes(depth) + extra_nodes; + struct bch_fs *c = as->c; + + BUG_ON(as->nr_new_nodes); + BUG_ON(as->nr_pending); + + if (as->reserve) + bch2_btree_reserve_put(c, as->reserve); + + mutex_lock(&c->btree_interior_update_lock); + list_del(&as->list); + mutex_unlock(&c->btree_interior_update_lock); - return __bch2_btree_reserve_get(c, nr_nodes, flags, cl); + closure_debug_destroy(&as->cl); + mempool_free(as, &c->btree_interior_update_pool); + percpu_ref_put(&c->writes); } -int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id, - struct closure *writes) +static void btree_update_nodes_reachable(struct closure *cl) { - struct btree_interior_update as; - struct btree_reserve *reserve; - struct closure cl; - struct btree *b; - - memset(&as, 0, sizeof(as)); - closure_init_stack(&cl); + struct btree_update *as = + container_of(cl, struct btree_update, cl); + struct bch_fs *c = as->c; - while (1) { - /* XXX haven't calculated capacity yet :/ */ - reserve = __bch2_btree_reserve_get(c, 1, 0, &cl); - if (!IS_ERR(reserve)) - break; + bch2_journal_pin_drop(&c->journal, &as->journal); - if (PTR_ERR(reserve) == -ENOSPC) - return PTR_ERR(reserve); + mutex_lock(&c->btree_interior_update_lock); - closure_sync(&cl); - } + while (as->nr_new_nodes) { + struct btree *b = as->new_nodes[--as->nr_new_nodes]; - b = __btree_root_alloc(c, 0, id, &as, reserve); + BUG_ON(b->will_make_reachable != as); + b->will_make_reachable = NULL; + mutex_unlock(&c->btree_interior_update_lock); - bch2_btree_node_write(c, b, writes, SIX_LOCK_intent); + six_lock_read(&b->lock); + bch2_btree_node_write_dirty(c, b, NULL, btree_node_need_write(b)); + six_unlock_read(&b->lock); + mutex_lock(&c->btree_interior_update_lock); + } - bch2_btree_set_root_initial(c, b, reserve); + while (as->nr_pending) + 
bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]); - btree_interior_update_drop_new_node(c, b); - bch2_btree_open_bucket_put(c, b); - six_unlock_intent(&b->lock); + mutex_unlock(&c->btree_interior_update_lock); - bch2_btree_reserve_put(c, reserve); + closure_wake_up(&as->wait); - return 0; + bch2_btree_update_free(as); } -static void bch2_insert_fixup_btree_ptr(struct btree_iter *iter, - struct btree *b, - struct bkey_i *insert, - struct btree_node_iter *node_iter, - struct disk_reservation *disk_res) +static void btree_update_nodes_written(struct closure *cl) { - struct bch_fs *c = iter->c; - struct bch_fs_usage stats = { 0 }; - struct bkey_packed *k; - struct bkey tmp; - - if (bkey_extent_is_data(&insert->k)) - bch2_mark_key(c, bkey_i_to_s_c(insert), - c->sb.btree_node_size, true, - gc_pos_btree_node(b), &stats, 0); - - while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && - !btree_iter_pos_cmp_packed(b, &insert->k.p, k, false)) - bch2_btree_node_iter_advance(node_iter, b); + struct btree_update *as = + container_of(cl, struct btree_update, cl); + struct bch_fs *c = as->c; + struct btree *b; /* - * If we're overwriting, look up pending delete and mark so that gc - * marks it on the pending delete list: + * We did an update to a parent node where the pointers we added pointed + * to child nodes that weren't written yet: now, the child nodes have + * been written so we can write out the update to the interior node. */ - if (k && !bkey_cmp_packed(b, k, &insert->k)) - bch2_btree_node_free_index(c, b, iter->btree_id, - bkey_disassemble(b, k, &tmp), - &stats); - - bch2_fs_usage_apply(c, &stats, disk_res, gc_pos_btree_node(b)); +retry: + mutex_lock(&c->btree_interior_update_lock); + switch (as->mode) { + case BTREE_INTERIOR_NO_UPDATE: + BUG(); + case BTREE_INTERIOR_UPDATING_NODE: + /* The usual case: */ + b = READ_ONCE(as->b); - bch2_btree_bset_insert_key(iter, b, node_iter, insert); - set_btree_node_dirty(b); - set_btree_node_need_write(b); -} + if (!six_trylock_read(&b->lock)) { + mutex_unlock(&c->btree_interior_update_lock); + six_lock_read(&b->lock); + six_unlock_read(&b->lock); + goto retry; + } -/* Inserting into a given leaf node (last stage of insert): */ + BUG_ON(!btree_node_dirty(b)); + closure_wait(&btree_current_write(b)->wait, cl); -/* Handle overwrites and do insert, for non extents: */ -bool bch2_btree_bset_insert_key(struct btree_iter *iter, - struct btree *b, - struct btree_node_iter *node_iter, - struct bkey_i *insert) -{ - const struct bkey_format *f = &b->format; - struct bkey_packed *k; - struct bset_tree *t; - unsigned clobber_u64s; + list_del(&as->write_blocked_list); + mutex_unlock(&c->btree_interior_update_lock); - EBUG_ON(btree_node_just_written(b)); - EBUG_ON(bset_written(b, btree_bset_last(b))); - EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); - EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 || - bkey_cmp(insert->k.p, b->data->max_key) > 0); - BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(iter->c, b)); + bch2_btree_node_write_dirty(c, b, NULL, + btree_node_need_write(b)); + six_unlock_read(&b->lock); + break; - k = bch2_btree_node_iter_peek_all(node_iter, b); - if (k && !bkey_cmp_packed(b, k, &insert->k)) { - BUG_ON(bkey_whiteout(k)); + case BTREE_INTERIOR_UPDATING_AS: + /* + * The btree node we originally updated has been freed and is + * being rewritten - so we need to write anything here, we just + * need to signal to that btree_update that it's ok to make the + * new replacement node visible: + */ + 
closure_put(&as->parent_as->cl); - t = bch2_bkey_to_bset(b, k); + /* + * and then we have to wait on that btree_update to finish: + */ + closure_wait(&as->parent_as->wait, cl); + mutex_unlock(&c->btree_interior_update_lock); + break; - if (bset_unwritten(b, bset(b, t)) && - bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) { - BUG_ON(bkey_whiteout(k) != bkey_whiteout(&insert->k)); + case BTREE_INTERIOR_UPDATING_ROOT: + /* b is the new btree root: */ + b = READ_ONCE(as->b); - k->type = insert->k.type; - memcpy_u64s(bkeyp_val(f, k), &insert->v, - bkey_val_u64s(&insert->k)); - return true; + if (!six_trylock_read(&b->lock)) { + mutex_unlock(&c->btree_interior_update_lock); + six_lock_read(&b->lock); + six_unlock_read(&b->lock); + goto retry; } - insert->k.needs_whiteout = k->needs_whiteout; - - btree_keys_account_key_drop(&b->nr, t - b->set, k); + BUG_ON(c->btree_roots[b->btree_id].as != as); + c->btree_roots[b->btree_id].as = NULL; - if (t == bset_tree_last(b)) { - clobber_u64s = k->u64s; - - /* - * If we're deleting, and the key we're deleting doesn't - * need a whiteout (it wasn't overwriting a key that had - * been written to disk) - just delete it: - */ - if (bkey_whiteout(&insert->k) && !k->needs_whiteout) { - bch2_bset_delete(b, k, clobber_u64s); - bch2_btree_node_iter_fix(iter, b, node_iter, t, - k, clobber_u64s, 0); - return true; - } - - goto overwrite; - } - - k->type = KEY_TYPE_DELETED; - bch2_btree_node_iter_fix(iter, b, node_iter, t, k, - k->u64s, k->u64s); - - if (bkey_whiteout(&insert->k)) { - reserve_whiteout(b, t, k); - return true; - } else { - k->needs_whiteout = false; - } - } else { - /* - * Deleting, but the key to delete wasn't found - nothing to do: - */ - if (bkey_whiteout(&insert->k)) - return false; - - insert->k.needs_whiteout = false; - } - - t = bset_tree_last(b); - k = bch2_btree_node_iter_bset_pos(node_iter, b, t); - clobber_u64s = 0; -overwrite: - bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); - if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k)) - bch2_btree_node_iter_fix(iter, b, node_iter, t, k, - clobber_u64s, k->u64s); - return true; -} - -static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, - unsigned i, u64 seq) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct btree_write *w = container_of(pin, struct btree_write, journal); - struct btree *b = container_of(w, struct btree, writes[i]); - - six_lock_read(&b->lock); - bch2_btree_node_write_dirty(c, b, NULL, - (btree_current_write(b) == w && - w->journal.pin_list == journal_seq_pin(j, seq))); - six_unlock_read(&b->lock); -} - -static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) -{ - return __btree_node_flush(j, pin, 0, seq); -} - -static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) -{ - return __btree_node_flush(j, pin, 1, seq); -} - -void bch2_btree_journal_key(struct btree_insert *trans, - struct btree_iter *iter, - struct bkey_i *insert) -{ - struct bch_fs *c = trans->c; - struct journal *j = &c->journal; - struct btree *b = iter->nodes[0]; - struct btree_write *w = btree_current_write(b); - - EBUG_ON(iter->level || b->level); - EBUG_ON(trans->journal_res.ref != - !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); - - if (!journal_pin_active(&w->journal)) - bch2_journal_pin_add(j, &trans->journal_res, - &w->journal, - btree_node_write_idx(b) == 0 - ? 
btree_node_flush0 - : btree_node_flush1); - - if (trans->journal_res.ref) { - u64 seq = trans->journal_res.seq; - bool needs_whiteout = insert->k.needs_whiteout; - - /* ick */ - insert->k.needs_whiteout = false; - bch2_journal_add_keys(j, &trans->journal_res, - b->btree_id, insert); - insert->k.needs_whiteout = needs_whiteout; - - bch2_journal_set_has_inode(j, &trans->journal_res, - insert->k.p.inode); - - if (trans->journal_seq) - *trans->journal_seq = seq; - btree_bset_last(b)->journal_seq = cpu_to_le64(seq); - } - - if (!btree_node_dirty(b)) - set_btree_node_dirty(b); -} - -static enum btree_insert_ret -bch2_insert_fixup_key(struct btree_insert *trans, - struct btree_insert_entry *insert) -{ - struct btree_iter *iter = insert->iter; - - BUG_ON(iter->level); - - if (bch2_btree_bset_insert_key(iter, - iter->nodes[0], - &iter->node_iters[0], - insert->k)) - bch2_btree_journal_key(trans, iter, insert->k); - - trans->did_work = true; - return BTREE_INSERT_OK; -} - -static void verify_keys_sorted(struct keylist *l) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - struct bkey_i *k; - - for_each_keylist_key(l, k) - BUG_ON(bkey_next(k) != l->top && - bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0); -#endif -} - -static void btree_node_lock_for_insert(struct btree *b, struct btree_iter *iter) -{ - struct bch_fs *c = iter->c; - - bch2_btree_node_lock_write(b, iter); - - if (btree_node_just_written(b) && - bch2_btree_post_write_cleanup(c, b)) - bch2_btree_iter_reinit_node(iter, b); - - /* - * If the last bset has been written, or if it's gotten too big - start - * a new bset to insert into: - */ - if (want_new_bset(c, b)) - bch2_btree_init_next(c, b, iter); -} - -/* Asynchronous interior node update machinery */ - -struct btree_interior_update * -bch2_btree_interior_update_alloc(struct bch_fs *c) -{ - struct btree_interior_update *as; - - as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); - memset(as, 0, sizeof(*as)); - closure_init(&as->cl, &c->cl); - as->c = c; - as->mode = BTREE_INTERIOR_NO_UPDATE; - INIT_LIST_HEAD(&as->write_blocked_list); - - bch2_keylist_init(&as->parent_keys, as->inline_keys, - ARRAY_SIZE(as->inline_keys)); - - mutex_lock(&c->btree_interior_update_lock); - list_add(&as->list, &c->btree_interior_update_list); - mutex_unlock(&c->btree_interior_update_lock); - - return as; -} - -static void btree_interior_update_free(struct closure *cl) -{ - struct btree_interior_update *as = - container_of(cl, struct btree_interior_update, cl); - - mempool_free(as, &as->c->btree_interior_update_pool); -} - -static void btree_interior_update_nodes_reachable(struct closure *cl) -{ - struct btree_interior_update *as = - container_of(cl, struct btree_interior_update, cl); - struct bch_fs *c = as->c; - - bch2_journal_pin_drop(&c->journal, &as->journal); - - mutex_lock(&c->btree_interior_update_lock); - - while (as->nr_new_nodes) { - struct btree *b = as->new_nodes[--as->nr_new_nodes]; - - BUG_ON(b->will_make_reachable != as); - b->will_make_reachable = NULL; - mutex_unlock(&c->btree_interior_update_lock); - - six_lock_read(&b->lock); - bch2_btree_node_write_dirty(c, b, NULL, btree_node_need_write(b)); - six_unlock_read(&b->lock); - mutex_lock(&c->btree_interior_update_lock); - } - - while (as->nr_pending) - bch2_btree_node_free_ondisk(c, &as->pending[--as->nr_pending]); - - list_del(&as->list); - mutex_unlock(&c->btree_interior_update_lock); - - closure_wake_up(&as->wait); - - closure_return_with_destructor(cl, btree_interior_update_free); -} - -static void btree_interior_update_nodes_written(struct closure 
*cl) -{ - struct btree_interior_update *as = - container_of(cl, struct btree_interior_update, cl); - struct bch_fs *c = as->c; - struct btree *b; - - if (bch2_journal_error(&c->journal)) { - /* XXX what? */ - /* we don't want to free the nodes on disk, that's what */ - } - - /* XXX: missing error handling, damnit */ - - /* check for journal error, bail out if we flushed */ - - /* - * We did an update to a parent node where the pointers we added pointed - * to child nodes that weren't written yet: now, the child nodes have - * been written so we can write out the update to the interior node. - */ -retry: - mutex_lock(&c->btree_interior_update_lock); - switch (as->mode) { - case BTREE_INTERIOR_NO_UPDATE: - BUG(); - case BTREE_INTERIOR_UPDATING_NODE: - /* The usual case: */ - b = READ_ONCE(as->b); - - if (!six_trylock_read(&b->lock)) { - mutex_unlock(&c->btree_interior_update_lock); - six_lock_read(&b->lock); - six_unlock_read(&b->lock); - goto retry; - } - - BUG_ON(!btree_node_dirty(b)); - closure_wait(&btree_current_write(b)->wait, cl); - - list_del(&as->write_blocked_list); - mutex_unlock(&c->btree_interior_update_lock); - - bch2_btree_node_write_dirty(c, b, NULL, - btree_node_need_write(b)); - six_unlock_read(&b->lock); - break; - - case BTREE_INTERIOR_UPDATING_AS: - /* - * The btree node we originally updated has been freed and is - * being rewritten - so we need to write anything here, we just - * need to signal to that btree_interior_update that it's ok to make the - * new replacement node visible: - */ - closure_put(&as->parent_as->cl); - - /* - * and then we have to wait on that btree_interior_update to finish: - */ - closure_wait(&as->parent_as->wait, cl); - mutex_unlock(&c->btree_interior_update_lock); - break; - - case BTREE_INTERIOR_UPDATING_ROOT: - /* b is the new btree root: */ - b = READ_ONCE(as->b); - - if (!six_trylock_read(&b->lock)) { - mutex_unlock(&c->btree_interior_update_lock); - six_lock_read(&b->lock); - six_unlock_read(&b->lock); - goto retry; - } - - BUG_ON(c->btree_roots[b->btree_id].as != as); - c->btree_roots[b->btree_id].as = NULL; - - bch2_btree_set_root_ondisk(c, b); + bch2_btree_set_root_ondisk(c, b); /* * We don't have to wait anything anything here (before - * btree_interior_update_nodes_reachable frees the old nodes + * btree_update_nodes_reachable frees the old nodes * ondisk) - we've ensured that the very next journal write will * have the pointer to the new root, and before the allocator * can reuse the old nodes it'll have to do a journal commit: @@ -1076,17 +710,17 @@ retry: break; } - continue_at(cl, btree_interior_update_nodes_reachable, system_wq); + continue_at(cl, btree_update_nodes_reachable, system_wq); } /* * We're updating @b with pointers to nodes that haven't finished writing yet: * block @b from being written until @as completes */ -static void btree_interior_update_updated_btree(struct bch_fs *c, - struct btree_interior_update *as, - struct btree *b) +static void btree_update_updated_node(struct btree_update *as, struct btree *b) { + struct bch_fs *c = as->c; + mutex_lock(&c->btree_interior_update_lock); BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); @@ -1125,24 +759,22 @@ static void btree_interior_update_updated_btree(struct bch_fs *c, * hasn't been yet). 
*/ bch2_journal_wait_on_seq(&c->journal, as->journal_seq, &as->cl); - - continue_at(&as->cl, btree_interior_update_nodes_written, - system_freezable_wq); } static void interior_update_flush(struct journal *j, struct journal_entry_pin *pin, u64 seq) { - struct btree_interior_update *as = - container_of(pin, struct btree_interior_update, journal); + struct btree_update *as = + container_of(pin, struct btree_update, journal); bch2_journal_flush_seq_async(j, as->journal_seq, NULL); } -static void btree_interior_update_reparent(struct bch_fs *c, - struct btree_interior_update *as, - struct btree_interior_update *child) +static void btree_update_reparent(struct btree_update *as, + struct btree_update *child) { + struct bch_fs *c = as->c; + child->b = NULL; child->mode = BTREE_INTERIOR_UPDATING_AS; child->parent_as = as; @@ -1151,11 +783,11 @@ static void btree_interior_update_reparent(struct bch_fs *c, /* * When we write a new btree root, we have to drop our journal pin * _before_ the new nodes are technically reachable; see - * btree_interior_update_nodes_written(). + * btree_update_nodes_written(). * * This goes for journal pins that are recursively blocked on us - so, * just transfer the journal pin to the new interior update so - * btree_interior_update_nodes_written() can drop it. + * btree_update_nodes_written() can drop it. */ bch2_journal_pin_add_if_older(&c->journal, &child->journal, &as->journal, interior_update_flush); @@ -1164,11 +796,10 @@ static void btree_interior_update_reparent(struct bch_fs *c, as->journal_seq = max(as->journal_seq, child->journal_seq); } -static void btree_interior_update_updated_root(struct bch_fs *c, - struct btree_interior_update *as, - enum btree_id btree_id) +static void btree_update_updated_root(struct btree_update *as) { - struct btree_root *r = &c->btree_roots[btree_id]; + struct bch_fs *c = as->c; + struct btree_root *r = &c->btree_roots[as->btree_id]; mutex_lock(&c->btree_interior_update_lock); @@ -1176,10 +807,10 @@ static void btree_interior_update_updated_root(struct bch_fs *c, /* * Old root might not be persistent yet - if so, redirect its - * btree_interior_update operation to point to us: + * btree_update operation to point to us: */ if (r->as) - btree_interior_update_reparent(c, as, r->as); + btree_update_reparent(as, r->as); as->mode = BTREE_INTERIOR_UPDATING_ROOT; as->b = r->b; @@ -1190,22 +821,20 @@ static void btree_interior_update_updated_root(struct bch_fs *c, /* * When we're rewriting nodes and updating interior nodes, there's an * issue with updates that haven't been written in the journal getting - * mixed together with older data - see * btree_interior_update_updated_btree() + * mixed together with older data - see btree_update_updated_node() * for the explanation. * * However, this doesn't affect us when we're writing a new btree root - * because to make that new root reachable we have to write out a new * journal entry, which must necessarily be newer than as->journal_seq. 
*/ - - continue_at(&as->cl, btree_interior_update_nodes_written, - system_freezable_wq); } -static void btree_interior_update_will_make_reachable(struct bch_fs *c, - struct btree_interior_update *as, - struct btree *b) +static void btree_node_will_make_reachable(struct btree_update *as, + struct btree *b) { + struct bch_fs *c = as->c; + mutex_lock(&c->btree_interior_update_lock); BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes)); BUG_ON(b->will_make_reachable); @@ -1217,7 +846,7 @@ static void btree_interior_update_will_make_reachable(struct bch_fs *c, static void __btree_interior_update_drop_new_node(struct btree *b) { - struct btree_interior_update *as = b->will_make_reachable; + struct btree_update *as = b->will_make_reachable; unsigned i; BUG_ON(!as); @@ -1235,18 +864,17 @@ found: b->will_make_reachable = NULL; } -static void btree_interior_update_drop_new_node(struct bch_fs *c, - struct btree *b) +static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) { mutex_lock(&c->btree_interior_update_lock); __btree_interior_update_drop_new_node(b); mutex_unlock(&c->btree_interior_update_lock); } -static void bch2_btree_interior_update_add_node_reference(struct bch_fs *c, - struct btree_interior_update *as, - struct btree *b) +static void btree_interior_update_add_node_reference(struct btree_update *as, + struct btree *b) { + struct bch_fs *c = as->c; struct pending_btree_node_free *d; mutex_lock(&c->btree_interior_update_lock); @@ -1266,19 +894,19 @@ static void bch2_btree_interior_update_add_node_reference(struct bch_fs *c, /* * @b is being split/rewritten: it may have pointers to not-yet-written btree - * nodes and thus outstanding btree_interior_updates - redirect @b's - * btree_interior_updates to point to this btree_interior_update: + * nodes and thus outstanding btree_updates - redirect @b's + * btree_updates to point to this btree_update: */ -void bch2_btree_interior_update_will_free_node(struct bch_fs *c, - struct btree_interior_update *as, - struct btree *b) +void bch2_btree_interior_update_will_free_node(struct btree_update *as, + struct btree *b) { + struct bch_fs *c = as->c; struct closure *cl, *cl_n; - struct btree_interior_update *p, *n; + struct btree_update *p, *n; struct btree_write *w; struct bset_tree *t; - bch2_btree_interior_update_add_node_reference(c, as, b); + btree_interior_update_add_node_reference(as, b); /* * Does this node have data that hasn't been written in the journal? @@ -1294,16 +922,16 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c, mutex_lock(&c->btree_interior_update_lock); /* - * Does this node have any btree_interior_update operations preventing + * Does this node have any btree_update operations preventing * it from being written? * - * If so, redirect them to point to this btree_interior_update: we can + * If so, redirect them to point to this btree_update: we can * write out our new nodes, but we won't make them visible until those * operations complete */ list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { list_del(&p->write_blocked_list); - btree_interior_update_reparent(c, as, p); + btree_update_reparent(as, p); } clear_btree_node_dirty(b); @@ -1316,7 +944,7 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c, /* * Does this node have unwritten data that has a pin on the journal? 
* - * If so, transfer that pin to the btree_interior_update operation - + * If so, transfer that pin to the btree_update operation - * note that if we're freeing multiple nodes, we only need to keep the * oldest pin of any of the nodes we're freeing. We'll release the pin * when the new nodes are persistent and reachable on disk: @@ -1336,125 +964,200 @@ void bch2_btree_interior_update_will_free_node(struct bch_fs *c, mutex_unlock(&c->btree_interior_update_lock); } -static void btree_node_interior_verify(struct btree *b) +void bch2_btree_update_done(struct btree_update *as) { - struct btree_node_iter iter; - struct bkey_packed *k; - - BUG_ON(!b->level); - - bch2_btree_node_iter_init(&iter, b, b->key.k.p, false, false); -#if 1 - BUG_ON(!(k = bch2_btree_node_iter_peek(&iter, b)) || - bkey_cmp_left_packed(b, k, &b->key.k.p)); - - BUG_ON((bch2_btree_node_iter_advance(&iter, b), - !bch2_btree_node_iter_end(&iter))); -#else - const char *msg; - - msg = "not found"; - k = bch2_btree_node_iter_peek(&iter, b); - if (!k) - goto err; - - msg = "isn't what it should be"; - if (bkey_cmp_left_packed(b, k, &b->key.k.p)) - goto err; + BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); - bch2_btree_node_iter_advance(&iter, b); + bch2_btree_reserve_put(as->c, as->reserve); + as->reserve = NULL; - msg = "isn't last key"; - if (!bch2_btree_node_iter_end(&iter)) - goto err; - return; -err: - bch2_dump_btree_node(b); - printk(KERN_ERR "last key %llu:%llu %s\n", b->key.k.p.inode, - b->key.k.p.offset, msg); - BUG(); -#endif + continue_at(&as->cl, btree_update_nodes_written, system_freezable_wq); } -static int -bch2_btree_insert_keys_interior(struct btree *b, - struct btree_iter *iter, - struct keylist *insert_keys, - struct btree_interior_update *as, - struct btree_reserve *res) +struct btree_update * +bch2_btree_update_start(struct bch_fs *c, enum btree_id id, + unsigned nr_nodes, unsigned flags, + struct closure *cl) { - struct bch_fs *c = iter->c; - struct btree_iter *linked; - struct btree_node_iter node_iter; - struct bkey_i *insert = bch2_keylist_front(insert_keys); - struct bkey_packed *k; - - BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level)); - BUG_ON(!b->level); - BUG_ON(!as || as->b); - verify_keys_sorted(insert_keys); + struct btree_reserve *reserve; + struct btree_update *as; - btree_node_lock_for_insert(b, iter); + if (unlikely(!percpu_ref_tryget(&c->writes))) + return ERR_PTR(-EROFS); - if (bch_keylist_u64s(insert_keys) > - bch_btree_keys_u64s_remaining(c, b)) { - bch2_btree_node_unlock_write(b, iter); - return -1; + reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl); + if (IS_ERR(reserve)) { + percpu_ref_put(&c->writes); + return ERR_CAST(reserve); } - /* Don't screw up @iter's position: */ - node_iter = iter->node_iters[b->level]; + as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); + memset(as, 0, sizeof(*as)); + closure_init(&as->cl, NULL); + as->c = c; + as->mode = BTREE_INTERIOR_NO_UPDATE; + as->btree_id = id; + as->reserve = reserve; + INIT_LIST_HEAD(&as->write_blocked_list); - /* - * btree_split(), btree_gc_coalesce() will insert keys before - * the iterator's current position - they know the keys go in - * the node the iterator points to: - */ - while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && - (bkey_cmp_packed(b, k, &insert->k) >= 0)) - ; + bch2_keylist_init(&as->parent_keys, as->inline_keys, + ARRAY_SIZE(as->inline_keys)); - while (!bch2_keylist_empty(insert_keys)) { - insert = bch2_keylist_front(insert_keys); + 
mutex_lock(&c->btree_interior_update_lock); + list_add(&as->list, &c->btree_interior_update_list); + mutex_unlock(&c->btree_interior_update_lock); - bch2_insert_fixup_btree_ptr(iter, b, insert, - &node_iter, &res->disk_res); - bch2_keylist_pop_front(insert_keys); - } + return as; +} - btree_interior_update_updated_btree(c, as, b); +/* Btree root updates: */ - for_each_linked_btree_node(iter, b, linked) - bch2_btree_node_iter_peek(&linked->node_iters[b->level], - b); - bch2_btree_node_iter_peek(&iter->node_iters[b->level], b); +static void __bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) +{ + /* Root nodes cannot be reaped */ + mutex_lock(&c->btree_cache_lock); + list_del_init(&b->list); + mutex_unlock(&c->btree_cache_lock); - bch2_btree_iter_verify(iter, b); + mutex_lock(&c->btree_root_lock); + btree_node_root(c, b) = b; + mutex_unlock(&c->btree_root_lock); - if (bch2_maybe_compact_whiteouts(c, b)) - bch2_btree_iter_reinit_node(iter, b); + bch2_recalc_btree_reserve(c); +} - bch2_btree_node_unlock_write(b, iter); +static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) +{ + struct bch_fs *c = as->c; + struct btree *old = btree_node_root(c, b); + struct bch_fs_usage stats = { 0 }; - btree_node_interior_verify(b); - return 0; + __bch2_btree_set_root_inmem(c, b); + + bch2_mark_key(c, bkey_i_to_s_c(&b->key), + c->sb.btree_node_size, true, + gc_pos_btree_root(b->btree_id), + &stats, 0); + + if (old) + bch2_btree_node_free_index(as, NULL, + bkey_i_to_s_c(&old->key), + &stats); + bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res, + gc_pos_btree_root(b->btree_id)); +} + +static void bch2_btree_set_root_ondisk(struct bch_fs *c, struct btree *b) +{ + struct btree_root *r = &c->btree_roots[b->btree_id]; + + mutex_lock(&c->btree_root_lock); + + BUG_ON(b != r->b); + bkey_copy(&r->key, &b->key); + r->level = b->level; + r->alive = true; + + mutex_unlock(&c->btree_root_lock); +} + +/** + * bch_btree_set_root - update the root in memory and on disk + * + * To ensure forward progress, the current task must not be holding any + * btree node write locks. However, you must hold an intent lock on the + * old root. + * + * Note: This allocates a journal entry but doesn't add any keys to + * it. All the btree roots are part of every journal write, so there + * is nothing new to be done. This just guarantees that there is a + * journal write. + */ +static void bch2_btree_set_root(struct btree_update *as, struct btree *b, + struct btree_iter *iter) +{ + struct bch_fs *c = as->c; + struct btree *old; + + trace_btree_set_root(c, b); + BUG_ON(!b->written); + + old = btree_node_root(c, b); + + /* + * Ensure no one is using the old root while we switch to the + * new root: + */ + bch2_btree_node_lock_write(old, iter); + + bch2_btree_set_root_inmem(as, b); + + btree_update_updated_root(as); + + /* + * Unlock old root after new root is visible: + * + * The new root isn't persistent, but that's ok: we still have + * an intent lock on the new root, and any updates that would + * depend on the new root would have to update the new root. 
+ */ + bch2_btree_node_unlock_write(old, iter); +} + +/* Interior node updates: */ + +static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b, + struct btree_iter *iter, + struct bkey_i *insert, + struct btree_node_iter *node_iter) +{ + struct bch_fs *c = as->c; + struct bch_fs_usage stats = { 0 }; + struct bkey_packed *k; + struct bkey tmp; + + BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, b)); + + if (bkey_extent_is_data(&insert->k)) + bch2_mark_key(c, bkey_i_to_s_c(insert), + c->sb.btree_node_size, true, + gc_pos_btree_node(b), &stats, 0); + + while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && + !btree_iter_pos_cmp_packed(b, &insert->k.p, k, false)) + bch2_btree_node_iter_advance(node_iter, b); + + /* + * If we're overwriting, look up pending delete and mark so that gc + * marks it on the pending delete list: + */ + if (k && !bkey_cmp_packed(b, k, &insert->k)) + bch2_btree_node_free_index(as, b, + bkey_disassemble(b, k, &tmp), + &stats); + + bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res, + gc_pos_btree_node(b)); + + bch2_btree_bset_insert_key(iter, b, node_iter, insert); + set_btree_node_dirty(b); + set_btree_node_need_write(b); } /* * Move keys from n1 (original replacement node, now lower node) to n2 (higher * node) */ -static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n1, - struct btree_reserve *reserve, - struct btree_interior_update *as) +static struct btree *__btree_split_node(struct btree_update *as, + struct btree *n1, + struct btree_iter *iter) { - struct bch_fs *c = iter->c; size_t nr_packed = 0, nr_unpacked = 0; struct btree *n2; struct bset *set1, *set2; struct bkey_packed *k, *prev = NULL; - n2 = bch2_btree_node_alloc(c, n1->level, iter->btree_id, as, reserve); + n2 = bch2_btree_node_alloc(as, n1->level); n2->data->max_key = n1->data->max_key; n2->data->format = n1->format; @@ -1540,9 +1243,9 @@ static struct btree *__btree_split_node(struct btree_iter *iter, struct btree *n * nodes that were coalesced, and thus in the middle of a child node post * coalescing: */ -static void btree_split_insert_keys(struct btree_iter *iter, struct btree *b, - struct keylist *keys, - struct btree_reserve *res) +static void btree_split_insert_keys(struct btree_update *as, struct btree *b, + struct btree_iter *iter, + struct keylist *keys) { struct btree_node_iter node_iter; struct bkey_i *k = bch2_keylist_front(keys); @@ -1557,11 +1260,11 @@ static void btree_split_insert_keys(struct btree_iter *iter, struct btree *b, k = bch2_keylist_front(keys); BUG_ON(bch_keylist_u64s(keys) > - bch_btree_keys_u64s_remaining(iter->c, b)); + bch_btree_keys_u64s_remaining(as->c, b)); BUG_ON(bkey_cmp(k->k.p, b->data->min_key) < 0); BUG_ON(bkey_cmp(k->k.p, b->data->max_key) > 0); - bch2_insert_fixup_btree_ptr(iter, b, k, &node_iter, &res->disk_res); + bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter); bch2_keylist_pop_front(keys); } @@ -1588,12 +1291,10 @@ static void btree_split_insert_keys(struct btree_iter *iter, struct btree *b, btree_node_interior_verify(b); } -static void btree_split(struct btree *b, struct btree_iter *iter, - struct keylist *insert_keys, - struct btree_reserve *reserve, - struct btree_interior_update *as) +static void btree_split(struct btree_update *as, struct btree *b, + struct btree_iter *iter, struct keylist *keys) { - struct bch_fs *c = iter->c; + struct bch_fs *c = as->c; struct btree *parent = iter->nodes[b->level + 1]; struct btree *n1, *n2 = NULL, *n3 = NULL; u64 start_time = local_clock(); 
@@ -1601,17 +1302,17 @@ static void btree_split(struct btree *b, struct btree_iter *iter, BUG_ON(!parent && (b != btree_node_root(c, b))); BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level)); - bch2_btree_interior_update_will_free_node(c, as, b); + bch2_btree_interior_update_will_free_node(as, b); - n1 = bch2_btree_node_alloc_replacement(c, b, as, reserve); + n1 = bch2_btree_node_alloc_replacement(as, b); - if (b->level) - btree_split_insert_keys(iter, n1, insert_keys, reserve); + if (keys) + btree_split_insert_keys(as, n1, iter, keys); if (vstruct_blocks(n1->data, c->block_bits) > BTREE_SPLIT_THRESHOLD(c)) { trace_btree_node_split(c, b, b->nr.live_u64s); - n2 = __btree_split_node(iter, n1, reserve, as); + n2 = __btree_split_node(as, n1, iter); bch2_btree_build_aux_trees(n2); bch2_btree_build_aux_trees(n1); @@ -1621,7 +1322,7 @@ static void btree_split(struct btree *b, struct btree_iter *iter, bch2_btree_node_write(c, n2, &as->cl, SIX_LOCK_intent); /* - * Note that on recursive parent_keys == insert_keys, so we + * Note that on recursive parent_keys == keys, so we * can't start adding new keys to parent_keys before emptying it * out (which we did with btree_split_insert_keys() above) */ @@ -1630,15 +1331,12 @@ static void btree_split(struct btree *b, struct btree_iter *iter, if (!parent) { /* Depth increases, make a new root */ - n3 = __btree_root_alloc(c, b->level + 1, - iter->btree_id, - as, reserve); + n3 = __btree_root_alloc(as, b->level + 1); n3->sib_u64s[0] = U16_MAX; n3->sib_u64s[1] = U16_MAX; - btree_split_insert_keys(iter, n3, &as->parent_keys, - reserve); + btree_split_insert_keys(as, n3, iter, &as->parent_keys); bch2_btree_node_write(c, n3, &as->cl, SIX_LOCK_intent); } } else { @@ -1656,13 +1354,12 @@ static void btree_split(struct btree *b, struct btree_iter *iter, if (parent) { /* Split a non root node */ - bch2_btree_insert_node(parent, iter, &as->parent_keys, - reserve, as); + bch2_btree_insert_node(as, parent, iter, &as->parent_keys); } else if (n3) { - bch2_btree_set_root(iter, n3, as, reserve); + bch2_btree_set_root(as, n3, iter); } else { /* Root filled up but didn't need to be split */ - bch2_btree_set_root(iter, n1, as, reserve); + bch2_btree_set_root(as, n1, iter); } bch2_btree_open_bucket_put(c, n1); @@ -1680,7 +1377,7 @@ static void btree_split(struct btree *b, struct btree_iter *iter, * We have to free the node first because the bch2_iter_node_replace() * calls will drop _our_ iterator's reference - and intent lock - to @b. 
*/ - bch2_btree_node_free_inmem(iter, b); + bch2_btree_node_free_inmem(c, b, iter); /* Successful split, update the iterator to point to the new nodes: */ @@ -1693,11 +1390,70 @@ static void btree_split(struct btree *b, struct btree_iter *iter, bch2_time_stats_update(&c->btree_split_time, start_time); } +static int +bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, + struct btree_iter *iter, struct keylist *keys) +{ + struct bch_fs *c = as->c; + struct btree_iter *linked; + struct btree_node_iter node_iter; + struct bkey_i *insert = bch2_keylist_front(keys); + struct bkey_packed *k; + + BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->level)); + BUG_ON(!b->level); + BUG_ON(!as || as->b); + bch2_verify_keylist_sorted(keys); + + bch2_btree_node_lock_for_insert(c, b, iter); + + if (bch_keylist_u64s(keys) > bch_btree_keys_u64s_remaining(c, b)) { + bch2_btree_node_unlock_write(b, iter); + return -1; + } + + /* Don't screw up @iter's position: */ + node_iter = iter->node_iters[b->level]; + + /* + * btree_split(), btree_gc_coalesce() will insert keys before + * the iterator's current position - they know the keys go in + * the node the iterator points to: + */ + while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && + (bkey_cmp_packed(b, k, &insert->k) >= 0)) + ; + + while (!bch2_keylist_empty(keys)) { + insert = bch2_keylist_front(keys); + + bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter); + bch2_keylist_pop_front(keys); + } + + btree_update_updated_node(as, b); + + for_each_linked_btree_node(iter, b, linked) + bch2_btree_node_iter_peek(&linked->node_iters[b->level], + b); + bch2_btree_node_iter_peek(&iter->node_iters[b->level], b); + + bch2_btree_iter_verify(iter, b); + + if (bch2_maybe_compact_whiteouts(c, b)) + bch2_btree_iter_reinit_node(iter, b); + + bch2_btree_node_unlock_write(b, iter); + + btree_node_interior_verify(b); + return 0; +} + /** * bch_btree_insert_node - insert bkeys into a given btree node * * @iter: btree iterator - * @insert_keys: list of keys to insert + * @keys: list of keys to insert * @hook: insert callback * @persistent: if not null, @persistent will wait on journal write * @@ -1705,27 +1461,21 @@ static void btree_split(struct btree *b, struct btree_iter *iter, * If a split occurred, this function will return early. This can only happen * for leaf nodes -- inserts into interior nodes have to be atomic. 
*/ -void bch2_btree_insert_node(struct btree *b, - struct btree_iter *iter, - struct keylist *insert_keys, - struct btree_reserve *reserve, - struct btree_interior_update *as) +void bch2_btree_insert_node(struct btree_update *as, struct btree *b, + struct btree_iter *iter, struct keylist *keys) { BUG_ON(!b->level); - BUG_ON(!reserve || !as); if ((as->flags & BTREE_INTERIOR_UPDATE_MUST_REWRITE) || - bch2_btree_insert_keys_interior(b, iter, insert_keys, - as, reserve)) - btree_split(b, iter, insert_keys, reserve, as); + bch2_btree_insert_keys_interior(as, b, iter, keys)) + btree_split(as, b, iter, keys); } -static int bch2_btree_split_leaf(struct btree_iter *iter, unsigned flags) +int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, + unsigned btree_reserve_flags) { - struct bch_fs *c = iter->c; struct btree *b = iter->nodes[0]; - struct btree_reserve *reserve; - struct btree_interior_update *as; + struct btree_update *as; struct closure cl; int ret = 0; @@ -1735,6 +1485,9 @@ static int bch2_btree_split_leaf(struct btree_iter *iter, unsigned flags) if (!down_read_trylock(&c->gc_lock)) { bch2_btree_iter_unlock(iter); down_read(&c->gc_lock); + + if (btree_iter_linked(iter)) + ret = -EINTR; } /* @@ -1746,9 +1499,11 @@ static int bch2_btree_split_leaf(struct btree_iter *iter, unsigned flags) goto out; } - reserve = bch2_btree_reserve_get(c, b, 0, flags, &cl); - if (IS_ERR(reserve)) { - ret = PTR_ERR(reserve); + as = bch2_btree_update_start(c, iter->btree_id, + btree_update_reserve_required(c, b), + btree_reserve_flags, &cl); + if (IS_ERR(as)) { + ret = PTR_ERR(as); if (ret == -EAGAIN) { bch2_btree_iter_unlock(iter); up_read(&c->gc_lock); @@ -1758,10 +1513,8 @@ static int bch2_btree_split_leaf(struct btree_iter *iter, unsigned flags) goto out; } - as = bch2_btree_interior_update_alloc(c); - - btree_split(b, iter, NULL, reserve, as); - bch2_btree_reserve_put(c, reserve); + btree_split(as, b, iter, NULL); + bch2_btree_update_done(as); bch2_btree_iter_set_locks_want(iter, 1); out: @@ -1769,68 +1522,11 @@ out: return ret; } -enum btree_node_sibling { - btree_prev_sib, - btree_next_sib, -}; - -static struct btree *btree_node_get_sibling(struct btree_iter *iter, - struct btree *b, - enum btree_node_sibling sib) +int bch2_foreground_maybe_merge(struct bch_fs *c, + struct btree_iter *iter, + enum btree_node_sibling sib) { - struct btree *parent; - struct btree_node_iter node_iter; - struct bkey_packed *k; - BKEY_PADDED(k) tmp; - struct btree *ret; - unsigned level = b->level; - - parent = iter->nodes[level + 1]; - if (!parent) - return NULL; - - if (!bch2_btree_node_relock(iter, level + 1)) { - bch2_btree_iter_set_locks_want(iter, level + 2); - return ERR_PTR(-EINTR); - } - - node_iter = iter->node_iters[parent->level]; - - k = bch2_btree_node_iter_peek_all(&node_iter, parent); - BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p)); - - do { - k = sib == btree_prev_sib - ? 
bch2_btree_node_iter_prev_all(&node_iter, parent) - : (bch2_btree_node_iter_advance(&node_iter, parent), - bch2_btree_node_iter_peek_all(&node_iter, parent)); - if (!k) - return NULL; - } while (bkey_deleted(k)); - - bch2_bkey_unpack(parent, &tmp.k, k); - - ret = bch2_btree_node_get(iter, &tmp.k, level, SIX_LOCK_intent); - - if (IS_ERR(ret) && PTR_ERR(ret) == -EINTR) { - btree_node_unlock(iter, level); - ret = bch2_btree_node_get(iter, &tmp.k, level, SIX_LOCK_intent); - } - - if (!IS_ERR(ret) && !bch2_btree_node_relock(iter, level)) { - six_unlock_intent(&ret->lock); - ret = ERR_PTR(-EINTR); - } - - return ret; -} - -static int __foreground_maybe_merge(struct btree_iter *iter, - enum btree_node_sibling sib) -{ - struct bch_fs *c = iter->c; - struct btree_reserve *reserve; - struct btree_interior_update *as; + struct btree_update *as; struct bkey_format_state new_s; struct bkey_format new_f; struct bkey_i delete; @@ -1854,7 +1550,7 @@ retry: return 0; /* XXX: can't be holding read locks */ - m = btree_node_get_sibling(iter, b, sib); + m = bch2_btree_node_get_sibling(c, iter, b, sib); if (IS_ERR(m)) { ret = PTR_ERR(m); goto out; @@ -1912,21 +1608,20 @@ retry: goto out_unlock; } - reserve = bch2_btree_reserve_get(c, b, 0, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE, - &cl); - if (IS_ERR(reserve)) { - ret = PTR_ERR(reserve); + as = bch2_btree_update_start(c, iter->btree_id, + btree_update_reserve_required(c, b), + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE, + &cl); + if (IS_ERR(as)) { + ret = PTR_ERR(as); goto out_unlock; } - as = bch2_btree_interior_update_alloc(c); - - bch2_btree_interior_update_will_free_node(c, as, b); - bch2_btree_interior_update_will_free_node(c, as, m); + bch2_btree_interior_update_will_free_node(as, b); + bch2_btree_interior_update_will_free_node(as, m); - n = bch2_btree_node_alloc(c, b->level, b->btree_id, as, reserve); + n = bch2_btree_node_alloc(as, b->level); n->data->min_key = prev->data->min_key; n->data->max_key = next->data->max_key; @@ -1948,16 +1643,16 @@ retry: bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent); - bch2_btree_insert_node(parent, iter, &as->parent_keys, reserve, as); + bch2_btree_insert_node(as, parent, iter, &as->parent_keys); bch2_btree_open_bucket_put(c, n); - bch2_btree_node_free_inmem(iter, b); - bch2_btree_node_free_inmem(iter, m); + bch2_btree_node_free_inmem(c, b, iter); + bch2_btree_node_free_inmem(c, m, iter); bch2_btree_iter_node_replace(iter, n); bch2_btree_iter_verify(iter, n); - bch2_btree_reserve_put(c, reserve); + bch2_btree_update_done(as); out_unlock: if (ret != -EINTR && ret != -EAGAIN) bch2_btree_iter_set_locks_want(iter, 1); @@ -1980,478 +1675,24 @@ out: return ret; } -static int inline foreground_maybe_merge(struct btree_iter *iter, - enum btree_node_sibling sib) -{ - struct bch_fs *c = iter->c; - struct btree *b; - - if (!btree_node_locked(iter, iter->level)) - return 0; - - b = iter->nodes[iter->level]; - if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) - return 0; - - return __foreground_maybe_merge(iter, sib); -} - -/** - * btree_insert_key - insert a key one key into a leaf node - */ -static enum btree_insert_ret -btree_insert_key(struct btree_insert *trans, - struct btree_insert_entry *insert) -{ - struct bch_fs *c = trans->c; - struct btree_iter *iter = insert->iter; - struct btree *b = iter->nodes[0]; - enum btree_insert_ret ret; - int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); - int old_live_u64s = b->nr.live_u64s; - int live_u64s_added, u64s_added; - - iter->flags &= 
~BTREE_ITER_UPTODATE; - - ret = !btree_node_is_extents(b) - ? bch2_insert_fixup_key(trans, insert) - : bch2_insert_fixup_extent(trans, insert); - - live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; - u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; - - if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) - b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); - if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) - b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); - - if (u64s_added > live_u64s_added && - bch2_maybe_compact_whiteouts(iter->c, b)) - bch2_btree_iter_reinit_node(iter, b); - - trace_btree_insert_key(c, b, insert->k); - return ret; -} - -static bool same_leaf_as_prev(struct btree_insert *trans, - struct btree_insert_entry *i) -{ - /* - * Because we sorted the transaction entries, if multiple iterators - * point to the same leaf node they'll always be adjacent now: - */ - return i != trans->entries && - i[0].iter->nodes[0] == i[-1].iter->nodes[0]; -} - -#define trans_for_each_entry(trans, i) \ - for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++) - -static void multi_lock_write(struct btree_insert *trans) -{ - struct btree_insert_entry *i; - - trans_for_each_entry(trans, i) - if (!same_leaf_as_prev(trans, i)) - btree_node_lock_for_insert(i->iter->nodes[0], i->iter); -} - -static void multi_unlock_write(struct btree_insert *trans) -{ - struct btree_insert_entry *i; - - trans_for_each_entry(trans, i) - if (!same_leaf_as_prev(trans, i)) - bch2_btree_node_unlock_write(i->iter->nodes[0], i->iter); -} - -static int btree_trans_entry_cmp(const void *_l, const void *_r) -{ - const struct btree_insert_entry *l = _l; - const struct btree_insert_entry *r = _r; - - return btree_iter_cmp(l->iter, r->iter); -} - -/* Normal update interface: */ - -/** - * __bch_btree_insert_at - insert keys at given iterator positions - * - * This is main entry point for btree updates. - * - * Return values: - * -EINTR: locking changed, this function should be called again. Only returned - * if passed BTREE_INSERT_ATOMIC. - * -EROFS: filesystem read only - * -EIO: journal or btree node IO error - */ -int __bch2_btree_insert_at(struct btree_insert *trans) -{ - struct bch_fs *c = trans->c; - struct btree_insert_entry *i; - struct btree_iter *split = NULL; - bool cycle_gc_lock = false; - unsigned u64s; - int ret; - - trans_for_each_entry(trans, i) { - BUG_ON(i->iter->level); - BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); - } - - sort(trans->entries, trans->nr, sizeof(trans->entries[0]), - btree_trans_entry_cmp, NULL); - - if (unlikely(!percpu_ref_tryget(&c->writes))) - return -EROFS; -retry_locks: - ret = -EINTR; - trans_for_each_entry(trans, i) - if (!bch2_btree_iter_set_locks_want(i->iter, 1)) - goto err; -retry: - trans->did_work = false; - u64s = 0; - trans_for_each_entry(trans, i) - if (!i->done) - u64s += jset_u64s(i->k->k.u64s + i->extra_res); - - memset(&trans->journal_res, 0, sizeof(trans->journal_res)); - - ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY) - ? 
bch2_journal_res_get(&c->journal, - &trans->journal_res, - u64s, u64s) - : 0; - if (ret) - goto err; - - multi_lock_write(trans); - - u64s = 0; - trans_for_each_entry(trans, i) { - /* Multiple inserts might go to same leaf: */ - if (!same_leaf_as_prev(trans, i)) - u64s = 0; - - /* - * bch2_btree_node_insert_fits() must be called under write lock: - * with only an intent lock, another thread can still call - * bch2_btree_node_write(), converting an unwritten bset to a - * written one - */ - if (!i->done) { - u64s += i->k->k.u64s + i->extra_res; - if (!bch2_btree_node_insert_fits(c, - i->iter->nodes[0], u64s)) { - split = i->iter; - goto unlock; - } - } - } - - ret = 0; - split = NULL; - cycle_gc_lock = false; - - trans_for_each_entry(trans, i) { - if (i->done) - continue; - - switch (btree_insert_key(trans, i)) { - case BTREE_INSERT_OK: - i->done = true; - break; - case BTREE_INSERT_JOURNAL_RES_FULL: - case BTREE_INSERT_NEED_TRAVERSE: - ret = -EINTR; - break; - case BTREE_INSERT_NEED_RESCHED: - ret = -EAGAIN; - break; - case BTREE_INSERT_BTREE_NODE_FULL: - split = i->iter; - break; - case BTREE_INSERT_ENOSPC: - ret = -ENOSPC; - break; - case BTREE_INSERT_NEED_GC_LOCK: - cycle_gc_lock = true; - ret = -EINTR; - break; - default: - BUG(); - } - - if (!trans->did_work && (ret || split)) - break; - } -unlock: - multi_unlock_write(trans); - bch2_journal_res_put(&c->journal, &trans->journal_res); - - if (split) - goto split; - if (ret) - goto err; - - /* - * hack: iterators are inconsistent when they hit end of leaf, until - * traversed again - */ - trans_for_each_entry(trans, i) - if (i->iter->flags & BTREE_ITER_AT_END_OF_LEAF) - goto out; - - trans_for_each_entry(trans, i) - if (!same_leaf_as_prev(trans, i)) { - foreground_maybe_merge(i->iter, btree_prev_sib); - foreground_maybe_merge(i->iter, btree_next_sib); - } -out: - /* make sure we didn't lose an error: */ - if (!ret && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) - trans_for_each_entry(trans, i) - BUG_ON(!i->done); - - percpu_ref_put(&c->writes); - return ret; -split: - /* - * have to drop journal res before splitting, because splitting means - * allocating new btree nodes, and holding a journal reservation - * potentially blocks the allocator: - */ - ret = bch2_btree_split_leaf(split, trans->flags); - if (ret) - goto err; - /* - * if the split didn't have to drop locks the insert will still be - * atomic (in the BTREE_INSERT_ATOMIC sense, what the caller peeked() - * and is overwriting won't have changed) - */ - goto retry_locks; -err: - if (cycle_gc_lock) { - down_read(&c->gc_lock); - up_read(&c->gc_lock); - } - - if (ret == -EINTR) { - trans_for_each_entry(trans, i) { - int ret2 = bch2_btree_iter_traverse(i->iter); - if (ret2) { - ret = ret2; - goto out; - } - } - - /* - * BTREE_ITER_ATOMIC means we have to return -EINTR if we - * dropped locks: - */ - if (!(trans->flags & BTREE_INSERT_ATOMIC)) - goto retry; - } - - goto out; -} - -int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags) -{ - struct bkey_i k; - - bkey_init(&k.k); - k.k.p = iter->pos; - - return bch2_btree_insert_at(iter->c, NULL, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE|flags, - BTREE_INSERT_ENTRY(iter, &k)); -} - -int bch2_btree_insert_list_at(struct btree_iter *iter, - struct keylist *keys, - struct disk_reservation *disk_res, - struct extent_insert_hook *hook, - u64 *journal_seq, unsigned flags) -{ - BUG_ON(flags & BTREE_INSERT_ATOMIC); - BUG_ON(bch2_keylist_empty(keys)); - verify_keys_sorted(keys); - - while (!bch2_keylist_empty(keys)) { - /* 
need to traverse between each insert */ - int ret = bch2_btree_iter_traverse(iter); - if (ret) - return ret; - - ret = bch2_btree_insert_at(iter->c, disk_res, hook, - journal_seq, flags, - BTREE_INSERT_ENTRY(iter, bch2_keylist_front(keys))); - if (ret) - return ret; - - bch2_keylist_pop_front(keys); - } - - return 0; -} - -/** - * bch_btree_insert - insert keys into the extent btree - * @c: pointer to struct bch_fs - * @id: btree to insert into - * @insert_keys: list of keys to insert - * @hook: insert callback - */ -int bch2_btree_insert(struct bch_fs *c, enum btree_id id, - struct bkey_i *k, - struct disk_reservation *disk_res, - struct extent_insert_hook *hook, - u64 *journal_seq, int flags) -{ - struct btree_iter iter; - int ret, ret2; - - bch2_btree_iter_init(&iter, c, id, bkey_start_pos(&k->k), - BTREE_ITER_INTENT); - - ret = bch2_btree_iter_traverse(&iter); - if (unlikely(ret)) - goto out; - - ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, flags, - BTREE_INSERT_ENTRY(&iter, k)); -out: ret2 = bch2_btree_iter_unlock(&iter); - - return ret ?: ret2; -} - -/** - * bch_btree_update - like bch2_btree_insert(), but asserts that we're - * overwriting an existing key - */ -int bch2_btree_update(struct bch_fs *c, enum btree_id id, - struct bkey_i *k, u64 *journal_seq) -{ - struct btree_iter iter; - struct bkey_s_c u; - int ret; - - EBUG_ON(id == BTREE_ID_EXTENTS); - - bch2_btree_iter_init(&iter, c, id, k->k.p, - BTREE_ITER_INTENT); - - u = bch2_btree_iter_peek_with_holes(&iter); - ret = btree_iter_err(u); - if (ret) - return ret; - - if (bkey_deleted(u.k)) { - bch2_btree_iter_unlock(&iter); - return -ENOENT; - } - - ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, 0, - BTREE_INSERT_ENTRY(&iter, k)); - bch2_btree_iter_unlock(&iter); - return ret; -} - -/* - * bch_btree_delete_range - delete everything within a given range - * - * Range is a half open interval - [start, end) - */ -int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, - struct bpos start, - struct bpos end, - struct bversion version, - struct disk_reservation *disk_res, - struct extent_insert_hook *hook, - u64 *journal_seq) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - bch2_btree_iter_init(&iter, c, id, start, - BTREE_ITER_INTENT); - - while ((k = bch2_btree_iter_peek(&iter)).k && - !(ret = btree_iter_err(k))) { - unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); - /* really shouldn't be using a bare, unpadded bkey_i */ - struct bkey_i delete; - - if (bkey_cmp(iter.pos, end) >= 0) - break; - - bkey_init(&delete.k); - - /* - * For extents, iter.pos won't necessarily be the same as - * bkey_start_pos(k.k) (for non extents they always will be the - * same). It's important that we delete starting from iter.pos - * because the range we want to delete could start in the middle - * of k. - * - * (bch2_btree_iter_peek() does guarantee that iter.pos >= - * bkey_start_pos(k.k)). - */ - delete.k.p = iter.pos; - delete.k.version = version; - - if (iter.flags & BTREE_ITER_IS_EXTENTS) { - /* - * The extents btree is special - KEY_TYPE_DISCARD is - * used for deletions, not KEY_TYPE_DELETED. 
This is an - * internal implementation detail that probably - * shouldn't be exposed (internally, KEY_TYPE_DELETED is - * used as a proxy for k->size == 0): - */ - delete.k.type = KEY_TYPE_DISCARD; - - /* create the biggest key we can */ - bch2_key_resize(&delete.k, max_sectors); - bch2_cut_back(end, &delete.k); - } - - ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, - BTREE_INSERT_NOFAIL, - BTREE_INSERT_ENTRY(&iter, &delete)); - if (ret) - break; - - bch2_btree_iter_cond_resched(&iter); - } - - bch2_btree_iter_unlock(&iter); - return ret; -} - static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, struct btree *b, unsigned flags, struct closure *cl) { struct btree *n, *parent = iter->nodes[b->level + 1]; - struct btree_reserve *reserve; - struct btree_interior_update *as; + struct btree_update *as; - reserve = bch2_btree_reserve_get(c, b, 0, flags, cl); - if (IS_ERR(reserve)) { + as = bch2_btree_update_start(c, iter->btree_id, + btree_update_reserve_required(c, b), + flags, cl); + if (IS_ERR(as)) { trace_btree_gc_rewrite_node_fail(c, b); - return PTR_ERR(reserve); + return PTR_ERR(as); } - as = bch2_btree_interior_update_alloc(c); - - bch2_btree_interior_update_will_free_node(c, as, b); + bch2_btree_interior_update_will_free_node(as, b); - n = bch2_btree_node_alloc_replacement(c, b, as, reserve); + n = bch2_btree_node_alloc_replacement(as, b); bch2_btree_build_aux_trees(n); six_unlock_write(&n->lock); @@ -2461,20 +1702,19 @@ static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent); if (parent) { - bch2_btree_insert_node(parent, iter, - &keylist_single(&n->key), - reserve, as); + bch2_btree_insert_node(as, parent, iter, + &keylist_single(&n->key)); } else { - bch2_btree_set_root(iter, n, as, reserve); + bch2_btree_set_root(as, n, iter); } bch2_btree_open_bucket_put(c, n); - bch2_btree_node_free_inmem(iter, b); + bch2_btree_node_free_inmem(c, b, iter); BUG_ON(!bch2_btree_iter_node_replace(iter, n)); - bch2_btree_reserve_put(c, reserve); + bch2_btree_update_done(as); return 0; } @@ -2535,8 +1775,7 @@ int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, int bch2_btree_node_update_key(struct bch_fs *c, struct btree *b, struct bkey_i_extent *new_key) { - struct btree_interior_update *as; - struct btree_reserve *reserve = NULL; + struct btree_update *as = NULL; struct btree *parent, *new_hash = NULL; struct btree_iter iter; struct closure cl; @@ -2548,7 +1787,19 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree *b, b->level, 0); closure_init_stack(&cl); - if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) { + ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE); + if (ret) + return ret; + +retry: + down_read(&c->gc_lock); + ret = bch2_btree_iter_traverse(&iter); + if (ret) + goto err; + + /* check PTR_HASH() after @b is locked by btree_iter_traverse(): */ + if (!new_hash && + PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) { /* bch2_btree_reserve_get will unlock */ do { ret = bch2_btree_node_cannibalize_lock(c, &cl); @@ -2559,26 +1810,24 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree *b, new_hash = bch2_btree_node_mem_alloc(c); } -retry: - reserve = bch2_btree_reserve_get(c, b, 0, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE, - &cl); - closure_sync(&cl); - if (IS_ERR(reserve)) { - ret = PTR_ERR(reserve); - if (ret == -EAGAIN || ret == -EINTR) + + as = bch2_btree_update_start(c, 
iter.btree_id, + btree_update_reserve_required(c, b), + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE| + BTREE_INSERT_USE_ALLOC_RESERVE, + &cl); + if (IS_ERR(as)) { + ret = PTR_ERR(as); + if (ret == -EAGAIN || ret == -EINTR) { + bch2_btree_iter_unlock(&iter); + up_read(&c->gc_lock); + closure_sync(&cl); goto retry; + } goto err; } - down_read(&c->gc_lock); - - ret = bch2_btree_iter_traverse(&iter); - if (ret) - goto err; - mutex_lock(&c->btree_interior_update_lock); /* @@ -2615,50 +1864,61 @@ retry: mutex_unlock(&c->btree_interior_update_lock); - ret = bch2_check_mark_super(c, extent_i_to_s_c(new_key), BCH_DATA_BTREE); - if (ret) - goto err; - - as = bch2_btree_interior_update_alloc(c); - if (must_rewrite_parent) as->flags |= BTREE_INTERIOR_UPDATE_MUST_REWRITE; - bch2_btree_interior_update_add_node_reference(c, as, b); - - if (new_hash) { - bkey_copy(&new_hash->key, &new_key->k_i); - BUG_ON(bch2_btree_node_hash_insert(c, new_hash, - b->level, b->btree_id)); - } + btree_interior_update_add_node_reference(as, b); parent = iter.nodes[b->level + 1]; if (parent) { - bch2_btree_insert_node(parent, &iter, - &keylist_single(&b->key), - reserve, as); - } else { - bch2_btree_set_root(&iter, b, as, reserve); - } + if (new_hash) { + bkey_copy(&new_hash->key, &new_key->k_i); + BUG_ON(bch2_btree_node_hash_insert(c, new_hash, + b->level, b->btree_id)); + } - if (new_hash) { - mutex_lock(&c->btree_cache_lock); - bch2_btree_node_hash_remove(c, b); + bch2_btree_insert_node(as, parent, &iter, + &keylist_single(&new_key->k_i)); - bkey_copy(&b->key, &new_key->k_i); - __bch2_btree_node_hash_insert(c, b); + if (new_hash) { + mutex_lock(&c->btree_cache_lock); + bch2_btree_node_hash_remove(c, b); - bch2_btree_node_hash_remove(c, new_hash); - mutex_unlock(&c->btree_cache_lock); + bkey_copy(&b->key, &new_key->k_i); + __bch2_btree_node_hash_insert(c, b); + + bch2_btree_node_hash_remove(c, new_hash); + mutex_unlock(&c->btree_cache_lock); + } else { + bkey_copy(&b->key, &new_key->k_i); + } } else { + struct bch_fs_usage stats = { 0 }; + + BUG_ON(btree_node_root(c, b) != b); + + bch2_btree_node_lock_write(b, &iter); + + bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i), + c->sb.btree_node_size, true, + gc_pos_btree_root(b->btree_id), + &stats, 0); + bch2_btree_node_free_index(as, NULL, + bkey_i_to_s_c(&b->key), + &stats); + bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res, + gc_pos_btree_root(b->btree_id)); bkey_copy(&b->key, &new_key->k_i); + + btree_update_updated_root(as); + bch2_btree_node_unlock_write(b, &iter); } -err: - if (!IS_ERR_OR_NULL(reserve)) - bch2_btree_reserve_put(c, reserve); + + bch2_btree_update_done(as); +out: if (new_hash) { mutex_lock(&c->btree_cache_lock); - list_move(&b->list, &c->btree_cache_freeable); + list_move(&new_hash->list, &c->btree_cache_freeable); mutex_unlock(&c->btree_cache_lock); six_unlock_write(&new_hash->lock); @@ -2667,4 +1927,62 @@ err: bch2_btree_iter_unlock(&iter); up_read(&c->gc_lock); return ret; +err: + if (as) + bch2_btree_update_free(as); + goto out; +} + +/* Init code: */ + +/* + * Only for filesystem bringup, when first reading the btree roots or allocating + * btree roots when initializing a new filesystem: + */ +void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) +{ + BUG_ON(btree_node_root(c, b)); + + __bch2_btree_set_root_inmem(c, b); + bch2_btree_set_root_ondisk(c, b); +} + +int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id, + struct closure *writes) +{ + struct btree_update *as; + struct closure cl; + struct btree *b; + + 
memset(&as, 0, sizeof(as)); + closure_init_stack(&cl); + + while (1) { + /* XXX haven't calculated capacity yet :/ */ + as = bch2_btree_update_start(c, id, 1, 0, &cl); + if (!IS_ERR(as)) + break; + + if (PTR_ERR(as) == -ENOSPC) + return PTR_ERR(as); + + closure_sync(&cl); + } + + b = __btree_root_alloc(as, 0); + + bch2_btree_node_write(c, b, writes, SIX_LOCK_intent); + btree_update_drop_new_node(c, b); + + BUG_ON(btree_node_root(c, b)); + + bch2_btree_set_root_inmem(as, b); + bch2_btree_set_root_ondisk(c, b); + + bch2_btree_open_bucket_put(c, b); + six_unlock_intent(&b->lock); + + bch2_btree_update_free(as); + + return 0; } diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h new file mode 100644 index 0000000..b1fa06c --- /dev/null +++ b/libbcachefs/btree_update_interior.h @@ -0,0 +1,312 @@ +#ifndef _BCACHE_BTREE_UPDATE_INTERIOR_H +#define _BCACHE_BTREE_UPDATE_INTERIOR_H + +#include "btree_cache.h" +#include "btree_update.h" + +struct btree_reserve { + struct disk_reservation disk_res; + unsigned nr; + struct btree *b[BTREE_RESERVE_MAX]; +}; + +void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *); +bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *, + struct bkey_format *); + +/* Btree node freeing/allocation: */ + +/* + * Tracks a btree node that has been (or is about to be) freed in memory, but + * has _not_ yet been freed on disk (because the write that makes the new + * node(s) visible and frees the old hasn't completed yet) + */ +struct pending_btree_node_free { + bool index_update_done; + + __le64 seq; + enum btree_id btree_id; + unsigned level; + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); +}; + +/* + * Tracks an in progress split/rewrite of a btree node and the update to the + * parent node: + * + * When we split/rewrite a node, we do all the updates in memory without + * waiting for any writes to complete - we allocate the new node(s) and update + * the parent node, possibly recursively up to the root. + * + * The end result is that we have one or more new nodes being written - + * possibly several, if there were multiple splits - and then a write (updating + * an interior node) which will make all these new nodes visible. + * + * Additionally, as we split/rewrite nodes we free the old nodes - but the old + * nodes can't be freed (their space on disk can't be reclaimed) until the + * update to the interior node that makes the new node visible completes - + * until then, the old nodes are still reachable on disk. + * + */ +struct btree_update { + struct closure cl; + struct bch_fs *c; + + struct list_head list; + + /* What kind of update are we doing? */ + enum { + BTREE_INTERIOR_NO_UPDATE, + BTREE_INTERIOR_UPDATING_NODE, + BTREE_INTERIOR_UPDATING_ROOT, + BTREE_INTERIOR_UPDATING_AS, + } mode; + enum btree_id btree_id; + + unsigned flags; + struct btree_reserve *reserve; + + /* + * BTREE_INTERIOR_UPDATING_NODE: + * The update that made the new nodes visible was a regular update to an + * existing interior node - @b. 
We can't write out the update to @b + * until the new nodes we created are finished writing, so we block @b + * from writing by putting this btree_interior update on the + * @b->write_blocked list with @write_blocked_list: + */ + struct btree *b; + struct list_head write_blocked_list; + + /* + * BTREE_INTERIOR_UPDATING_AS: btree node we updated was freed, so now + * we're now blocking another btree_update + * @parent_as - btree_update that's waiting on our nodes to finish + * writing, before it can make new nodes visible on disk + * @wait - list of child btree_updates that are waiting on this + * btree_update to make all the new nodes visible before they can free + * their old btree nodes + */ + struct btree_update *parent_as; + struct closure_waitlist wait; + + /* + * We may be freeing nodes that were dirty, and thus had journal entries + * pinned: we need to transfer the oldest of those pins to the + * btree_update operation, and release it when the new node(s) + * are all persistent and reachable: + */ + struct journal_entry_pin journal; + + u64 journal_seq; + + /* + * Nodes being freed: + * Protected by c->btree_node_pending_free_lock + */ + struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES]; + unsigned nr_pending; + + /* New nodes, that will be made reachable by this update: */ + struct btree *new_nodes[BTREE_MAX_DEPTH * 2 + GC_MERGE_NODES]; + unsigned nr_new_nodes; + + /* Only here to reduce stack usage on recursive splits: */ + struct keylist parent_keys; + /* + * Enough room for btree_split's keys without realloc - btree node + * pointers never have crc/compression info, so we only need to acount + * for the pointers for three keys + */ + u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; +}; + +#define BTREE_INTERIOR_UPDATE_MUST_REWRITE (1 << 0) + +#define for_each_pending_btree_node_free(c, as, p) \ + list_for_each_entry(as, &c->btree_interior_update_list, list) \ + for (p = as->pending; p < as->pending + as->nr_pending; p++) + +void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *, + struct btree_iter *); +void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *); +void bch2_btree_open_bucket_put(struct bch_fs *, struct btree *); + +struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, + struct btree *, + struct bkey_format); + +void bch2_btree_update_done(struct btree_update *); +struct btree_update * +bch2_btree_update_start(struct bch_fs *, enum btree_id, unsigned, + unsigned, struct closure *); + +void bch2_btree_interior_update_will_free_node(struct btree_update *, + struct btree *); + +void bch2_btree_insert_node(struct btree_update *, struct btree *, + struct btree_iter *, struct keylist *); +int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned); +int bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, + enum btree_node_sibling); + +void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); +int bch2_btree_root_alloc(struct bch_fs *, enum btree_id, struct closure *); + +static inline unsigned btree_update_reserve_required(struct bch_fs *c, + struct btree *b) +{ + unsigned depth = btree_node_root(c, b)->level - b->level; + + return btree_reserve_required_nodes(depth); +} + +static inline void btree_node_reset_sib_u64s(struct btree *b) +{ + b->sib_u64s[0] = b->nr.live_u64s; + b->sib_u64s[1] = b->nr.live_u64s; +} + +static inline void *btree_data_end(struct bch_fs *c, struct btree *b) +{ + return (void *) b->data + btree_bytes(c); +} + +static inline struct bkey_packed 
*unwritten_whiteouts_start(struct bch_fs *c, + struct btree *b) +{ + return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s); +} + +static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c, + struct btree *b) +{ + return btree_data_end(c, b); +} + +static inline void *write_block(struct btree *b) +{ + return (void *) b->data + (b->written << 9); +} + +static inline bool bset_written(struct btree *b, struct bset *i) +{ + return (void *) i < write_block(b); +} + +static inline bool bset_unwritten(struct btree *b, struct bset *i) +{ + return (void *) i > write_block(b); +} + +static inline unsigned bset_end_sector(struct bch_fs *c, struct btree *b, + struct bset *i) +{ + return round_up(bset_byte_offset(b, vstruct_end(i)), + block_bytes(c)) >> 9; +} + +static inline unsigned btree_write_set_buffer(struct btree *b) +{ + /* + * Could buffer up larger amounts of keys for btrees with larger keys, + * pending benchmarking: + */ + return 4 << 10; +} + +static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, + struct btree *b) +{ + struct bset *i = btree_bset_last(b); + unsigned offset = max_t(unsigned, b->written << 9, + bset_byte_offset(b, vstruct_end(i))); + ssize_t n = (ssize_t) btree_bytes(c) - (ssize_t) + (offset + sizeof(struct btree_node_entry) + + b->whiteout_u64s * sizeof(u64) + + b->uncompacted_whiteout_u64s * sizeof(u64)); + + EBUG_ON(offset > btree_bytes(c)); + + if ((unlikely(bset_written(b, i)) && n > 0) || + (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) && + n > btree_write_set_buffer(b))) + return (void *) b->data + offset; + + return NULL; +} + +static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t, + struct bkey_packed *k) +{ + if (bset_written(b, bset(b, t))) { + EBUG_ON(b->uncompacted_whiteout_u64s < + bkeyp_key_u64s(&b->format, k)); + b->uncompacted_whiteout_u64s -= + bkeyp_key_u64s(&b->format, k); + } +} + +static inline void reserve_whiteout(struct btree *b, struct bset_tree *t, + struct bkey_packed *k) +{ + if (bset_written(b, bset(b, t))) { + BUG_ON(!k->needs_whiteout); + b->uncompacted_whiteout_u64s += + bkeyp_key_u64s(&b->format, k); + } +} + +static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, + struct btree *b) +{ + struct bset *i = btree_bset_last(b); + unsigned used = bset_byte_offset(b, vstruct_end(i)) / sizeof(u64) + + b->whiteout_u64s + + b->uncompacted_whiteout_u64s; + unsigned total = c->sb.btree_node_size << 6; + + EBUG_ON(used > total); + + if (bset_written(b, i)) + return 0; + + return total - used; +} + +/* + * write lock must be held on @b (else the dirty bset that we were going to + * insert into could be written out from under us) + */ +static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, + struct btree *b, unsigned u64s) +{ + if (btree_node_is_extents(b)) { + /* The insert key might split an existing key + * (bch2_insert_fixup_extent() -> BCH_EXTENT_OVERLAP_MIDDLE case: + */ + u64s += BKEY_EXTENT_U64s_MAX; + } + + return u64s <= bch_btree_keys_u64s_remaining(c, b); +} + +static inline bool journal_res_insert_fits(struct btree_insert *trans, + struct btree_insert_entry *insert) +{ + unsigned u64s = 0; + struct btree_insert_entry *i; + + /* + * If we didn't get a journal reservation, we're in journal replay and + * we're not journalling updates: + */ + if (!trans->journal_res.ref) + return true; + + for (i = insert; i < trans->entries + trans->nr; i++) + u64s += jset_u64s(i->k->k.u64s + i->extra_res); + + return u64s <= trans->journal_res.u64s; +} + 
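/*
 * Usage sketch (illustrative only, condensed from the __btree_node_rewrite()
 * path elsewhere in this patch): the typical lifecycle of a btree_update is
 * start -> mark the old node as freed -> allocate and write the replacement ->
 * make it reachable via the parent (or as the new root) -> done.
 * example_rewrite_node() is a hypothetical name; error handling, tracing and
 * retry-on--EAGAIN are omitted.
 */
static int example_rewrite_node(struct bch_fs *c, struct btree_iter *iter,
				struct btree *b, unsigned flags,
				struct closure *cl)
{
	struct btree *n, *parent = iter->nodes[b->level + 1];
	struct btree_update *as;

	/* Allocate the update, reserving enough new btree nodes up front: */
	as = bch2_btree_update_start(c, iter->btree_id,
				     btree_update_reserve_required(c, b),
				     flags, cl);
	if (IS_ERR(as))
		return PTR_ERR(as);

	/* @b's space on disk can't be reclaimed until the update is visible: */
	bch2_btree_interior_update_will_free_node(as, b);

	n = bch2_btree_node_alloc_replacement(as, b);
	bch2_btree_build_aux_trees(n);
	six_unlock_write(&n->lock);

	bch2_btree_node_write(c, n, &as->cl, SIX_LOCK_intent);

	/* Make the new node reachable - via the parent, or as the new root: */
	if (parent)
		bch2_btree_insert_node(as, parent, iter,
				       &keylist_single(&n->key));
	else
		bch2_btree_set_root(as, n, iter);

	bch2_btree_open_bucket_put(c, n);
	bch2_btree_node_free_inmem(c, b, iter);
	BUG_ON(!bch2_btree_iter_node_replace(iter, n));

	bch2_btree_update_done(as);
	return 0;
}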
+#endif /* _BCACHE_BTREE_UPDATE_INTERIOR_H */ diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c new file mode 100644 index 0000000..6c490dd --- /dev/null +++ b/libbcachefs/btree_update_leaf.c @@ -0,0 +1,660 @@ + +#include "bcachefs.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "btree_locking.h" +#include "debug.h" +#include "extents.h" +#include "journal.h" +#include "keylist.h" + +#include +#include + +/* Inserting into a given leaf node (last stage of insert): */ + +/* Handle overwrites and do insert, for non extents: */ +bool bch2_btree_bset_insert_key(struct btree_iter *iter, + struct btree *b, + struct btree_node_iter *node_iter, + struct bkey_i *insert) +{ + const struct bkey_format *f = &b->format; + struct bkey_packed *k; + struct bset_tree *t; + unsigned clobber_u64s; + + EBUG_ON(btree_node_just_written(b)); + EBUG_ON(bset_written(b, btree_bset_last(b))); + EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); + EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 || + bkey_cmp(insert->k.p, b->data->max_key) > 0); + + k = bch2_btree_node_iter_peek_all(node_iter, b); + if (k && !bkey_cmp_packed(b, k, &insert->k)) { + BUG_ON(bkey_whiteout(k)); + + t = bch2_bkey_to_bset(b, k); + + if (bset_unwritten(b, bset(b, t)) && + bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) { + BUG_ON(bkey_whiteout(k) != bkey_whiteout(&insert->k)); + + k->type = insert->k.type; + memcpy_u64s(bkeyp_val(f, k), &insert->v, + bkey_val_u64s(&insert->k)); + return true; + } + + insert->k.needs_whiteout = k->needs_whiteout; + + btree_keys_account_key_drop(&b->nr, t - b->set, k); + + if (t == bset_tree_last(b)) { + clobber_u64s = k->u64s; + + /* + * If we're deleting, and the key we're deleting doesn't + * need a whiteout (it wasn't overwriting a key that had + * been written to disk) - just delete it: + */ + if (bkey_whiteout(&insert->k) && !k->needs_whiteout) { + bch2_bset_delete(b, k, clobber_u64s); + bch2_btree_node_iter_fix(iter, b, node_iter, t, + k, clobber_u64s, 0); + return true; + } + + goto overwrite; + } + + k->type = KEY_TYPE_DELETED; + bch2_btree_node_iter_fix(iter, b, node_iter, t, k, + k->u64s, k->u64s); + + if (bkey_whiteout(&insert->k)) { + reserve_whiteout(b, t, k); + return true; + } else { + k->needs_whiteout = false; + } + } else { + /* + * Deleting, but the key to delete wasn't found - nothing to do: + */ + if (bkey_whiteout(&insert->k)) + return false; + + insert->k.needs_whiteout = false; + } + + t = bset_tree_last(b); + k = bch2_btree_node_iter_bset_pos(node_iter, b, t); + clobber_u64s = 0; +overwrite: + bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); + if (k->u64s != clobber_u64s || bkey_whiteout(&insert->k)) + bch2_btree_node_iter_fix(iter, b, node_iter, t, k, + clobber_u64s, k->u64s); + return true; +} + +static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, + unsigned i, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct btree_write *w = container_of(pin, struct btree_write, journal); + struct btree *b = container_of(w, struct btree, writes[i]); + + six_lock_read(&b->lock); + bch2_btree_node_write_dirty(c, b, NULL, + (btree_current_write(b) == w && + w->journal.pin_list == journal_seq_pin(j, seq))); + six_unlock_read(&b->lock); +} + +static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) +{ + return __btree_node_flush(j, pin, 0, seq); +} + +static void 
btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) +{ + return __btree_node_flush(j, pin, 1, seq); +} + +void bch2_btree_journal_key(struct btree_insert *trans, + struct btree_iter *iter, + struct bkey_i *insert) +{ + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; + struct btree *b = iter->nodes[0]; + struct btree_write *w = btree_current_write(b); + + EBUG_ON(iter->level || b->level); + EBUG_ON(trans->journal_res.ref != + !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); + + if (!journal_pin_active(&w->journal)) + bch2_journal_pin_add(j, &trans->journal_res, + &w->journal, + btree_node_write_idx(b) == 0 + ? btree_node_flush0 + : btree_node_flush1); + + if (trans->journal_res.ref) { + u64 seq = trans->journal_res.seq; + bool needs_whiteout = insert->k.needs_whiteout; + + /* ick */ + insert->k.needs_whiteout = false; + bch2_journal_add_keys(j, &trans->journal_res, + b->btree_id, insert); + insert->k.needs_whiteout = needs_whiteout; + + bch2_journal_set_has_inode(j, &trans->journal_res, + insert->k.p.inode); + + if (trans->journal_seq) + *trans->journal_seq = seq; + btree_bset_last(b)->journal_seq = cpu_to_le64(seq); + } + + if (!btree_node_dirty(b)) + set_btree_node_dirty(b); +} + +static enum btree_insert_ret +bch2_insert_fixup_key(struct btree_insert *trans, + struct btree_insert_entry *insert) +{ + struct btree_iter *iter = insert->iter; + + BUG_ON(iter->level); + BUG_ON(insert->k->k.u64s > + bch_btree_keys_u64s_remaining(trans->c, iter->nodes[0])); + + if (bch2_btree_bset_insert_key(iter, iter->nodes[0], + &iter->node_iters[0], + insert->k)) + bch2_btree_journal_key(trans, iter, insert->k); + + trans->did_work = true; + return BTREE_INSERT_OK; +} + +static int inline foreground_maybe_merge(struct bch_fs *c, + struct btree_iter *iter, + enum btree_node_sibling sib) +{ + struct btree *b; + + if (!btree_node_locked(iter, iter->level)) + return 0; + + b = iter->nodes[iter->level]; + if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) + return 0; + + return bch2_foreground_maybe_merge(c, iter, sib); +} + +/** + * btree_insert_key - insert a key one key into a leaf node + */ +static enum btree_insert_ret +btree_insert_key(struct btree_insert *trans, + struct btree_insert_entry *insert) +{ + struct bch_fs *c = trans->c; + struct btree_iter *iter = insert->iter; + struct btree *b = iter->nodes[0]; + enum btree_insert_ret ret; + int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, u64s_added; + + iter->flags &= ~BTREE_ITER_UPTODATE; + + ret = !btree_node_is_extents(b) + ? 
bch2_insert_fixup_key(trans, insert) + : bch2_insert_fixup_extent(trans, insert); + + live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; + u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; + + if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) + b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); + if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) + b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); + + if (u64s_added > live_u64s_added && + bch2_maybe_compact_whiteouts(c, b)) + bch2_btree_iter_reinit_node(iter, b); + + trace_btree_insert_key(c, b, insert->k); + return ret; +} + +static bool same_leaf_as_prev(struct btree_insert *trans, + struct btree_insert_entry *i) +{ + /* + * Because we sorted the transaction entries, if multiple iterators + * point to the same leaf node they'll always be adjacent now: + */ + return i != trans->entries && + i[0].iter->nodes[0] == i[-1].iter->nodes[0]; +} + +#define trans_for_each_entry(trans, i) \ + for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++) + +inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, + struct btree_iter *iter) +{ + bch2_btree_node_lock_write(b, iter); + + if (btree_node_just_written(b) && + bch2_btree_post_write_cleanup(c, b)) + bch2_btree_iter_reinit_node(iter, b); + + /* + * If the last bset has been written, or if it's gotten too big - start + * a new bset to insert into: + */ + if (want_new_bset(c, b)) + bch2_btree_init_next(c, b, iter); +} + +static void multi_lock_write(struct bch_fs *c, struct btree_insert *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_entry(trans, i) + if (!same_leaf_as_prev(trans, i)) + bch2_btree_node_lock_for_insert(c, i->iter->nodes[0], i->iter); +} + +static void multi_unlock_write(struct btree_insert *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_entry(trans, i) + if (!same_leaf_as_prev(trans, i)) + bch2_btree_node_unlock_write(i->iter->nodes[0], i->iter); +} + +static int btree_trans_entry_cmp(const void *_l, const void *_r) +{ + const struct btree_insert_entry *l = _l; + const struct btree_insert_entry *r = _r; + + return btree_iter_cmp(l->iter, r->iter); +} + +/* Normal update interface: */ + +/** + * __bch_btree_insert_at - insert keys at given iterator positions + * + * This is main entry point for btree updates. + * + * Return values: + * -EINTR: locking changed, this function should be called again. Only returned + * if passed BTREE_INSERT_ATOMIC. 
+ * -EROFS: filesystem read only + * -EIO: journal or btree node IO error + */ +int __bch2_btree_insert_at(struct btree_insert *trans) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + struct btree_iter *split = NULL; + bool cycle_gc_lock = false; + unsigned u64s; + int ret; + + trans_for_each_entry(trans, i) { + BUG_ON(i->iter->level); + BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); + BUG_ON(debug_check_bkeys(c) && + bch2_bkey_invalid(c, i->iter->btree_id, + bkey_i_to_s_c(i->k))); + } + + sort(trans->entries, trans->nr, sizeof(trans->entries[0]), + btree_trans_entry_cmp, NULL); + + if (unlikely(!percpu_ref_tryget(&c->writes))) + return -EROFS; +retry_locks: + ret = -EINTR; + trans_for_each_entry(trans, i) + if (!bch2_btree_iter_set_locks_want(i->iter, 1)) + goto err; +retry: + trans->did_work = false; + u64s = 0; + trans_for_each_entry(trans, i) + if (!i->done) + u64s += jset_u64s(i->k->k.u64s + i->extra_res); + + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + + ret = !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY) + ? bch2_journal_res_get(&c->journal, + &trans->journal_res, + u64s, u64s) + : 0; + if (ret) + goto err; + + multi_lock_write(c, trans); + + u64s = 0; + trans_for_each_entry(trans, i) { + /* Multiple inserts might go to same leaf: */ + if (!same_leaf_as_prev(trans, i)) + u64s = 0; + + /* + * bch2_btree_node_insert_fits() must be called under write lock: + * with only an intent lock, another thread can still call + * bch2_btree_node_write(), converting an unwritten bset to a + * written one + */ + if (!i->done) { + u64s += i->k->k.u64s + i->extra_res; + if (!bch2_btree_node_insert_fits(c, + i->iter->nodes[0], u64s)) { + split = i->iter; + goto unlock; + } + } + } + + ret = 0; + split = NULL; + cycle_gc_lock = false; + + trans_for_each_entry(trans, i) { + if (i->done) + continue; + + switch (btree_insert_key(trans, i)) { + case BTREE_INSERT_OK: + i->done = true; + break; + case BTREE_INSERT_JOURNAL_RES_FULL: + case BTREE_INSERT_NEED_TRAVERSE: + ret = -EINTR; + break; + case BTREE_INSERT_NEED_RESCHED: + ret = -EAGAIN; + break; + case BTREE_INSERT_BTREE_NODE_FULL: + split = i->iter; + break; + case BTREE_INSERT_ENOSPC: + ret = -ENOSPC; + break; + case BTREE_INSERT_NEED_GC_LOCK: + cycle_gc_lock = true; + ret = -EINTR; + break; + default: + BUG(); + } + + if (!trans->did_work && (ret || split)) + break; + } +unlock: + multi_unlock_write(trans); + bch2_journal_res_put(&c->journal, &trans->journal_res); + + if (split) + goto split; + if (ret) + goto err; + + /* + * hack: iterators are inconsistent when they hit end of leaf, until + * traversed again + */ + trans_for_each_entry(trans, i) + if (i->iter->flags & BTREE_ITER_AT_END_OF_LEAF) + goto out; + + trans_for_each_entry(trans, i) + if (!same_leaf_as_prev(trans, i)) { + foreground_maybe_merge(c, i->iter, btree_prev_sib); + foreground_maybe_merge(c, i->iter, btree_next_sib); + } +out: + /* make sure we didn't lose an error: */ + if (!ret && IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) + trans_for_each_entry(trans, i) + BUG_ON(!i->done); + + percpu_ref_put(&c->writes); + return ret; +split: + /* + * have to drop journal res before splitting, because splitting means + * allocating new btree nodes, and holding a journal reservation + * potentially blocks the allocator: + */ + ret = bch2_btree_split_leaf(c, split, trans->flags); + if (ret) + goto err; + /* + * if the split didn't have to drop locks the insert will still be + * atomic (in the BTREE_INSERT_ATOMIC sense, what the caller peeked() + * and is 
overwriting won't have changed) + */ + goto retry_locks; +err: + if (cycle_gc_lock) { + down_read(&c->gc_lock); + up_read(&c->gc_lock); + } + + if (ret == -EINTR) { + trans_for_each_entry(trans, i) { + int ret2 = bch2_btree_iter_traverse(i->iter); + if (ret2) { + ret = ret2; + goto out; + } + } + + /* + * BTREE_ITER_ATOMIC means we have to return -EINTR if we + * dropped locks: + */ + if (!(trans->flags & BTREE_INSERT_ATOMIC)) + goto retry; + } + + goto out; +} + +int bch2_btree_delete_at(struct btree_iter *iter, unsigned flags) +{ + struct bkey_i k; + + bkey_init(&k.k); + k.k.p = iter->pos; + + return bch2_btree_insert_at(iter->c, NULL, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_USE_RESERVE|flags, + BTREE_INSERT_ENTRY(iter, &k)); +} + +int bch2_btree_insert_list_at(struct btree_iter *iter, + struct keylist *keys, + struct disk_reservation *disk_res, + struct extent_insert_hook *hook, + u64 *journal_seq, unsigned flags) +{ + BUG_ON(flags & BTREE_INSERT_ATOMIC); + BUG_ON(bch2_keylist_empty(keys)); + bch2_verify_keylist_sorted(keys); + + while (!bch2_keylist_empty(keys)) { + /* need to traverse between each insert */ + int ret = bch2_btree_iter_traverse(iter); + if (ret) + return ret; + + ret = bch2_btree_insert_at(iter->c, disk_res, hook, + journal_seq, flags, + BTREE_INSERT_ENTRY(iter, bch2_keylist_front(keys))); + if (ret) + return ret; + + bch2_keylist_pop_front(keys); + } + + return 0; +} + +/** + * bch_btree_insert - insert keys into the extent btree + * @c: pointer to struct bch_fs + * @id: btree to insert into + * @insert_keys: list of keys to insert + * @hook: insert callback + */ +int bch2_btree_insert(struct bch_fs *c, enum btree_id id, + struct bkey_i *k, + struct disk_reservation *disk_res, + struct extent_insert_hook *hook, + u64 *journal_seq, int flags) +{ + struct btree_iter iter; + int ret, ret2; + + bch2_btree_iter_init(&iter, c, id, bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(&iter); + if (unlikely(ret)) + goto out; + + ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, flags, + BTREE_INSERT_ENTRY(&iter, k)); +out: ret2 = bch2_btree_iter_unlock(&iter); + + return ret ?: ret2; +} + +/** + * bch_btree_update - like bch2_btree_insert(), but asserts that we're + * overwriting an existing key + */ +int bch2_btree_update(struct bch_fs *c, enum btree_id id, + struct bkey_i *k, u64 *journal_seq) +{ + struct btree_iter iter; + struct bkey_s_c u; + int ret; + + EBUG_ON(id == BTREE_ID_EXTENTS); + + bch2_btree_iter_init(&iter, c, id, k->k.p, + BTREE_ITER_INTENT); + + u = bch2_btree_iter_peek_with_holes(&iter); + ret = btree_iter_err(u); + if (ret) + return ret; + + if (bkey_deleted(u.k)) { + bch2_btree_iter_unlock(&iter); + return -ENOENT; + } + + ret = bch2_btree_insert_at(c, NULL, NULL, journal_seq, 0, + BTREE_INSERT_ENTRY(&iter, k)); + bch2_btree_iter_unlock(&iter); + return ret; +} + +/* + * bch_btree_delete_range - delete everything within a given range + * + * Range is a half open interval - [start, end) + */ +int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, + struct bpos start, + struct bpos end, + struct bversion version, + struct disk_reservation *disk_res, + struct extent_insert_hook *hook, + u64 *journal_seq) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + bch2_btree_iter_init(&iter, c, id, start, + BTREE_ITER_INTENT); + + while ((k = bch2_btree_iter_peek(&iter)).k && + !(ret = btree_iter_err(k))) { + unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + /* really shouldn't be 
using a bare, unpadded bkey_i */ + struct bkey_i delete; + + if (bkey_cmp(iter.pos, end) >= 0) + break; + + bkey_init(&delete.k); + + /* + * For extents, iter.pos won't necessarily be the same as + * bkey_start_pos(k.k) (for non extents they always will be the + * same). It's important that we delete starting from iter.pos + * because the range we want to delete could start in the middle + * of k. + * + * (bch2_btree_iter_peek() does guarantee that iter.pos >= + * bkey_start_pos(k.k)). + */ + delete.k.p = iter.pos; + delete.k.version = version; + + if (iter.flags & BTREE_ITER_IS_EXTENTS) { + /* + * The extents btree is special - KEY_TYPE_DISCARD is + * used for deletions, not KEY_TYPE_DELETED. This is an + * internal implementation detail that probably + * shouldn't be exposed (internally, KEY_TYPE_DELETED is + * used as a proxy for k->size == 0): + */ + delete.k.type = KEY_TYPE_DISCARD; + + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); + bch2_cut_back(end, &delete.k); + } + + ret = bch2_btree_insert_at(c, disk_res, hook, journal_seq, + BTREE_INSERT_NOFAIL, + BTREE_INSERT_ENTRY(&iter, &delete)); + if (ret) + break; + + bch2_btree_iter_cond_resched(&iter); + } + + bch2_btree_iter_unlock(&iter); + return ret; +} diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index e522705..a113d0d 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -80,21 +80,25 @@ static void bch2_fs_stats_verify(struct bch_fs *c) { struct bch_fs_usage stats = __bch2_fs_usage_read(c); + unsigned i; - if ((s64) stats.sectors_dirty < 0) - panic("sectors_dirty underflow: %lli\n", stats.sectors_dirty); - - if ((s64) stats.sectors_cached < 0) - panic("sectors_cached underflow: %lli\n", stats.sectors_cached); + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + if ((s64) stats.s[i].data[S_META] < 0) + panic("replicas %u meta underflow: %lli\n", + i + 1, stats.s[i].data[S_META]); - if ((s64) stats.sectors_meta < 0) - panic("sectors_meta underflow: %lli\n", stats.sectors_meta); + if ((s64) stats.s[i].data[S_DIRTY] < 0) + panic("replicas %u dirty underflow: %lli\n", + i + 1, stats.s[i].data[S_DIRTY]); - if ((s64) stats.sectors_persistent_reserved < 0) - panic("sectors_persistent_reserved underflow: %lli\n", stats.sectors_persistent_reserved); + if ((s64) stats.s[i].persistent_reserved < 0) + panic("replicas %u reserved underflow: %lli\n", + i + 1, stats.s[i].persistent_reserved); + } - if ((s64) stats.sectors_online_reserved < 0) - panic("sectors_online_reserved underflow: %lli\n", stats.sectors_online_reserved); + if ((s64) stats.online_reserved < 0) + panic("sectors_online_reserved underflow: %lli\n", + stats.online_reserved); } #else @@ -223,11 +227,8 @@ void bch2_fs_usage_apply(struct bch_fs *c, struct disk_reservation *disk_res, struct gc_pos gc_pos) { - s64 added = - stats->s[S_COMPRESSED][S_META] + - stats->s[S_COMPRESSED][S_DIRTY] + - stats->persistent_reserved + - stats->online_reserved; + struct fs_usage_sum sum = __fs_usage_sum(*stats); + s64 added = sum.data + sum.reserved; /* * Not allowed to reduce sectors_available except by getting a @@ -255,19 +256,8 @@ void bch2_fs_usage_apply(struct bch_fs *c, memset(stats, 0, sizeof(*stats)); } -static void bch2_fs_usage_update(struct bch_fs_usage *fs_usage, - struct bucket_mark old, struct bucket_mark new) -{ - fs_usage->s[S_COMPRESSED][S_CACHED] += - (int) new.cached_sectors - (int) old.cached_sectors; - fs_usage->s[S_COMPRESSED][bucket_type(old)] -= - old.dirty_sectors; - fs_usage->s[S_COMPRESSED][bucket_type(new)] += - 
new.dirty_sectors; -} - static void bch2_dev_usage_update(struct bch_dev *ca, - struct bucket_mark old, struct bucket_mark new) + struct bucket_mark old, struct bucket_mark new) { struct bch_fs *c = ca->fs; struct bch_dev_usage *dev_usage; @@ -280,7 +270,7 @@ static void bch2_dev_usage_update(struct bch_dev *ca, preempt_disable(); dev_usage = this_cpu_ptr(ca->usage_percpu); - dev_usage->sectors[S_CACHED] += + dev_usage->sectors_cached += (int) new.cached_sectors - (int) old.cached_sectors; dev_usage->sectors[bucket_type(old)] -= old.dirty_sectors; @@ -289,9 +279,9 @@ static void bch2_dev_usage_update(struct bch_dev *ca, dev_usage->buckets_alloc += (int) new.owned_by_allocator - (int) old.owned_by_allocator; - dev_usage->buckets_meta += is_meta_bucket(new) - is_meta_bucket(old); + dev_usage->buckets[S_META] += is_meta_bucket(new) - is_meta_bucket(old); + dev_usage->buckets[S_DIRTY] += is_dirty_bucket(new) - is_dirty_bucket(old); dev_usage->buckets_cached += is_cached_bucket(new) - is_cached_bucket(old); - dev_usage->buckets_dirty += is_dirty_bucket(new) - is_dirty_bucket(old); preempt_enable(); if (!is_available_bucket(old) && is_available_bucket(new)) @@ -309,7 +299,6 @@ static void bch2_dev_usage_update(struct bch_dev *ca, bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g, struct bucket_mark *old) { - struct bch_fs_usage stats = { 0 }; struct bucket_mark new; *old = bucket_data_cmpxchg(ca, g, new, ({ @@ -324,12 +313,8 @@ bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g, new.gen++; })); - /* XXX: we're not actually updating fs usage's cached sectors... */ - bch2_fs_usage_update(&stats, *old, new); - if (!old->owned_by_allocator && old->cached_sectors) - trace_invalidate(ca, g - ca->buckets, - old->cached_sectors); + trace_invalidate(ca, g - ca->buckets, old->cached_sectors); return true; } @@ -367,12 +352,15 @@ void bch2_mark_free_bucket(struct bch_dev *ca, struct bucket *g) void bch2_mark_alloc_bucket(struct bch_dev *ca, struct bucket *g, bool owned_by_allocator) { - struct bucket_mark new; + struct bucket_mark old, new; - bucket_data_cmpxchg(ca, g, new, ({ + old = bucket_data_cmpxchg(ca, g, new, ({ new.touched_this_mount = 1; new.owned_by_allocator = owned_by_allocator; })); + + BUG_ON(!owned_by_allocator && !old.owned_by_allocator && + ca->fs->gc_pos.phase == GC_PHASE_DONE); } #define saturated_add(ca, dst, src, max) \ @@ -414,33 +402,13 @@ void bch2_mark_metadata_bucket(struct bch_dev *ca, struct bucket *g, bucket_became_unavailable(ca->fs, old, new)); } -#if 0 /* Reverting this until the copygc + compression issue is fixed: */ -static unsigned __disk_sectors(const union bch_extent_crc *crc, unsigned sectors) -{ - return crc_compression_type(crc) - ? sectors * crc_compressed_size(crc) / crc_uncompressed_size(crc) - : sectors; -} - -static unsigned __compressed_sectors(const union bch_extent_crc *crc, unsigned sectors) -{ - return crc_compression_type(crc) - ? 
min_t(unsigned, crc_compressed_size(crc), sectors) - : sectors; -} -#else -static unsigned __disk_sectors(const union bch_extent_crc *crc, unsigned sectors) -{ - return sectors; -} - -static unsigned __compressed_sectors(const union bch_extent_crc *crc, unsigned sectors) +static int __disk_sectors(const union bch_extent_crc *crc, unsigned sectors) { - return sectors; + return sectors * crc_compressed_size(NULL, crc) / + crc_uncompressed_size(NULL, crc); } -#endif /* * Checking against gc's position has to be done here, inside the cmpxchg() @@ -452,9 +420,8 @@ static void bch2_mark_pointer(struct bch_fs *c, const union bch_extent_crc *crc, const struct bch_extent_ptr *ptr, s64 sectors, enum s_alloc type, - bool may_make_unavailable, struct bch_fs_usage *stats, - bool gc_will_visit, u64 journal_seq) + u64 journal_seq, unsigned flags) { struct bucket_mark old, new; unsigned saturated; @@ -462,23 +429,24 @@ static void bch2_mark_pointer(struct bch_fs *c, struct bucket *g = ca->buckets + PTR_BUCKET_NR(ca, ptr); unsigned data_type = type == S_META ? BUCKET_BTREE : BUCKET_DATA; - unsigned old_sectors, new_sectors; - int disk_sectors, compressed_sectors; + u64 v; - if (sectors > 0) { - old_sectors = 0; - new_sectors = sectors; - } else { - old_sectors = e.k->size; - new_sectors = e.k->size + sectors; - } + if (crc_compression_type(crc)) { + unsigned old_sectors, new_sectors; + + if (sectors > 0) { + old_sectors = 0; + new_sectors = sectors; + } else { + old_sectors = e.k->size; + new_sectors = e.k->size + sectors; + } - disk_sectors = -__disk_sectors(crc, old_sectors) - + __disk_sectors(crc, new_sectors); - compressed_sectors = -__compressed_sectors(crc, old_sectors) - + __compressed_sectors(crc, new_sectors); + sectors = -__disk_sectors(crc, old_sectors) + +__disk_sectors(crc, new_sectors); + } - if (gc_will_visit) { + if (flags & BCH_BUCKET_MARK_GC_WILL_VISIT) { if (journal_seq) bucket_cmpxchg(g, new, ({ new.touched_this_mount = 1; @@ -486,10 +454,12 @@ static void bch2_mark_pointer(struct bch_fs *c, new.journal_seq = journal_seq; })); - goto out; + return; } - old = bucket_data_cmpxchg(ca, g, new, ({ + v = READ_ONCE(g->_mark.counter); + do { + new.counter = old.counter = v; saturated = 0; /* @@ -498,21 +468,21 @@ static void bch2_mark_pointer(struct bch_fs *c, * checked the gen */ if (gen_after(new.gen, ptr->gen)) { - EBUG_ON(type != S_CACHED && + EBUG_ON(!ptr->cached && test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)); return; } - if (type != S_CACHED && + if (!ptr->cached && new.dirty_sectors == GC_MAX_SECTORS_USED && - disk_sectors < 0) - saturated = -disk_sectors; + sectors < 0) + saturated = -sectors; - if (type == S_CACHED) - saturated_add(ca, new.cached_sectors, disk_sectors, + if (ptr->cached) + saturated_add(ca, new.cached_sectors, sectors, GC_MAX_SECTORS_USED); else - saturated_add(ca, new.dirty_sectors, disk_sectors, + saturated_add(ca, new.dirty_sectors, sectors, GC_MAX_SECTORS_USED); if (!new.dirty_sectors && @@ -528,7 +498,16 @@ static void bch2_mark_pointer(struct bch_fs *c, } new.touched_this_mount = 1; - })); + + if (flags & BCH_BUCKET_MARK_NOATOMIC) { + g->_mark = new; + break; + } + } while ((v = cmpxchg(&g->_mark.counter, + old.counter, + new.counter)) != old.counter); + + bch2_dev_usage_update(ca, old, new); if (old.data_type != data_type && (old.data_type || @@ -537,7 +516,7 @@ static void bch2_mark_pointer(struct bch_fs *c, bch_err(ca->fs, "bucket %zu has multiple types of data (%u, %u)", g - ca->buckets, old.data_type, new.data_type); - BUG_ON(!may_make_unavailable && + 
BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && bucket_became_unavailable(c, old, new)); if (saturated && @@ -549,66 +528,61 @@ static void bch2_mark_pointer(struct bch_fs *c, wake_up_process(c->gc_thread); } } -out: - stats->s[S_COMPRESSED][type] += compressed_sectors; - stats->s[S_UNCOMPRESSED][type] += sectors; } static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c_extent e, s64 sectors, bool metadata, - bool may_make_unavailable, struct bch_fs_usage *stats, - bool gc_will_visit, u64 journal_seq) + u64 journal_seq, unsigned flags) { const struct bch_extent_ptr *ptr; const union bch_extent_crc *crc; enum s_alloc type = metadata ? S_META : S_DIRTY; + unsigned replicas = 0; BUG_ON(metadata && bkey_extent_is_cached(e.k)); BUG_ON(!sectors); - extent_for_each_ptr_crc(e, ptr, crc) - bch2_mark_pointer(c, e, crc, ptr, sectors, - ptr->cached ? S_CACHED : type, - may_make_unavailable, - stats, gc_will_visit, journal_seq); + extent_for_each_ptr_crc(e, ptr, crc) { + bch2_mark_pointer(c, e, crc, ptr, sectors, type, + stats, journal_seq, flags); + replicas += !ptr->cached; + } + + BUG_ON(replicas >= BCH_REPLICAS_MAX); + + if (replicas) + stats->s[replicas - 1].data[type] += sectors; } -static void __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, - s64 sectors, bool metadata, - bool may_make_unavailable, - struct bch_fs_usage *stats, - bool gc_will_visit, u64 journal_seq) +void __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, + s64 sectors, bool metadata, + struct bch_fs_usage *stats, + u64 journal_seq, unsigned flags) { switch (k.k->type) { case BCH_EXTENT: case BCH_EXTENT_CACHED: bch2_mark_extent(c, bkey_s_c_to_extent(k), sectors, metadata, - may_make_unavailable, stats, - gc_will_visit, journal_seq); + stats, journal_seq, flags); break; case BCH_RESERVATION: { struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - stats->persistent_reserved += r.v->nr_replicas * sectors; + if (r.v->nr_replicas) + stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors; break; } } } -void __bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, - s64 sectors, bool metadata, - struct bch_fs_usage *stats) -{ - __bch2_mark_key(c, k, sectors, metadata, true, stats, false, 0); -} - void bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, - s64 sectors, bool metadata) + s64 sectors, bool metadata, unsigned flags) { struct bch_fs_usage stats = { 0 }; - __bch2_gc_mark_key(c, k, sectors, metadata, &stats); + __bch2_mark_key(c, k, sectors, metadata, &stats, 0, + flags|BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); preempt_disable(); bch2_usage_add(this_cpu_ptr(c->usage_percpu), &stats); @@ -619,6 +593,8 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, s64 sectors, bool metadata, struct gc_pos gc_pos, struct bch_fs_usage *stats, u64 journal_seq) { + unsigned flags = gc_will_visit(c, gc_pos) + ? BCH_BUCKET_MARK_GC_WILL_VISIT : 0; /* * synchronization w.r.t. GC: * @@ -647,9 +623,7 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, * (e.g. the btree node lock, or the relevant allocator lock). 
*/ lg_local_lock(&c->usage_lock); - __bch2_mark_key(c, k, sectors, metadata, false, stats, - gc_will_visit(c, gc_pos), journal_seq); - + __bch2_mark_key(c, k, sectors, metadata, stats, journal_seq, flags); bch2_fs_stats_verify(c); lg_local_unlock(&c->usage_lock); } diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 37eb471..618802c 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -124,9 +124,9 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca, { return max_t(s64, 0, ca->mi.nbuckets - ca->mi.first_bucket - - stats.buckets_dirty - - stats.buckets_alloc - - stats.buckets_meta); + stats.buckets[S_META] - + stats.buckets[S_DIRTY] - + stats.buckets_alloc); } /* @@ -157,16 +157,31 @@ struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *); void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, struct disk_reservation *, struct gc_pos); +struct fs_usage_sum { + u64 data; + u64 reserved; +}; + +static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats) +{ + struct fs_usage_sum sum = { 0 }; + unsigned i; + + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + sum.data += (stats.s[i].data[S_META] + + stats.s[i].data[S_DIRTY]) * (i + 1); + sum.reserved += stats.s[i].persistent_reserved * (i + 1); + } + + sum.reserved += stats.online_reserved; + return sum; +} + static inline u64 __bch2_fs_sectors_used(struct bch_fs *c) { - struct bch_fs_usage stats = __bch2_fs_usage_read(c); - u64 reserved = stats.persistent_reserved + - stats.online_reserved; + struct fs_usage_sum sum = __fs_usage_sum(__bch2_fs_usage_read(c)); - return stats.s[S_COMPRESSED][S_META] + - stats.s[S_COMPRESSED][S_DIRTY] + - reserved + - (reserved >> 7); + return sum.data + sum.reserved + (sum.reserved >> 7); } static inline u64 bch2_fs_sectors_used(struct bch_fs *c) @@ -199,9 +214,15 @@ void bch2_mark_alloc_bucket(struct bch_dev *, struct bucket *, bool); void bch2_mark_metadata_bucket(struct bch_dev *, struct bucket *, enum bucket_data_type, bool); -void __bch2_gc_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, - struct bch_fs_usage *); -void bch2_gc_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool); +#define BCH_BUCKET_MARK_NOATOMIC (1 << 0) +#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 1) +#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 2) + +void __bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, + struct bch_fs_usage *, u64, unsigned); + +void bch2_gc_mark_key(struct bch_fs *, struct bkey_s_c, + s64, bool, unsigned); void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, struct gc_pos, struct bch_fs_usage *, u64); diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index c25c9fa..396d770 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -7,7 +7,6 @@ enum bucket_data_type { BUCKET_DATA = 0, BUCKET_BTREE, - BUCKET_PRIOS, BUCKET_JOURNAL, BUCKET_SB, }; @@ -49,32 +48,33 @@ struct bucket { }; }; -enum s_compressed { - S_COMPRESSED, - S_UNCOMPRESSED, - S_COMPRESSED_NR, -}; - +/* kill, switch to bucket_data_type */ enum s_alloc { S_META, S_DIRTY, - S_CACHED, S_ALLOC_NR, }; struct bch_dev_usage { - u64 buckets_dirty; + u64 buckets[S_ALLOC_NR]; u64 buckets_cached; - u64 buckets_meta; u64 buckets_alloc; + /* _compressed_ sectors: */ u64 sectors[S_ALLOC_NR]; + u64 sectors_cached; }; struct bch_fs_usage { /* all fields are in units of 512 byte sectors: */ - u64 s[S_COMPRESSED_NR][S_ALLOC_NR]; - u64 persistent_reserved; + + /* _uncompressed_ sectors: */ + + struct { + u64 data[S_ALLOC_NR]; + u64 
persistent_reserved; + } s[BCH_REPLICAS_MAX]; + u64 online_reserved; u64 available_cache; }; diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index 694fcd2..47af7a2 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -73,12 +73,12 @@ static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) return -EINVAL; user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL); - if (!devs) + if (!user_devs) return -ENOMEM; devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL); - if (copy_from_user(user_devs, user_arg->devs, + if (copy_from_user(user_devs, arg.devs, sizeof(u64) * arg.nr_devs)) goto err; diff --git a/libbcachefs/debug.c b/libbcachefs/debug.c index d4c8ce5..db03a34 100644 --- a/libbcachefs/debug.c +++ b/libbcachefs/debug.c @@ -71,7 +71,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) memcpy(n_ondisk, n_sorted, btree_bytes(c)); - bch2_btree_node_read_done(c, v, pick.ca, &pick.ptr); + bch2_btree_node_read_done(c, v); n_sorted = c->verify_data->data; percpu_ref_put(&pick.ca->io_ref); diff --git a/libbcachefs/error.c b/libbcachefs/error.c index 5b7316d..8357c8d 100644 --- a/libbcachefs/error.c +++ b/libbcachefs/error.c @@ -26,7 +26,7 @@ void bch2_fatal_error(struct bch_fs *c) bch_err(c, "emergency read only"); } -void bch2_nonfatal_io_error_work(struct work_struct *work) +void bch2_io_error_work(struct work_struct *work) { struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); struct bch_fs *c = ca->fs; @@ -45,9 +45,9 @@ void bch2_nonfatal_io_error_work(struct work_struct *work) mutex_unlock(&c->state_lock); } -void bch2_nonfatal_io_error(struct bch_dev *ca) +void bch2_io_error(struct bch_dev *ca) { - queue_work(system_long_wq, &ca->io_error_work); + //queue_work(system_long_wq, &ca->io_error_work); } #ifdef __KERNEL__ diff --git a/libbcachefs/error.h b/libbcachefs/error.h index 750c676..f2032d9 100644 --- a/libbcachefs/error.h +++ b/libbcachefs/error.h @@ -179,63 +179,32 @@ do { \ _ret; \ }) -#define bch2_dev_fatal_error(ca, ...) \ -do { \ - bch_err(ca, __VA_ARGS__); \ - bch2_fatal_error(c); \ -} while (0) - -#define bch2_dev_fatal_io_error(ca, fmt, ...) \ -do { \ - printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \ - "fatal IO error on %s for " fmt), \ - (ca)->name, ##__VA_ARGS__); \ - bch2_fatal_error((ca)->fs); \ -} while (0) - -#define bch2_dev_fatal_io_err_on(cond, ca, ...) \ -({ \ - int _ret = !!(cond); \ - \ - if (_ret) \ - bch2_dev_fatal_io_error(ca, __VA_ARGS__); \ - _ret; \ -}) - /* - * Nonfatal IO errors: either recoverable metadata IO (because we have - * replicas), or data IO - we need to log it and print out a message, but we - * don't (necessarily) want to shut down the fs: + * IO errors: either recoverable metadata IO (because we have replicas), or data + * IO - we need to log it and print out a message, but we don't (necessarily) + * want to shut down the fs: */ -void bch2_nonfatal_io_error_work(struct work_struct *); +void bch2_io_error_work(struct work_struct *); /* Does the error handling without logging a message */ -void bch2_nonfatal_io_error(struct bch_dev *); - -#if 0 -#define bch2_fs_nonfatal_io_error(c, ...) \ -do { \ - bch_err(c, __VA_ARGS__); \ - bch2_nonfatal_io_error(c); \ -} while (0) -#endif +void bch2_io_error(struct bch_dev *); /* Logs message and handles the error: */ -#define bch2_dev_nonfatal_io_error(ca, fmt, ...) \ +#define bch2_dev_io_error(ca, fmt, ...) 
\ do { \ printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \ "IO error on %s for " fmt), \ (ca)->name, ##__VA_ARGS__); \ - bch2_nonfatal_io_error(ca); \ + bch2_io_error(ca); \ } while (0) -#define bch2_dev_nonfatal_io_err_on(cond, ca, ...) \ +#define bch2_dev_io_err_on(cond, ca, ...) \ ({ \ bool _ret = (cond); \ \ if (_ret) \ - bch2_dev_nonfatal_io_error(ca, __VA_ARGS__); \ + bch2_dev_io_error(ca, __VA_ARGS__); \ _ret; \ }) diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 1b0e3da..5819cef 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -9,6 +9,8 @@ #include "bkey_methods.h" #include "btree_gc.h" #include "btree_update.h" +#include "btree_update_interior.h" +#include "buckets.h" #include "checksum.h" #include "debug.h" #include "dirent.h" @@ -497,6 +499,54 @@ out: return out - buf; } +void bch2_get_read_device(struct bch_fs *c, + const struct bkey *k, + const struct bch_extent_ptr *ptr, + const union bch_extent_crc *crc, + struct bch_devs_mask *avoid, + struct extent_pick_ptr *pick) +{ + struct bch_dev *ca = c->devs[ptr->dev]; + + if (ptr->cached && ptr_stale(ca, ptr)) + return; + + if (ca->mi.state == BCH_MEMBER_STATE_FAILED) + return; + + if (avoid && test_bit(ca->dev_idx, avoid->d)) + return; + + if (pick->ca && pick->ca->mi.tier < ca->mi.tier) + return; + + if (!percpu_ref_tryget(&ca->io_ref)) + return; + + if (pick->ca) + percpu_ref_put(&pick->ca->io_ref); + + *pick = (struct extent_pick_ptr) { + .ptr = *ptr, + .ca = ca, + }; + + if (k->size) + pick->crc = crc_to_128(k, crc); +} + +static void extent_pick_read_device(struct bch_fs *c, + struct bkey_s_c_extent e, + struct bch_devs_mask *avoid, + struct extent_pick_ptr *pick) +{ + const union bch_extent_crc *crc; + const struct bch_extent_ptr *ptr; + + extent_for_each_ptr_crc(e, ptr, crc) + bch2_get_read_device(c, e.k, ptr, crc, avoid, pick); +} + /* Btree ptrs */ static const char *bch2_btree_ptr_invalid(const struct bch_fs *c, @@ -615,36 +665,10 @@ static void bch2_btree_ptr_to_text(struct bch_fs *c, char *buf, struct extent_pick_ptr bch2_btree_pick_ptr(struct bch_fs *c, const struct btree *b) { - struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); - const union bch_extent_crc *crc; - const struct bch_extent_ptr *ptr; struct extent_pick_ptr pick = { .ca = NULL }; - extent_for_each_ptr_crc(e, ptr, crc) { - struct bch_dev *ca = c->devs[ptr->dev]; - struct btree *root = btree_node_root(c, b); - - if (bch2_fs_inconsistent_on(crc, c, - "btree node pointer with crc at btree %u level %u/%u bucket %zu", - b->btree_id, b->level, root ? root->level : -1, - PTR_BUCKET_NR(ca, ptr))) - break; - - if (ca->mi.state == BCH_MEMBER_STATE_FAILED) - continue; - - if (pick.ca && pick.ca->mi.tier < ca->mi.tier) - continue; - - if (!percpu_ref_tryget(&ca->io_ref)) - continue; - - if (pick.ca) - percpu_ref_put(&pick.ca->io_ref); - - pick.ca = ca; - pick.ptr = *ptr; - } + extent_pick_read_device(c, bkey_i_to_s_c_extent(&b->key), + NULL, &pick); return pick; } @@ -2029,13 +2053,11 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c, * as the pointers are sorted by tier, hence preferring pointers to tier 0 * rather than pointers to tier 1. 
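 *
 * Hypothetical usage sketch (not lifted from any caller in this patch): a read
 * retry path that wants to avoid the device a read just failed on could mark
 * that device's index in a bch_devs_mask and re-pick:
 *
 *	struct bch_devs_mask avoid;
 *	struct extent_pick_ptr pick;
 *
 *	memset(&avoid, 0, sizeof(avoid));
 *	set_bit(failed_dev_idx, avoid.d);	/* failed_dev_idx: assumed */
 *
 *	bch2_extent_pick_ptr(c, k, &avoid, &pick);
 *	if (IS_ERR(pick.ca))			/* no usable replica left */
 *		return PTR_ERR(pick.ca);
 *	if (!pick.ca)				/* only stale/avoided cached copies */
 *		return 0;
 *	/* ... do the read, then drop the ref taken by the pick: */
 *	percpu_ref_put(&pick.ca->io_ref);
 *
 * Callers with nothing to avoid simply pass NULL, as bchfs_read() does
 * elsewhere in this patch.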
*/ -void bch2_extent_pick_ptr_avoiding(struct bch_fs *c, struct bkey_s_c k, - struct bch_dev *avoid, - struct extent_pick_ptr *ret) +void bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k, + struct bch_devs_mask *avoid, + struct extent_pick_ptr *ret) { struct bkey_s_c_extent e; - const union bch_extent_crc *crc; - const struct bch_extent_ptr *ptr; switch (k.k->type) { case KEY_TYPE_DELETED: @@ -2053,32 +2075,7 @@ void bch2_extent_pick_ptr_avoiding(struct bch_fs *c, struct bkey_s_c k, e = bkey_s_c_to_extent(k); ret->ca = NULL; - extent_for_each_ptr_crc(e, ptr, crc) { - struct bch_dev *ca = c->devs[ptr->dev]; - - if (ptr->cached && ptr_stale(ca, ptr)) - continue; - - if (ca->mi.state == BCH_MEMBER_STATE_FAILED) - continue; - - if (ret->ca && - (ca == avoid || - ret->ca->mi.tier < ca->mi.tier)) - continue; - - if (!percpu_ref_tryget(&ca->io_ref)) - continue; - - if (ret->ca) - percpu_ref_put(&ret->ca->io_ref); - - *ret = (struct extent_pick_ptr) { - .crc = crc_to_128(e.k, crc), - .ptr = *ptr, - .ca = ca, - }; - } + extent_pick_read_device(c, bkey_s_c_to_extent(k), avoid, ret); if (!ret->ca && !bkey_extent_is_cached(e.k)) ret->ca = ERR_PTR(-EIO); diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 3dc06cb..e49b9cf 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -3,11 +3,16 @@ #include "bcachefs.h" #include "bkey.h" +#include "io_types.h" +struct bch_fs; +struct journal_res; struct btree_node_iter; struct btree_insert; struct btree_insert_entry; struct extent_insert_hook; +struct bch_devs_mask; +union bch_extent_crc; struct btree_nr_keys bch2_key_sort_fix_overlapping(struct bset *, struct btree *, @@ -20,27 +25,18 @@ struct btree_nr_keys bch2_extent_sort_fix_overlapping(struct bch_fs *c, extern const struct bkey_ops bch2_bkey_btree_ops; extern const struct bkey_ops bch2_bkey_extent_ops; -struct bch_fs; -struct journal_res; - -struct extent_pick_ptr { - struct bch_extent_crc128 crc; - struct bch_extent_ptr ptr; - struct bch_dev *ca; -}; - +void bch2_get_read_device(struct bch_fs *, + const struct bkey *, + const struct bch_extent_ptr *, + const union bch_extent_crc *, + struct bch_devs_mask *, + struct extent_pick_ptr *); struct extent_pick_ptr bch2_btree_pick_ptr(struct bch_fs *, const struct btree *); -void bch2_extent_pick_ptr_avoiding(struct bch_fs *, struct bkey_s_c, - struct bch_dev *, struct extent_pick_ptr *); - -static inline void -bch2_extent_pick_ptr(struct bch_fs *c, struct bkey_s_c k, - struct extent_pick_ptr *ret) -{ - bch2_extent_pick_ptr_avoiding(c, k, NULL, ret); -} +void bch2_extent_pick_ptr(struct bch_fs *, struct bkey_s_c, + struct bch_devs_mask *, + struct extent_pick_ptr *); enum btree_insert_ret bch2_insert_fixup_extent(struct btree_insert *, @@ -558,6 +554,12 @@ void bch2_extent_drop_ptr_idx(struct bkey_s_extent, unsigned); const struct bch_extent_ptr * bch2_extent_has_device(struct bkey_s_c_extent, unsigned); +struct bch_extent_ptr * +bch2_extent_find_ptr(struct bch_fs *, struct bkey_s_extent, + struct bch_extent_ptr); +struct bch_extent_ptr * +bch2_extent_find_matching_ptr(struct bch_fs *, struct bkey_s_extent, + struct bkey_s_c_extent); bool bch2_cut_front(struct bpos, struct bkey_i *); bool bch2_cut_back(struct bpos, struct bkey *); diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 079f958..5845603 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -21,6 +21,8 @@ #include #include #include + +#include #include struct bio_set *bch2_writepage_bioset; @@ -700,8 +702,7 @@ static void bchfs_read(struct bch_fs *c, 
struct btree_iter *iter, { struct bio *bio = &rbio->bio; int flags = BCH_READ_RETRY_IF_STALE| - BCH_READ_PROMOTE| - BCH_READ_MAY_REUSE_BIO; + BCH_READ_MAY_PROMOTE; while (1) { struct extent_pick_ptr pick; @@ -727,7 +728,7 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, bch2_btree_iter_unlock(iter); k = bkey_i_to_s_c(&tmp.k); - bch2_extent_pick_ptr(c, k, &pick); + bch2_extent_pick_ptr(c, k, NULL, &pick); if (IS_ERR(pick.ca)) { bcache_io_error(c, bio, "no device to read from"); bio_endio(bio); @@ -753,15 +754,14 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, bkey_extent_is_compressed(k)) bch2_mark_pages_unalloc(bio); - if (is_last) - flags |= BCH_READ_IS_LAST; - if (pick.ca) { - PTR_BUCKET(pick.ca, &pick.ptr)->prio[READ] = - c->prio_clock[READ].hand; + if (!is_last) { + bio_inc_remaining(&rbio->bio); + flags |= BCH_READ_MUST_CLONE; + trace_read_split(&rbio->bio); + } bch2_read_extent(c, rbio, k, &pick, flags); - flags &= ~BCH_READ_MAY_REUSE_BIO; } else { zero_fill_bio(bio); @@ -803,9 +803,7 @@ int bch2_readpages(struct file *file, struct address_space *mapping, BCH_ENCODED_EXTENT_MAX >> PAGE_SECTOR_SHIFT); struct bch_read_bio *rbio = - container_of(bio_alloc_bioset(GFP_NOFS, n, - &c->bio_read), - struct bch_read_bio, bio); + to_rbio(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read)); rbio->bio.bi_end_io = bch2_readpages_end_io; bio_add_page_contig(&rbio->bio, page); @@ -854,9 +852,7 @@ int bch2_readpage(struct file *file, struct page *page) struct bch_fs *c = inode->i_sb->s_fs_info; struct bch_read_bio *rbio; - rbio = container_of(bio_alloc_bioset(GFP_NOFS, 1, - &c->bio_read), - struct bch_read_bio, bio); + rbio = to_rbio(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read)); rbio->bio.bi_end_io = bch2_readpages_end_io; __bchfs_readpage(c, rbio, inode->i_ino, page); @@ -1240,9 +1236,7 @@ static int bch2_read_single_page(struct page *page, int ret; DECLARE_COMPLETION_ONSTACK(done); - rbio = container_of(bio_alloc_bioset(GFP_NOFS, 1, - &c->bio_read), - struct bch_read_bio, bio); + rbio = to_rbio(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read)); rbio->bio.bi_private = &done; rbio->bio.bi_end_io = bch2_read_single_page_end_io; @@ -1464,9 +1458,7 @@ start: if (iter->count) closure_get(&dio->cl); - bch2_read(c, container_of(bio, - struct bch_read_bio, bio), - inode->i_ino); + bch2_read(c, to_rbio(bio), inode->i_ino); } if (sync) { @@ -2088,13 +2080,14 @@ static long bch2_fpunch(struct inode *inode, loff_t offset, loff_t len) if (unlikely(ret)) goto out; - ret = bch2_discard(c, - POS(ino, discard_start), - POS(ino, discard_end), - ZERO_VERSION, - &disk_res, - &i_sectors_hook.hook, - &ei->journal_seq); + ret = bch2_btree_delete_range(c, + BTREE_ID_EXTENTS, + POS(ino, discard_start), + POS(ino, discard_end), + ZERO_VERSION, + &disk_res, + &i_sectors_hook.hook, + &ei->journal_seq); i_sectors_dirty_put(ei, &i_sectors_hook); bch2_disk_reservation_put(c, &disk_res); diff --git a/libbcachefs/inode.c b/libbcachefs/inode.c index 0a37153..18bc182 100644 --- a/libbcachefs/inode.c +++ b/libbcachefs/inode.c @@ -328,8 +328,11 @@ again: int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size, struct extent_insert_hook *hook, u64 *journal_seq) { - return bch2_discard(c, POS(inode_nr, new_size), POS(inode_nr + 1, 0), - ZERO_VERSION, NULL, hook, journal_seq); + return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, + POS(inode_nr, new_size), + POS(inode_nr + 1, 0), + ZERO_VERSION, NULL, hook, + journal_seq); } int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) diff --git 
a/libbcachefs/io.c b/libbcachefs/io.c index 78cdaa3..9eed97b 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -27,13 +27,6 @@ #include -static inline void __bio_inc_remaining(struct bio *bio) -{ - bio_set_flag(bio, BIO_CHAIN); - smp_mb__before_atomic(); - atomic_inc(&bio->__bi_remaining); -} - /* Allocate, free from mempool: */ void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) @@ -97,6 +90,9 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, BUG_ON(c->opts.nochanges); extent_for_each_ptr(e, ptr) { + BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || + !c->devs[ptr->dev]); + ca = c->devs[ptr->dev]; if (ptr + 1 < &extent_entry_last(e)->ptr) { @@ -110,7 +106,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, n->bounce = false; n->put_bio = true; n->bio.bi_opf = wbio->bio.bi_opf; - __bio_inc_remaining(&wbio->bio); + bio_inc_remaining(&wbio->bio); } else { n = wbio; n->split = false; @@ -128,7 +124,7 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, if (likely(percpu_ref_tryget(&ca->io_ref))) { n->have_io_ref = true; n->bio.bi_bdev = ca->disk_sb.bdev; - generic_make_request(&n->bio); + submit_bio(&n->bio); } else { n->have_io_ref = false; bcache_io_error(c, &n->bio, "device has been removed"); @@ -241,68 +237,41 @@ static void bch2_write_index(struct closure *cl) } } -/** - * bch_write_discard - discard range of keys - * - * Used to implement discard, and to handle when writethrough write hits - * a write error on the cache device. - */ -static void bch2_write_discard(struct closure *cl) -{ - struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bpos end = op->pos; - - end.offset += bio_sectors(&op->wbio.bio); - - op->error = bch2_discard(op->c, op->pos, end, op->version, - &op->res, NULL, NULL); -} - -/* - * Convert extents to be inserted to discards after an error: - */ static void bch2_write_io_error(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct keylist *keys = &op->insert_keys; + struct bch_fs *c = op->c; + struct bch_extent_ptr *ptr; + struct bkey_i *k; + int ret; - if (op->flags & BCH_WRITE_DISCARD_ON_ERROR) { - struct bkey_i *src = bch2_keylist_front(&op->insert_keys); - struct bkey_i *dst = bch2_keylist_front(&op->insert_keys); - - /* - * Our data write just errored, which means we've got a bunch - * of keys to insert that point to data that wasn't - * successfully written. - * - * We don't have to insert those keys but we still have to - * invalidate that region of the cache - so, if we just strip - * off all the pointers from the keys we'll accomplish just - * that. - */ + for_each_keylist_key(keys, k) { + struct bkey_i *n = bkey_next(k); + struct bkey_s_extent e = bkey_i_to_s_extent(k); - while (src != op->insert_keys.top) { - struct bkey_i *n = bkey_next(src); + extent_for_each_ptr_backwards(e, ptr) + if (test_bit(ptr->dev, op->failed.d)) + bch2_extent_drop_ptr(e, ptr); - set_bkey_val_u64s(&src->k, 0); - src->k.type = KEY_TYPE_DISCARD; - bkey_copy(dst, src); + memmove(bkey_next(k), n, (void *) keys->top - (void *) n); + keys->top_p -= (u64 *) n - (u64 *) bkey_next(k); - dst = bkey_next(dst); - src = n; + ret = bch2_extent_nr_ptrs(e.c) + ? 
bch2_check_mark_super(c, e.c, BCH_DATA_USER) + : -EIO; + if (ret) { + keys->top = keys->keys; + op->error = ret; + op->flags |= BCH_WRITE_DONE; + break; } - - op->insert_keys.top = dst; - op->flags |= BCH_WRITE_DISCARD; - } else { - /* TODO: We could try to recover from this. */ - while (!bch2_keylist_empty(&op->insert_keys)) - bch2_keylist_pop_front(&op->insert_keys); - - op->error = -EIO; - op->flags |= BCH_WRITE_DONE; } + memset(&op->failed, 0, sizeof(op->failed)); + bch2_write_index(cl); + return; } static void bch2_write_endio(struct bio *bio) @@ -314,9 +283,10 @@ static void bch2_write_endio(struct bio *bio) struct bch_fs *c = wbio->c; struct bch_dev *ca = wbio->ca; - if (bch2_dev_nonfatal_io_err_on(bio->bi_error, ca, - "data write")) + if (bch2_dev_io_err_on(bio->bi_error, ca, "data write")) { + set_bit(ca->dev_idx, op->failed.d); set_closure_fn(cl, bch2_write_io_error, index_update_wq(op)); + } if (wbio->have_io_ref) percpu_ref_put(&ca->io_ref); @@ -538,14 +508,6 @@ static void __bch2_write(struct closure *cl) struct open_bucket *b; int ret; - memset(op->open_buckets, 0, sizeof(op->open_buckets)); - - if (op->flags & BCH_WRITE_DISCARD) { - bch2_write_discard(cl); - op->flags |= BCH_WRITE_DONE; - continue_at(cl, bch2_write_done, index_update_wq(op)); - } - do { if (open_bucket_nr == ARRAY_SIZE(op->open_buckets)) continue_at(cl, bch2_write_index, index_update_wq(op)); @@ -614,27 +576,15 @@ static void __bch2_write(struct closure *cl) op->flags |= BCH_WRITE_DONE; continue_at(cl, bch2_write_index, index_update_wq(op)); err: - if (op->flags & BCH_WRITE_DISCARD_ON_ERROR) { - /* - * If we were writing cached data, not doing the write is fine - * so long as we discard whatever would have been overwritten - - * then it's equivalent to doing the write and immediately - * reclaiming it. - */ - - bch2_write_discard(cl); - } else { - /* - * Right now we can only error here if we went RO - the - * allocation failed, but we already checked for -ENOSPC when we - * got our reservation. - * - * XXX capacity might have changed, but we don't check for that - * yet: - */ - op->error = ret; - } - + /* + * Right now we can only error here if we went RO - the + * allocation failed, but we already checked for -ENOSPC when we + * got our reservation. + * + * XXX capacity might have changed, but we don't check for that + * yet: + */ + op->error = ret; op->flags |= BCH_WRITE_DONE; /* @@ -707,14 +657,13 @@ void bch2_write(struct closure *cl) op->version.lo = atomic64_inc_return(&c->key_version) + 1; - if (!(op->flags & BCH_WRITE_DISCARD)) - bch2_increment_clock(c, bio_sectors(bio), WRITE); + bch2_increment_clock(c, bio_sectors(bio), WRITE); /* Don't call bch2_next_delay() if rate is >= 1 GB/sec */ if (c->foreground_write_ratelimit_enabled && c->foreground_write_pd.rate.rate < (1 << 30) && - !(op->flags & BCH_WRITE_DISCARD) && op->wp->throttle) { + op->wp->throttle) { unsigned long flags; u64 delay; @@ -784,6 +733,9 @@ void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->index_update_fn = bch2_write_index_default; + memset(op->open_buckets, 0, sizeof(op->open_buckets)); + memset(&op->failed, 0, sizeof(op->failed)); + bch2_keylist_init(&op->insert_keys, op->inline_keys, ARRAY_SIZE(op->inline_keys)); @@ -792,53 +744,228 @@ void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, get_random_bytes(&op->version, sizeof(op->version)); } -/* Discard */ - -/* bch_discard - discard a range of keys from start_key to end_key. 
- * @c filesystem - * @start_key pointer to start location - * NOTE: discard starts at bkey_start_offset(start_key) - * @end_key pointer to end location - * NOTE: discard ends at KEY_OFFSET(end_key) - * @version version of discard (0ULL if none) - * - * Returns: - * 0 on success - * <0 on error - * - * XXX: this needs to be refactored with inode_truncate, or more - * appropriately inode_truncate should call this - */ -int bch2_discard(struct bch_fs *c, struct bpos start, - struct bpos end, struct bversion version, - struct disk_reservation *disk_res, - struct extent_insert_hook *hook, - u64 *journal_seq) -{ - return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, start, end, version, - disk_res, hook, journal_seq); -} - /* Cache promotion on read */ -struct cache_promote_op { +struct promote_op { struct closure cl; struct migrate_write write; struct bio_vec bi_inline_vecs[0]; /* must be last */ }; +static void promote_done(struct closure *cl) +{ + struct promote_op *op = + container_of(cl, struct promote_op, cl); + struct bch_fs *c = op->write.op.c; + + percpu_ref_put(&c->writes); + bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio); + kfree(op); +} + +static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) +{ + struct bch_fs *c = rbio->c; + struct closure *cl = &op->cl; + struct bio *bio = &op->write.op.wbio.bio; + + BUG_ON(!rbio->split || !rbio->bounce); + + if (!percpu_ref_tryget(&c->writes)) + return; + + trace_promote(&rbio->bio); + + /* we now own pages: */ + swap(bio->bi_vcnt, rbio->bio.bi_vcnt); + memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, + sizeof(struct bio_vec) * bio->bi_vcnt); + rbio->promote = NULL; + + closure_init(cl, NULL); + closure_call(&op->write.op.cl, bch2_write, c->wq, cl); + closure_return_with_destructor(cl, promote_done); +} + +/* + * XXX: multiple promotes can race with each other, wastefully. Keep a list of + * outstanding promotes? 
+ */ +static struct promote_op *promote_alloc(struct bch_fs *c, + struct bvec_iter iter, + struct bkey_s_c k, + struct extent_pick_ptr *pick, + bool read_full) +{ + struct promote_op *op; + struct bio *bio; + /* + * biovec needs to be big enough to hold decompressed data, if + * bch2_write_extent() has to decompress/recompress it: + */ + unsigned sectors = max_t(unsigned, k.k->size, + crc_uncompressed_size(NULL, &pick->crc)); + unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); + + op = kmalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO); + if (!op) + return NULL; + + bio = &op->write.op.wbio.bio; + bio_init(bio, bio->bi_inline_vecs, pages); + + bio->bi_iter = iter; + + if (pick->crc.compression_type) { + op->write.op.flags |= BCH_WRITE_DATA_COMPRESSED; + op->write.op.crc = pick->crc; + op->write.op.size = k.k->size; + } else if (read_full) { + /* + * Adjust bio to correspond to _live_ portion of @k - + * which might be less than what we're actually reading: + */ + bio_advance(bio, pick->crc.offset << 9); + BUG_ON(bio_sectors(bio) < k.k->size); + bio->bi_iter.bi_size = k.k->size << 9; + } else { + /* + * Set insert pos to correspond to what we're actually + * reading: + */ + op->write.op.pos.offset = iter.bi_sector; + } + bch2_migrate_write_init(c, &op->write, &c->promote_write_point, + k, NULL, + BCH_WRITE_ALLOC_NOWAIT| + BCH_WRITE_CACHED); + op->write.promote = true; + + return op; +} + +/* only promote if we're not reading from the fastest tier: */ +static bool should_promote(struct bch_fs *c, + struct extent_pick_ptr *pick, unsigned flags) +{ + if (!(flags & BCH_READ_MAY_PROMOTE)) + return false; + + if (flags & BCH_READ_IN_RETRY) + return false; + + if (percpu_ref_is_dying(&c->writes)) + return false; + + return c->fastest_tier && + c->fastest_tier < c->tiers + pick->ca->mi.tier; +} + /* Read */ -static int bio_checksum_uncompress(struct bch_fs *c, - struct bch_read_bio *rbio) +#define READ_RETRY_AVOID 1 +#define READ_RETRY 2 +#define READ_ERR 3 + +static inline struct bch_read_bio * +bch2_rbio_parent(struct bch_read_bio *rbio) { + return rbio->split ? 
rbio->parent : rbio; +} + +__always_inline +static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, + struct workqueue_struct *wq) +{ + + if (!wq || rbio->process_context) { + fn(&rbio->work); + } else { + rbio->work.func = fn; + rbio->process_context = true; + queue_work(wq, &rbio->work); + } +} + +static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) +{ + struct bch_read_bio *parent = rbio->parent; + + BUG_ON(!rbio->split); + + if (rbio->promote) + kfree(rbio->promote); + if (rbio->bounce) + bch2_bio_free_pages_pool(rbio->c, &rbio->bio); + bio_put(&rbio->bio); + + return parent; +} + +static void bch2_rbio_done(struct bch_read_bio *rbio) +{ + if (rbio->promote) + kfree(rbio->promote); + rbio->promote = NULL; + + if (rbio->split) + rbio = bch2_rbio_free(rbio); + bio_endio(&rbio->bio); +} + +static void bch2_rbio_retry(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct bvec_iter iter = rbio->bvec_iter; + unsigned flags = rbio->flags; + u64 inode = rbio->inode; + struct bch_devs_mask avoid; + + trace_read_retry(&rbio->bio); + + memset(&avoid, 0, sizeof(avoid)); + + if (rbio->retry == READ_RETRY_AVOID) + __set_bit(rbio->pick.ca->dev_idx, avoid.d); + + if (rbio->split) + rbio = bch2_rbio_free(rbio); + else + rbio->bio.bi_error = 0; + + flags |= BCH_READ_MUST_CLONE; + flags |= BCH_READ_IN_RETRY; + + __bch2_read(c, rbio, iter, inode, &avoid, flags); +} + +static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error) +{ + rbio->retry = retry; + + if (rbio->flags & BCH_READ_IN_RETRY) + return; + + if (retry == READ_ERR) { + bch2_rbio_parent(rbio)->bio.bi_error = error; + bch2_rbio_done(rbio); + } else { + bch2_rbio_punt(rbio, bch2_rbio_retry, rbio->c->wq); + } +} + +static int bch2_rbio_checksum_uncompress(struct bio *dst, + struct bch_read_bio *rbio) +{ + struct bch_fs *c = rbio->c; struct bio *src = &rbio->bio; - struct bio *dst = &bch2_rbio_parent(rbio)->bio; - struct bvec_iter dst_iter = rbio->parent_iter; + struct bvec_iter dst_iter = rbio->bvec_iter; struct nonce nonce = extent_nonce(rbio->version, - rbio->crc.nonce, - crc_uncompressed_size(NULL, &rbio->crc), - rbio->crc.compression_type); + rbio->pick.crc.nonce, + crc_uncompressed_size(NULL, &rbio->pick.crc), + rbio->pick.crc.compression_type); struct bch_csum csum; int ret = 0; @@ -849,130 +976,64 @@ static int bio_checksum_uncompress(struct bch_fs *c, * in order to promote */ if (rbio->bounce) { - src->bi_iter.bi_size = crc_compressed_size(NULL, &rbio->crc) << 9; + src->bi_iter.bi_size = crc_compressed_size(NULL, &rbio->pick.crc) << 9; src->bi_iter.bi_idx = 0; src->bi_iter.bi_bvec_done = 0; } else { - src->bi_iter = rbio->parent_iter; + src->bi_iter = rbio->bvec_iter; } - csum = bch2_checksum_bio(c, rbio->crc.csum_type, nonce, src); - if (bch2_dev_nonfatal_io_err_on(bch2_crc_cmp(rbio->crc.csum, csum), - rbio->ca, + csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type, nonce, src); + if (bch2_dev_io_err_on(bch2_crc_cmp(rbio->pick.crc.csum, csum), + rbio->pick.ca, "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)", - rbio->inode, (u64) rbio->parent_iter.bi_sector << 9, - rbio->crc.csum.hi, rbio->crc.csum.lo, csum.hi, csum.lo, - rbio->crc.csum_type)) + rbio->inode, (u64) rbio->bvec_iter.bi_sector << 9, + rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, + csum.hi, csum.lo, + rbio->pick.crc.csum_type)) ret = -EIO; /* * If there was a checksum error, still 
copy the data back - unless it * was compressed, we don't want to decompress bad data: */ - if (rbio->crc.compression_type != BCH_COMPRESSION_NONE) { + if (rbio->pick.crc.compression_type != BCH_COMPRESSION_NONE) { if (!ret) { - bch2_encrypt_bio(c, rbio->crc.csum_type, nonce, src); + bch2_encrypt_bio(c, rbio->pick.crc.csum_type, nonce, src); ret = bch2_bio_uncompress(c, src, dst, - dst_iter, rbio->crc); + dst_iter, rbio->pick.crc); if (ret) __bcache_io_error(c, "decompression error"); } } else if (rbio->bounce) { - bio_advance(src, rbio->crc.offset << 9); + bio_advance(src, rbio->pick.crc.offset << 9); /* don't need to decrypt the entire bio: */ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); src->bi_iter.bi_size = dst_iter.bi_size; - nonce = nonce_add(nonce, rbio->crc.offset << 9); + nonce = nonce_add(nonce, rbio->pick.crc.offset << 9); - bch2_encrypt_bio(c, rbio->crc.csum_type, + bch2_encrypt_bio(c, rbio->pick.crc.csum_type, nonce, src); bio_copy_data_iter(dst, &dst_iter, src, &src->bi_iter); } else { - bch2_encrypt_bio(c, rbio->crc.csum_type, nonce, src); + bch2_encrypt_bio(c, rbio->pick.crc.csum_type, nonce, src); } return ret; } -static void bch2_rbio_free(struct bch_read_bio *rbio) -{ - struct bch_fs *c = rbio->c; - struct bio *bio = &rbio->bio; - - BUG_ON(rbio->ca); - BUG_ON(!rbio->split); - - if (rbio->promote) - kfree(rbio->promote); - if (rbio->bounce) - bch2_bio_free_pages_pool(c, bio); - - bio_put(bio); -} - -static void bch2_rbio_done(struct bch_read_bio *rbio) -{ - struct bio *orig = &bch2_rbio_parent(rbio)->bio; - - percpu_ref_put(&rbio->ca->io_ref); - rbio->ca = NULL; - - if (rbio->split) { - if (rbio->bio.bi_error) - orig->bi_error = rbio->bio.bi_error; - - bio_endio(orig); - bch2_rbio_free(rbio); - } else { - if (rbio->promote) - kfree(rbio->promote); - - orig->bi_end_io = rbio->orig_bi_end_io; - bio_endio_nodec(orig); - } -} - -static void bch2_rbio_error(struct bch_read_bio *rbio, int error) -{ - bch2_rbio_parent(rbio)->bio.bi_error = error; - bch2_rbio_done(rbio); -} - -static void bch2_rbio_retry(struct bch_fs *c, struct bch_read_bio *rbio) -{ - unsigned long flags; - - percpu_ref_put(&rbio->ca->io_ref); - rbio->ca = NULL; - - spin_lock_irqsave(&c->read_retry_lock, flags); - bio_list_add(&c->read_retry_list, &rbio->bio); - spin_unlock_irqrestore(&c->read_retry_lock, flags); - queue_work(c->wq, &c->read_retry_work); -} - -static void cache_promote_done(struct closure *cl) -{ - struct cache_promote_op *op = - container_of(cl, struct cache_promote_op, cl); - - bch2_bio_free_pages_pool(op->write.op.c, &op->write.op.wbio.bio); - kfree(op); -} - /* Inner part that may run in process context */ static void __bch2_read_endio(struct work_struct *work) { struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; int ret; - ret = bio_checksum_uncompress(c, rbio); + ret = bch2_rbio_checksum_uncompress(&bch2_rbio_parent(rbio)->bio, rbio); if (ret) { /* * Checksum error: if the bio wasn't bounced, we may have been @@ -980,34 +1041,19 @@ static void __bch2_read_endio(struct work_struct *work) * scribble over) - retry the read, bouncing it this time: */ if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { - rbio->flags |= BCH_READ_FORCE_BOUNCE; - bch2_rbio_retry(c, rbio); + rbio->flags |= BCH_READ_MUST_BOUNCE; + bch2_rbio_error(rbio, READ_RETRY, ret); } else { - bch2_rbio_error(rbio, -EIO); + bch2_rbio_error(rbio, READ_RETRY_AVOID, ret); } return; } - if (rbio->promote) { - struct cache_promote_op *promote = rbio->promote; - 
struct closure *cl = &promote->cl; - - BUG_ON(!rbio->split || !rbio->bounce); - - trace_promote(&rbio->bio); - - /* we now own pages: */ - swap(promote->write.op.wbio.bio.bi_vcnt, rbio->bio.bi_vcnt); - rbio->promote = NULL; - - bch2_rbio_done(rbio); + if (rbio->promote) + promote_start(rbio->promote, rbio); - closure_init(cl, &c->cl); - closure_call(&promote->write.op.cl, bch2_write, c->wq, cl); - closure_return_with_destructor(cl, cache_promote_done); - } else { + if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) bch2_rbio_done(rbio); - } } static void bch2_read_endio(struct bio *bio) @@ -1015,90 +1061,55 @@ static void bch2_read_endio(struct bio *bio) struct bch_read_bio *rbio = container_of(bio, struct bch_read_bio, bio); struct bch_fs *c = rbio->c; + struct workqueue_struct *wq = NULL; + + percpu_ref_put(&rbio->pick.ca->io_ref); - if (bch2_dev_nonfatal_io_err_on(bio->bi_error, rbio->ca, "data read")) { - /* XXX: retry IO errors when we have another replica */ - bch2_rbio_error(rbio, bio->bi_error); + if (!rbio->split) + rbio->bio.bi_end_io = rbio->end_io; + + if (bch2_dev_io_err_on(bio->bi_error, rbio->pick.ca, "data read")) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_error); return; } - if (rbio->ptr.cached && + if (rbio->pick.ptr.cached && (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || - ptr_stale(rbio->ca, &rbio->ptr))) { + ptr_stale(rbio->pick.ca, &rbio->pick.ptr))) { atomic_long_inc(&c->read_realloc_races); if (rbio->flags & BCH_READ_RETRY_IF_STALE) - bch2_rbio_retry(c, rbio); + bch2_rbio_error(rbio, READ_RETRY, -EINTR); else - bch2_rbio_error(rbio, -EINTR); + bch2_rbio_error(rbio, READ_ERR, -EINTR); return; } - if (rbio->crc.compression_type || - bch2_csum_type_is_encryption(rbio->crc.csum_type)) - queue_work(system_unbound_wq, &rbio->work); - else if (rbio->crc.csum_type) - queue_work(system_highpri_wq, &rbio->work); - else - __bch2_read_endio(&rbio->work); -} - -static bool should_promote(struct bch_fs *c, - struct extent_pick_ptr *pick, unsigned flags) -{ - if (!(flags & BCH_READ_PROMOTE)) - return false; - - if (percpu_ref_is_dying(&c->writes)) - return false; + if (rbio->pick.crc.compression_type || + bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) + wq = system_unbound_wq; + else if (rbio->pick.crc.csum_type) + wq = system_highpri_wq; - return c->fastest_tier && - c->fastest_tier < c->tiers + pick->ca->mi.tier; + bch2_rbio_punt(rbio, __bch2_read_endio, wq); } -void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig, - struct bvec_iter iter, struct bkey_s_c k, - struct extent_pick_ptr *pick, unsigned flags) +int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, + struct bvec_iter iter, struct bkey_s_c k, + struct extent_pick_ptr *pick, unsigned flags) { struct bch_read_bio *rbio; - struct cache_promote_op *promote_op = NULL; + struct promote_op *promote_op = NULL; unsigned skip = iter.bi_sector - bkey_start_offset(k.k); bool bounce = false, split, read_full = false; + int ret = 0; bch2_increment_clock(c, bio_sectors(&orig->bio), READ); + PTR_BUCKET(pick->ca, &pick->ptr)->prio[READ] = c->prio_clock[READ].hand; EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector || k.k->p.offset < bvec_iter_end_sector(iter)); - /* only promote if we're not reading from the fastest tier: */ - - /* - * XXX: multiple promotes can race with each other, wastefully. Keep a - * list of outstanding promotes? 
- */ - if (should_promote(c, pick, flags)) { - /* - * biovec needs to be big enough to hold decompressed data, if - * the bch2_write_extent() has to decompress/recompress it: - */ - unsigned sectors = - max_t(unsigned, k.k->size, - crc_uncompressed_size(NULL, &pick->crc)); - unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); - - promote_op = kmalloc(sizeof(*promote_op) + - sizeof(struct bio_vec) * pages, GFP_NOIO); - if (promote_op) { - struct bio *promote_bio = &promote_op->write.op.wbio.bio; - - bio_init(promote_bio, - promote_bio->bi_inline_vecs, - pages); - bounce = true; - /* could also set read_full */ - } - } - /* * note: if compression_type and crc_type both == none, then * compressed/uncompressed size is zero @@ -1108,25 +1119,30 @@ void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig, (bvec_iter_sectors(iter) != crc_uncompressed_size(NULL, &pick->crc) || (bch2_csum_type_is_encryption(pick->crc.csum_type) && (flags & BCH_READ_USER_MAPPED)) || - (flags & BCH_READ_FORCE_BOUNCE)))) { + (flags & BCH_READ_MUST_BOUNCE)))) { read_full = true; bounce = true; } + if (should_promote(c, pick, flags)) + promote_op = promote_alloc(c, iter, k, pick, read_full); + + /* could also set read_full */ + if (promote_op) + bounce = true; + if (bounce) { unsigned sectors = read_full ? (crc_compressed_size(NULL, &pick->crc) ?: k.k->size) : bvec_iter_sectors(iter); - rbio = container_of(bio_alloc_bioset(GFP_NOIO, + rbio = rbio_init(bio_alloc_bioset(GFP_NOIO, DIV_ROUND_UP(sectors, PAGE_SECTORS), - &c->bio_read_split), - struct bch_read_bio, bio); + &c->bio_read_split)); bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); split = true; - } else if (!(flags & BCH_READ_MAY_REUSE_BIO) || - !(flags & BCH_READ_IS_LAST)) { + } else if (flags & BCH_READ_MUST_CLONE) { /* * Have to clone if there were any splits, due to error * reporting issues (if a split errored, and retrying didn't @@ -1135,9 +1151,8 @@ void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig, * from the whole bio, in which case we don't want to retry and * lose the error) */ - rbio = container_of(bio_clone_fast(&orig->bio, - GFP_NOIO, &c->bio_read_split), - struct bch_read_bio, bio); + rbio = rbio_init(bio_clone_fast(&orig->bio, + GFP_NOIO, &c->bio_read_split)); rbio->bio.bi_iter = iter; split = true; } else { @@ -1147,80 +1162,39 @@ void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig, BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); } - if (!(flags & BCH_READ_IS_LAST)) - __bio_inc_remaining(&orig->bio); + rbio->c = c; if (split) rbio->parent = orig; else - rbio->orig_bi_end_io = orig->bio.bi_end_io; - rbio->parent_iter = iter; + rbio->end_io = orig->bio.bi_end_io; + rbio->bvec_iter = iter; rbio->flags = flags; rbio->bounce = bounce; rbio->split = split; - rbio->c = c; - rbio->ca = pick->ca; - rbio->ptr = pick->ptr; - rbio->crc = pick->crc; + rbio->process_context = false; + rbio->retry = 0; + rbio->pick = *pick; /* * crc.compressed_size will be 0 if there wasn't any checksum * information, also we need to stash the original size of the bio if we * bounced (which isn't necessarily the original key size, if we bounced * only for promoting) */ - rbio->crc._compressed_size = bio_sectors(&rbio->bio) - 1; + rbio->pick.crc._compressed_size = bio_sectors(&rbio->bio) - 1; rbio->version = k.k->version; rbio->promote = promote_op; rbio->inode = k.k->p.inode; - INIT_WORK(&rbio->work, __bch2_read_endio); + INIT_WORK(&rbio->work, NULL); rbio->bio.bi_bdev = pick->ca->disk_sb.bdev; rbio->bio.bi_opf = 
orig->bio.bi_opf; rbio->bio.bi_iter.bi_sector = pick->ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; - if (promote_op) { - struct bio *promote_bio = &promote_op->write.op.wbio.bio; - - promote_bio->bi_iter = rbio->bio.bi_iter; - memcpy(promote_bio->bi_io_vec, rbio->bio.bi_io_vec, - sizeof(struct bio_vec) * rbio->bio.bi_vcnt); - - bch2_migrate_write_init(c, &promote_op->write, - &c->promote_write_point, - k, NULL, - BCH_WRITE_ALLOC_NOWAIT| - BCH_WRITE_CACHED); - promote_op->write.promote = true; - - if (rbio->crc.compression_type) { - promote_op->write.op.flags |= BCH_WRITE_DATA_COMPRESSED; - promote_op->write.op.crc = rbio->crc; - promote_op->write.op.size = k.k->size; - } else if (read_full) { - /* - * Adjust bio to correspond to _live_ portion of @k - - * which might be less than what we're actually reading: - */ - bio_advance(promote_bio, rbio->crc.offset << 9); - BUG_ON(bio_sectors(promote_bio) < k.k->size); - promote_bio->bi_iter.bi_size = k.k->size << 9; - } else { - /* - * Set insert pos to correspond to what we're actually - * reading: - */ - promote_op->write.op.pos.offset = iter.bi_sector; - } - - promote_bio->bi_iter.bi_sector = - promote_op->write.op.pos.offset; - } - - /* _after_ promete stuff has looked at rbio->crc.offset */ if (read_full) - rbio->crc.offset += skip; + rbio->pick.crc.offset += skip; else rbio->bio.bi_iter.bi_sector += skip; @@ -1229,28 +1203,36 @@ void bch2_read_extent_iter(struct bch_fs *c, struct bch_read_bio *orig, if (bounce) trace_read_bounce(&rbio->bio); - if (!(flags & BCH_READ_IS_LAST)) - trace_read_split(&rbio->bio); + if (likely(!(flags & BCH_READ_IN_RETRY))) { + submit_bio(&rbio->bio); + } else { + submit_bio_wait(&rbio->bio); + + rbio->process_context = true; + bch2_read_endio(&rbio->bio); - generic_make_request(&rbio->bio); + ret = rbio->retry; + if (!ret) + bch2_rbio_done(rbio); + } + + return ret; } -static void bch2_read_iter(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, u64 inode, - unsigned flags) +void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, u64 inode, + struct bch_devs_mask *avoid, unsigned flags) { - struct bio *bio = &rbio->bio; struct btree_iter iter; struct bkey_s_c k; int ret; - +retry: for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(inode, bvec_iter.bi_sector), BTREE_ITER_WITH_HOLES, k) { BKEY_PADDED(k) tmp; struct extent_pick_ptr pick; - unsigned bytes, sectors; - bool is_last; + struct bvec_iter fragment; /* * Unlock the iterator while the btree node's lock is still in @@ -1260,43 +1242,47 @@ static void bch2_read_iter(struct bch_fs *c, struct bch_read_bio *rbio, k = bkey_i_to_s_c(&tmp.k); bch2_btree_iter_unlock(&iter); - bch2_extent_pick_ptr(c, k, &pick); + bch2_extent_pick_ptr(c, k, avoid, &pick); if (IS_ERR(pick.ca)) { - bcache_io_error(c, bio, "no device to read from"); - bio_endio(bio); + bcache_io_error(c, &rbio->bio, "no device to read from"); + bio_endio(&rbio->bio); return; } - sectors = min_t(u64, k.k->p.offset, - bvec_iter_end_sector(bvec_iter)) - - bvec_iter.bi_sector; - bytes = sectors << 9; - is_last = bytes == bvec_iter.bi_size; - swap(bvec_iter.bi_size, bytes); - - if (is_last) - flags |= BCH_READ_IS_LAST; + fragment = bvec_iter; + fragment.bi_size = (min_t(u64, k.k->p.offset, + bvec_iter_end_sector(bvec_iter)) - + bvec_iter.bi_sector) << 9; if (pick.ca) { - PTR_BUCKET(pick.ca, &pick.ptr)->prio[READ] = - c->prio_clock[READ].hand; - - bch2_read_extent_iter(c, rbio, bvec_iter, - k, &pick, flags); + if (fragment.bi_size != 
bvec_iter.bi_size) { + bio_inc_remaining(&rbio->bio); + flags |= BCH_READ_MUST_CLONE; + trace_read_split(&rbio->bio); + } - flags &= ~BCH_READ_MAY_REUSE_BIO; + ret = __bch2_read_extent(c, rbio, fragment, + k, &pick, flags); + switch (ret) { + case READ_RETRY_AVOID: + __set_bit(pick.ca->dev_idx, avoid->d); + case READ_RETRY: + goto retry; + case READ_ERR: + bio_endio(&rbio->bio); + return; + }; } else { - zero_fill_bio_iter(bio, bvec_iter); + zero_fill_bio_iter(&rbio->bio, fragment); - if (is_last) - bio_endio(bio); + if (fragment.bi_size == bvec_iter.bi_size) + bio_endio(&rbio->bio); } - if (is_last) + if (fragment.bi_size == bvec_iter.bi_size) return; - swap(bvec_iter.bi_size, bytes); - bio_advance_iter(bio, &bvec_iter, bytes); + bio_advance_iter(&rbio->bio, &bvec_iter, fragment.bi_size); } /* @@ -1305,55 +1291,6 @@ static void bch2_read_iter(struct bch_fs *c, struct bch_read_bio *rbio, */ ret = bch2_btree_iter_unlock(&iter); BUG_ON(!ret); - bcache_io_error(c, bio, "btree IO error %i", ret); - bio_endio(bio); -} - -void bch2_read(struct bch_fs *c, struct bch_read_bio *bio, u64 inode) -{ - bch2_read_iter(c, bio, bio->bio.bi_iter, inode, - BCH_READ_RETRY_IF_STALE| - BCH_READ_PROMOTE| - BCH_READ_MAY_REUSE_BIO| - BCH_READ_USER_MAPPED); -} - -/** - * bch_read_retry - re-submit a bio originally from bch2_read() - */ -static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio) -{ - struct bch_read_bio *parent = bch2_rbio_parent(rbio); - struct bvec_iter iter = rbio->parent_iter; - unsigned flags = rbio->flags; - u64 inode = rbio->inode; - - trace_read_retry(&rbio->bio); - - if (rbio->split) - bch2_rbio_free(rbio); - else - rbio->bio.bi_end_io = rbio->orig_bi_end_io; - - bch2_read_iter(c, parent, iter, inode, flags); -} - -void bch2_read_retry_work(struct work_struct *work) -{ - struct bch_fs *c = container_of(work, struct bch_fs, - read_retry_work); - struct bch_read_bio *rbio; - struct bio *bio; - - while (1) { - spin_lock_irq(&c->read_retry_lock); - bio = bio_list_pop(&c->read_retry_list); - spin_unlock_irq(&c->read_retry_lock); - - if (!bio) - break; - - rbio = container_of(bio, struct bch_read_bio, bio); - bch2_read_retry(c, rbio); - } + bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); + bio_endio(&rbio->bio); } diff --git a/libbcachefs/io.h b/libbcachefs/io.h index 619bf56..1aa0bfa 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -13,18 +13,20 @@ void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); +void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, + const struct bkey_i *); + enum bch_write_flags { BCH_WRITE_ALLOC_NOWAIT = (1 << 0), - BCH_WRITE_DISCARD = (1 << 1), - BCH_WRITE_CACHED = (1 << 2), - BCH_WRITE_FLUSH = (1 << 3), - BCH_WRITE_DISCARD_ON_ERROR = (1 << 4), - BCH_WRITE_DATA_COMPRESSED = (1 << 5), + BCH_WRITE_CACHED = (1 << 1), + BCH_WRITE_FLUSH = (1 << 2), + BCH_WRITE_DATA_COMPRESSED = (1 << 3), /* Internal: */ - BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 6), - BCH_WRITE_DONE = (1 << 7), - BCH_WRITE_LOOPED = (1 << 8), + BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 4), + BCH_WRITE_DONE = (1 << 5), + BCH_WRITE_LOOPED = (1 << 6), + __BCH_WRITE_KEYLIST_LOCKED = 8, }; static inline u64 *op_journal_seq(struct bch_write_op *op) @@ -53,43 +55,54 @@ static inline struct bch_write_bio *wbio_init(struct bio *bio) return wbio; } -struct cache_promote_op; +void bch2_wake_delayed_writes(unsigned long data); +struct bch_devs_mask; +struct cache_promote_op; struct extent_pick_ptr; -void 
bch2_read_extent_iter(struct bch_fs *, struct bch_read_bio *, - struct bvec_iter, struct bkey_s_c k, - struct extent_pick_ptr *, unsigned); +int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, + struct bkey_s_c k, struct extent_pick_ptr *, unsigned); +void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, + u64, struct bch_devs_mask *, unsigned); + +enum bch_read_flags { + BCH_READ_RETRY_IF_STALE = 1 << 0, + BCH_READ_MAY_PROMOTE = 1 << 1, + BCH_READ_USER_MAPPED = 1 << 2, + + /* internal: */ + BCH_READ_MUST_BOUNCE = 1 << 3, + BCH_READ_MUST_CLONE = 1 << 4, + BCH_READ_IN_RETRY = 1 << 5, +}; static inline void bch2_read_extent(struct bch_fs *c, - struct bch_read_bio *orig, + struct bch_read_bio *rbio, struct bkey_s_c k, struct extent_pick_ptr *pick, unsigned flags) { - bch2_read_extent_iter(c, orig, orig->bio.bi_iter, - k, pick, flags); + rbio->_state = 0; + __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, pick, flags); } -enum bch_read_flags { - BCH_READ_FORCE_BOUNCE = 1 << 0, - BCH_READ_RETRY_IF_STALE = 1 << 1, - BCH_READ_PROMOTE = 1 << 2, - BCH_READ_IS_LAST = 1 << 3, - BCH_READ_MAY_REUSE_BIO = 1 << 4, - BCH_READ_USER_MAPPED = 1 << 5, -}; - -void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); - -void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, - const struct bkey_i *); +static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + u64 inode) +{ + rbio->_state = 0; + __bch2_read(c, rbio, rbio->bio.bi_iter, inode, NULL, + BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE| + BCH_READ_USER_MAPPED); +} -int bch2_discard(struct bch_fs *, struct bpos, struct bpos, - struct bversion, struct disk_reservation *, - struct extent_insert_hook *, u64 *); +static inline struct bch_read_bio *rbio_init(struct bio *bio) +{ + struct bch_read_bio *rbio = to_rbio(bio); -void bch2_read_retry_work(struct work_struct *); -void bch2_wake_delayed_writes(unsigned long data); + rbio->_state = 0; + return rbio; +} #endif /* _BCACHE_IO_H */ diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h index 3b73bcf..9842019 100644 --- a/libbcachefs/io_types.h +++ b/libbcachefs/io_types.h @@ -4,11 +4,20 @@ #include "btree_types.h" #include "buckets_types.h" #include "keylist_types.h" +#include "super_types.h" #include #include +struct extent_pick_ptr { + struct bch_extent_crc128 crc; + struct bch_extent_ptr ptr; + struct bch_dev *ca; +}; + struct bch_read_bio { + struct bch_fs *c; + /* * Reads will often have to be split, and if the extent being read from * was checksummed or compressed we'll also have to allocate bounce @@ -19,33 +28,37 @@ struct bch_read_bio { */ union { struct bch_read_bio *parent; - bio_end_io_t *orig_bi_end_io; + bio_end_io_t *end_io; }; /* - * Saved copy of parent->bi_iter, from submission time - allows us to + * Saved copy of bio->bi_iter, from submission time - allows us to * resubmit on IO error, and also to copy data back to the original bio * when we're bouncing: */ - struct bvec_iter parent_iter; + struct bvec_iter bvec_iter; unsigned submit_time_us; - u16 flags; + u8 flags; + union { + struct { u8 bounce:1, - split:1; + split:1, + process_context:1, + retry:2; + }; + u8 _state; + }; - struct bch_fs *c; - struct bch_dev *ca; - struct bch_extent_ptr ptr; - struct bch_extent_crc128 crc; + struct extent_pick_ptr pick; struct bversion version; - struct cache_promote_op *promote; + struct promote_op *promote; /* * If we have to retry the read (IO error, checksum failure, read stale * data (raced with 
allocator), we retry the portion of the parent bio - * that failed (i.e. this bio's portion, parent_iter). + * that failed (i.e. this bio's portion, bvec_iter). * * But we need to stash the inode somewhere: */ @@ -56,12 +69,6 @@ struct bch_read_bio { struct bio bio; }; -static inline struct bch_read_bio * -bch2_rbio_parent(struct bch_read_bio *rbio) -{ - return rbio->split ? rbio->parent : rbio; -} - struct bch_write_bio { struct bch_fs *c; struct bch_dev *ca; @@ -132,6 +139,8 @@ struct bch_write_op { int (*index_update_fn)(struct bch_write_op *); + struct bch_devs_mask failed; + struct keylist insert_keys; u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index bf8c152..6dc14ff 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -10,6 +10,7 @@ #include "buckets.h" #include "btree_gc.h" #include "btree_update.h" +#include "btree_update_interior.h" #include "btree_io.h" #include "checksum.h" #include "debug.h" @@ -150,7 +151,7 @@ static void journal_seq_blacklist_flush(struct journal *j, } for (i = 0;; i++) { - struct btree_interior_update *as; + struct btree_update *as; struct pending_btree_node_free *d; mutex_lock(&j->blacklist_lock); @@ -673,9 +674,9 @@ reread: sectors_read = min_t(unsigned, ret = submit_bio_wait(bio); - if (bch2_dev_fatal_io_err_on(ret, ca, - "journal read from sector %llu", - offset) || + if (bch2_dev_io_err_on(ret, ca, + "journal read from sector %llu", + offset) || bch2_meta_read_fault("journal")) return -EIO; @@ -1086,7 +1087,6 @@ static bool journal_entry_is_open(struct journal *j) void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *w = journal_prev_buf(j); atomic_dec_bug(&journal_seq_pin(j, w->data->seq)->count); @@ -1096,10 +1096,10 @@ void bch2_journal_buf_put_slowpath(struct journal *j, bool need_write_just_set) __bch2_time_stats_update(j->delay_time, j->need_write_time); #if 0 - closure_call(&j->io, journal_write, NULL, &c->cl); + closure_call(&j->io, journal_write, NULL, NULL); #else /* Shut sparse up: */ - closure_init(&j->io, &c->cl); + closure_init(&j->io, NULL); set_closure_fn(&j->io, journal_write, NULL); journal_write(&j->io); #endif @@ -1734,13 +1734,11 @@ void bch2_journal_pin_drop(struct journal *j, struct journal_entry_pin *pin) { unsigned long flags; - bool wakeup; - - if (!journal_pin_active(pin)) - return; + bool wakeup = false; spin_lock_irqsave(&j->pin_lock, flags); - wakeup = __journal_pin_drop(j, pin); + if (journal_pin_active(pin)) + wakeup = __journal_pin_drop(j, pin); spin_unlock_irqrestore(&j->pin_lock, flags); /* @@ -2099,17 +2097,23 @@ static void journal_write_compact(struct jset *jset) jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); } -static void journal_write_endio(struct bio *bio) +static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) { - struct bch_dev *ca = bio->bi_private; - struct journal *j = &ca->fs->journal; + /* we aren't holding j->lock: */ + unsigned new_size = READ_ONCE(j->buf_size_want); + void *new_buf; - if (bch2_dev_fatal_io_err_on(bio->bi_error, ca, "journal write") || - bch2_meta_write_fault("journal")) - bch2_journal_halt(j); + if (buf->size >= new_size) + return; - closure_put(&j->io); - percpu_ref_put(&ca->io_ref); + new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN); + if (!new_buf) + return; + + memcpy(new_buf, buf->data, buf->size); + kvpfree(buf->data, buf->size); + buf->data = new_buf; + buf->size = 
new_size; } static void journal_write_done(struct closure *cl) @@ -2119,6 +2123,7 @@ static void journal_write_done(struct closure *cl) __bch2_time_stats_update(j->write_time, j->write_start_time); + spin_lock(&j->lock); j->last_seq_ondisk = le64_to_cpu(w->data->last_seq); /* @@ -2130,46 +2135,68 @@ static void journal_write_done(struct closure *cl) */ mod_delayed_work(system_freezable_wq, &j->reclaim_work, 0); + /* also must come before signalling write completion: */ + closure_debug_destroy(cl); + BUG_ON(!j->reservations.prev_buf_unwritten); atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v, &j->reservations.counter); - /* - * XXX: this is racy, we could technically end up doing the wake up - * after the journal_buf struct has been reused for the next write - * (because we're clearing JOURNAL_IO_IN_FLIGHT) and wake up things that - * are waiting on the _next_ write, not this one. - * - * The wake up can't come before, because journal_flush_seq_async() is - * looking at JOURNAL_IO_IN_FLIGHT when it has to wait on a journal - * write that was already in flight. - * - * The right fix is to use a lock here, but using j.lock here means it - * has to be a spin_lock_irqsave() lock which then requires propagating - * the irq()ness to other locks and it's all kinds of nastiness. - */ - closure_wake_up(&w->wait); wake_up(&j->wait); + + if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) + mod_delayed_work(system_freezable_wq, &j->write_work, 0); + spin_unlock(&j->lock); } -static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) +static void journal_write_error(struct closure *cl) { - /* we aren't holding j->lock: */ - unsigned new_size = READ_ONCE(j->buf_size_want); - void *new_buf; + struct journal *j = container_of(cl, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bkey_s_extent e = bkey_i_to_s_extent(&j->key); - if (buf->size >= new_size) - return; + while (j->replicas_failed) { + unsigned idx = __fls(j->replicas_failed); - new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN); - if (!new_buf) - return; + bch2_extent_drop_ptr_idx(e, idx); + j->replicas_failed ^= 1 << idx; + } - memcpy(new_buf, buf->data, buf->size); - kvpfree(buf->data, buf->size); - buf->data = new_buf; - buf->size = new_size; + if (!bch2_extent_nr_ptrs(e.c)) { + bch_err(c, "unable to write journal to sufficient devices"); + goto err; + } + + if (bch2_check_mark_super(c, e.c, BCH_DATA_JOURNAL)) + goto err; + +out: + journal_write_done(cl); + return; +err: + bch2_fatal_error(c); + bch2_journal_halt(j); + goto out; +} + +static void journal_write_endio(struct bio *bio) +{ + struct bch_dev *ca = bio->bi_private; + struct journal *j = &ca->fs->journal; + + if (bch2_dev_io_err_on(bio->bi_error, ca, "journal write") || + bch2_meta_write_fault("journal")) { + /* Was this a flush or an actual journal write? 
*/ + if (ca->journal.ptr_idx != U8_MAX) { + set_bit(ca->journal.ptr_idx, &j->replicas_failed); + set_closure_fn(&j->io, journal_write_error, + system_highpri_wq); + } + } + + closure_put(&j->io); + percpu_ref_put(&ca->io_ref); } static void journal_write(struct closure *cl) @@ -2181,7 +2208,7 @@ static void journal_write(struct closure *cl) struct jset *jset; struct bio *bio; struct bch_extent_ptr *ptr; - unsigned i, sectors, bytes; + unsigned i, sectors, bytes, ptr_idx = 0; journal_buf_realloc(j, w); jset = w->data; @@ -2231,7 +2258,7 @@ static void journal_write(struct closure *cl) bch2_journal_halt(j); bch_err(c, "Unable to allocate journal write"); bch2_fatal_error(c); - closure_return_with_destructor(cl, journal_write_done); + continue_at(cl, journal_write_done, system_highpri_wq); } if (bch2_check_mark_super(c, bkey_i_to_s_c_extent(&j->key), @@ -2255,6 +2282,7 @@ static void journal_write(struct closure *cl) atomic64_add(sectors, &ca->meta_sectors_written); + ca->journal.ptr_idx = ptr_idx++; bio = ca->journal.bio; bio_reset(bio); bio->bi_iter.bi_sector = ptr->offset; @@ -2277,6 +2305,7 @@ static void journal_write(struct closure *cl) !bch2_extent_has_device(bkey_i_to_s_c_extent(&j->key), i)) { percpu_ref_get(&ca->io_ref); + ca->journal.ptr_idx = U8_MAX; bio = ca->journal.bio; bio_reset(bio); bio->bi_bdev = ca->disk_sb.bdev; @@ -2290,10 +2319,10 @@ no_io: extent_for_each_ptr(bkey_i_to_s_extent(&j->key), ptr) ptr->offset += sectors; - closure_return_with_destructor(cl, journal_write_done); + continue_at(cl, journal_write_done, system_highpri_wq); err: bch2_inconsistent_error(c); - closure_return_with_destructor(cl, journal_write_done); + continue_at(cl, journal_write_done, system_highpri_wq); } static void journal_write_work(struct work_struct *work) @@ -2524,18 +2553,61 @@ void bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *pa spin_unlock(&j->lock); } +static int journal_seq_flushed(struct journal *j, u64 seq) +{ + struct journal_buf *buf; + int ret = 1; + + spin_lock(&j->lock); + BUG_ON(seq > atomic64_read(&j->seq)); + + if (seq == atomic64_read(&j->seq)) { + bool set_need_write = false; + + ret = 0; + + buf = journal_cur_buf(j); + + if (!test_and_set_bit(JOURNAL_NEED_WRITE, &j->flags)) { + j->need_write_time = local_clock(); + set_need_write = true; + } + + switch (journal_buf_switch(j, set_need_write)) { + case JOURNAL_ENTRY_ERROR: + ret = -EIO; + break; + case JOURNAL_ENTRY_CLOSED: + /* + * Journal entry hasn't been opened yet, but caller + * claims it has something (seq == j->seq): + */ + BUG(); + case JOURNAL_ENTRY_INUSE: + break; + case JOURNAL_UNLOCKED: + return 0; + } + } else if (seq + 1 == atomic64_read(&j->seq) && + j->reservations.prev_buf_unwritten) { + ret = bch2_journal_error(j); + } + + spin_unlock(&j->lock); + + return ret; +} + int bch2_journal_flush_seq(struct journal *j, u64 seq) { - struct closure cl; u64 start_time = local_clock(); + int ret, ret2; - closure_init_stack(&cl); - bch2_journal_flush_seq_async(j, seq, &cl); - closure_sync(&cl); + ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq))); bch2_time_stats_update(j->flush_seq_time, start_time); - return bch2_journal_error(j); + return ret ?: ret2 < 0 ? 
ret2 : 0; } void bch2_journal_meta_async(struct journal *j, struct closure *parent) diff --git a/libbcachefs/journal_types.h b/libbcachefs/journal_types.h index 3314fc0..e334245 100644 --- a/libbcachefs/journal_types.h +++ b/libbcachefs/journal_types.h @@ -139,6 +139,7 @@ struct journal { struct closure io; struct delayed_work write_work; + unsigned long replicas_failed; /* Sequence number of most recent journal entry (last entry in @pin) */ atomic64_t seq; @@ -227,6 +228,7 @@ struct journal_device { /* Bio for journal reads/writes to this device */ struct bio *bio; + u8 ptr_idx; /* for bch_journal_read_device */ struct closure read; diff --git a/libbcachefs/keylist.c b/libbcachefs/keylist.c index 51dd7ed..bc724e7 100644 --- a/libbcachefs/keylist.c +++ b/libbcachefs/keylist.c @@ -53,3 +53,14 @@ void bch2_keylist_pop_front(struct keylist *l) bkey_next(l->keys), bch_keylist_u64s(l)); } + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_verify_keylist_sorted(struct keylist *l) +{ + struct bkey_i *k; + + for_each_keylist_key(l, k) + BUG_ON(bkey_next(k) != l->top && + bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0); +} +#endif diff --git a/libbcachefs/keylist.h b/libbcachefs/keylist.h index 6662805..87388c9 100644 --- a/libbcachefs/keylist.h +++ b/libbcachefs/keylist.h @@ -59,4 +59,10 @@ static inline struct bkey_i *bch2_keylist_front(struct keylist *l) #define keylist_single(k) \ ((struct keylist) { .keys = k, .top = bkey_next(k) }) +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_verify_keylist_sorted(struct keylist *); +#else +static inline void bch2_verify_keylist_sorted(struct keylist *l) {} +#endif + #endif /* _BCACHE_KEYLIST_H */ diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index 78f6d3c..c665925 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -72,7 +72,7 @@ int bch2_move_data_off_device(struct bch_dev *ca) bch2_replicas_gc_start(c, 1 << BCH_DATA_USER); bch2_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE); - ctxt.avoid = ca; + __set_bit(ca->dev_idx, ctxt.avoid.d); /* * In theory, only one pass should be necessary as we've diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 8ef1a0b..73132a0 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -30,7 +30,7 @@ static struct bch_extent_ptr *bkey_find_ptr(struct bch_fs *c, } static struct bch_extent_ptr *bch2_migrate_matching_ptr(struct migrate_write *m, - struct bkey_s_extent e) + struct bkey_s_extent e) { const struct bch_extent_ptr *ptr; struct bch_extent_ptr *ret; @@ -138,11 +138,11 @@ out: } void bch2_migrate_write_init(struct bch_fs *c, - struct migrate_write *m, - struct write_point *wp, - struct bkey_s_c k, - const struct bch_extent_ptr *move_ptr, - unsigned flags) + struct migrate_write *m, + struct write_point *wp, + struct bkey_s_c k, + const struct bch_extent_ptr *move_ptr, + unsigned flags) { bkey_reassemble(&m->key, k); @@ -178,23 +178,18 @@ static void migrate_bio_init(struct moving_io *io, struct bio *bio, bch2_bio_map(bio, NULL); } -static void moving_io_destructor(struct closure *cl) +static void moving_io_free(struct moving_io *io) { - struct moving_io *io = container_of(cl, struct moving_io, cl); struct moving_context *ctxt = io->ctxt; struct bio_vec *bv; int i; - //if (io->replace.failures) - // trace_copy_collision(q, &io->key.k); - atomic_sub(io->write.key.k.size, &ctxt->sectors_in_flight); wake_up(&ctxt->wait); bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i) if (bv->bv_page) __free_page(bv->bv_page); - kfree(io); } @@ -204,27 +199,26 @@ static void moving_error(struct 
moving_context *ctxt, unsigned flag) //atomic_or(flag, &ctxt->error_flags); } -static void moving_io_after_write(struct closure *cl) +static void moving_write_done(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); - struct moving_context *ctxt = io->ctxt; if (io->write.op.error) - moving_error(ctxt, MOVING_FLAG_WRITE); + moving_error(io->ctxt, MOVING_FLAG_WRITE); - moving_io_destructor(cl); + //if (io->replace.failures) + // trace_copy_collision(q, &io->key.k); + + moving_io_free(io); } -static void write_moving(struct moving_io *io) +static void write_moving(struct closure *cl) { + struct moving_io *io = container_of(cl, struct moving_io, cl); struct bch_write_op *op = &io->write.op; - if (op->error) { - closure_return_with_destructor(&io->cl, moving_io_destructor); - } else { - closure_call(&op->cl, bch2_write, NULL, &io->cl); - closure_return_with_destructor(&io->cl, moving_io_after_write); - } + closure_call(&op->cl, bch2_write, NULL, &io->cl); + closure_return_with_destructor(&io->cl, moving_write_done); } static inline struct moving_io *next_pending_write(struct moving_context *ctxt) @@ -243,10 +237,8 @@ static void read_moving_endio(struct bio *bio) trace_move_read_done(&io->write.key.k); - if (bio->bi_error) { - io->write.op.error = bio->bi_error; + if (bio->bi_error) moving_error(io->ctxt, MOVING_FLAG_READ); - } io->read_completed = true; if (next_pending_write(ctxt)) @@ -255,43 +247,21 @@ static void read_moving_endio(struct bio *bio) closure_put(&ctxt->cl); } -static void __bch2_data_move(struct closure *cl) +int bch2_data_move(struct bch_fs *c, + struct moving_context *ctxt, + struct write_point *wp, + struct bkey_s_c k, + const struct bch_extent_ptr *move_ptr) { - struct moving_io *io = container_of(cl, struct moving_io, cl); - struct bch_fs *c = io->write.op.c; struct extent_pick_ptr pick; + struct moving_io *io; - bch2_extent_pick_ptr_avoiding(c, bkey_i_to_s_c(&io->write.key), - io->ctxt->avoid, &pick); + bch2_extent_pick_ptr(c, k, &ctxt->avoid, &pick); if (IS_ERR_OR_NULL(pick.ca)) - closure_return_with_destructor(cl, moving_io_destructor); - - bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0); - io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&io->write.key.k); - io->rbio.bio.bi_end_io = read_moving_endio; - - /* - * dropped by read_moving_endio() - guards against use after free of - * ctxt when doing wakeup - */ - closure_get(&io->ctxt->cl); - - bch2_read_extent(c, &io->rbio, - bkey_i_to_s_c(&io->write.key), - &pick, BCH_READ_IS_LAST); -} - -int bch2_data_move(struct bch_fs *c, - struct moving_context *ctxt, - struct write_point *wp, - struct bkey_s_c k, - const struct bch_extent_ptr *move_ptr) -{ - struct moving_io *io; + return pick.ca ? 
PTR_ERR(pick.ca) : 0; io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) * - DIV_ROUND_UP(k.k->size, PAGE_SECTORS), - GFP_KERNEL); + DIV_ROUND_UP(k.k->size, PAGE_SECTORS), GFP_KERNEL); if (!io) return -ENOMEM; @@ -299,6 +269,10 @@ int bch2_data_move(struct bch_fs *c, migrate_bio_init(io, &io->rbio.bio, k.k->size); + bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0); + io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); + io->rbio.bio.bi_end_io = read_moving_endio; + if (bio_alloc_pages(&io->rbio.bio, GFP_KERNEL)) { kfree(io); return -ENOMEM; @@ -318,7 +292,12 @@ int bch2_data_move(struct bch_fs *c, atomic_add(k.k->size, &ctxt->sectors_in_flight); list_add_tail(&io->list, &ctxt->reads); - closure_call(&io->cl, __bch2_data_move, NULL, &ctxt->cl); + /* + * dropped by read_moving_endio() - guards against use after free of + * ctxt when doing wakeup + */ + closure_get(&io->ctxt->cl); + bch2_read_extent(c, &io->rbio, k, &pick, 0); return 0; } @@ -328,8 +307,14 @@ static void do_pending_writes(struct moving_context *ctxt) while ((io = next_pending_write(ctxt))) { list_del(&io->list); + + if (io->rbio.bio.bi_error) { + moving_io_free(io); + continue; + } + trace_move_write(&io->write.key.k); - write_moving(io); + closure_call(&io->cl, write_moving, NULL, &ctxt->cl); } } diff --git a/libbcachefs/move.h b/libbcachefs/move.h index 094eac8..ed0b24c 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -46,7 +46,7 @@ struct moving_context { struct bch_ratelimit *rate; /* Try to avoid reading the following device */ - struct bch_dev *avoid; + struct bch_devs_mask avoid; struct list_head reads; diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index 1eb27ae..b1bbf09 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -181,7 +181,7 @@ int bch2_parse_mount_opts(struct bch_opts *opts, char *options) if (val) { id = bch2_opt_lookup(name); if (id < 0) - return -EINVAL; + continue; ret = parse_one_opt(id, val, &v); if (ret < 0) @@ -196,8 +196,9 @@ int bch2_parse_mount_opts(struct bch_opts *opts, char *options) v = 0; } - if (bch2_opt_table[id].type != BCH_OPT_BOOL) - return -EINVAL; + if (id < 0 || + bch2_opt_table[id].type != BCH_OPT_BOOL) + continue; } bch2_opt_set(opts, id, v); diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 0ddfad3..abcc933 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -700,23 +700,18 @@ static void write_super_endio(struct bio *bio) /* XXX: return errors directly */ - bch2_dev_fatal_io_err_on(bio->bi_error, ca, "superblock write"); + if (bch2_dev_io_err_on(bio->bi_error, ca, "superblock write")) + ca->sb_write_error = 1; closure_put(&ca->fs->sb_write); percpu_ref_put(&ca->io_ref); } -static bool write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) +static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) { struct bch_sb *sb = ca->disk_sb.sb; struct bio *bio = ca->disk_sb.bio; - if (idx >= sb->layout.nr_superblocks) - return false; - - if (!percpu_ref_tryget(&ca->io_ref)) - return false; - sb->offset = sb->layout.sb_offset[idx]; SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum); @@ -734,21 +729,23 @@ static bool write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META); bch2_bio_map(bio, sb); + percpu_ref_get(&ca->io_ref); closure_bio_submit(bio, &c->sb_write); - return true; } void bch2_write_super(struct bch_fs *c) { struct closure *cl = &c->sb_write; struct bch_dev *ca; - unsigned i, super_idx = 0; + unsigned i, 
diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c
index 0ddfad3..abcc933 100644
--- a/libbcachefs/super-io.c
+++ b/libbcachefs/super-io.c
@@ -700,23 +700,18 @@ static void write_super_endio(struct bio *bio)
 
 	/* XXX: return errors directly */
 
-	bch2_dev_fatal_io_err_on(bio->bi_error, ca, "superblock write");
+	if (bch2_dev_io_err_on(bio->bi_error, ca, "superblock write"))
+		ca->sb_write_error = 1;
 
 	closure_put(&ca->fs->sb_write);
 	percpu_ref_put(&ca->io_ref);
 }
 
-static bool write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
+static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
 {
 	struct bch_sb *sb = ca->disk_sb.sb;
 	struct bio *bio = ca->disk_sb.bio;
 
-	if (idx >= sb->layout.nr_superblocks)
-		return false;
-
-	if (!percpu_ref_tryget(&ca->io_ref))
-		return false;
-
 	sb->offset = sb->layout.sb_offset[idx];
 
 	SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum);
@@ -734,21 +729,23 @@ static bool write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
 	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
 	bch2_bio_map(bio, sb);
 
+	percpu_ref_get(&ca->io_ref);
 	closure_bio_submit(bio, &c->sb_write);
-	return true;
 }
 
 void bch2_write_super(struct bch_fs *c)
 {
 	struct closure *cl = &c->sb_write;
 	struct bch_dev *ca;
-	unsigned i, super_idx = 0;
+	unsigned i, sb = 0, nr_wrote;
 	const char *err;
-	bool wrote;
+	struct bch_devs_mask sb_written;
+	bool wrote, can_mount_without_written, can_mount_with_written;
 
 	lockdep_assert_held(&c->sb_lock);
 
 	closure_init_stack(cl);
+	memset(&sb_written, 0, sizeof(sb_written));
 
 	le64_add_cpu(&c->disk_sb->seq, 1);
 
@@ -767,15 +764,53 @@ void bch2_write_super(struct bch_fs *c)
 	    test_bit(BCH_FS_ERROR, &c->flags))
 		goto out;
 
+	for_each_online_member(ca, c, i) {
+		__set_bit(ca->dev_idx, sb_written.d);
+		ca->sb_write_error = 0;
+	}
+
 	do {
 		wrote = false;
 		for_each_online_member(ca, c, i)
-			if (write_one_super(c, ca, super_idx))
+			if (sb < ca->disk_sb.sb->layout.nr_superblocks) {
+				write_one_super(c, ca, sb);
 				wrote = true;
-
+			}
 		closure_sync(cl);
-		super_idx++;
+		sb++;
 	} while (wrote);
+
+	for_each_online_member(ca, c, i)
+		if (ca->sb_write_error)
+			__clear_bit(ca->dev_idx, sb_written.d);
+
+	nr_wrote = bitmap_weight(sb_written.d, BCH_SB_MEMBERS_MAX);
+
+	can_mount_with_written =
+		bch2_have_enough_devs(c,
+			__bch2_replicas_status(c, sb_written),
+			BCH_FORCE_IF_DEGRADED);
+
+	for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
+		sb_written.d[i] = ~sb_written.d[i];
+
+	can_mount_without_written =
+		bch2_have_enough_devs(c,
+			__bch2_replicas_status(c, sb_written),
+			BCH_FORCE_IF_DEGRADED);
+
+	/*
+	 * If we would be able to mount _without_ the devices we successfully
+	 * wrote superblocks to, we weren't able to write to enough devices:
+	 *
+	 * Exception: if we can mount without the successes because we haven't
+	 * written anything (new filesystem), we continue if we'd be able to
+	 * mount with the devices we did successfully write to:
	 */
+	bch2_fs_fatal_err_on(!nr_wrote ||
+			     (can_mount_without_written &&
+			      !can_mount_with_written), c,
+		"Unable to write superblock to sufficient devices");
 out:
 	/* Make new options visible after they're persistent: */
 	bch2_sb_update(c);
@@ -1087,7 +1122,7 @@ int bch2_check_mark_super(struct bch_fs *c, struct bkey_s_c_extent e,
 }
 
 struct replicas_status __bch2_replicas_status(struct bch_fs *c,
-					      struct bch_dev *dev_to_offline)
+					      struct bch_devs_mask online_devs)
 {
 	struct bch_replicas_cpu_entry *e;
 	struct bch_replicas_cpu *r;
@@ -1114,8 +1149,7 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c,
 			if (!replicas_test_dev(e, dev))
 				continue;
 
-			if (bch2_dev_is_online(c->devs[dev]) &&
-			    c->devs[dev] != dev_to_offline)
+			if (test_bit(dev, online_devs.d))
 				nr_online++;
 			else
 				nr_offline++;
@@ -1137,7 +1171,32 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c,
 
 struct replicas_status bch2_replicas_status(struct bch_fs *c)
 {
-	return __bch2_replicas_status(c, NULL);
+	return __bch2_replicas_status(c, bch2_online_devs(c));
+}
+
+bool bch2_have_enough_devs(struct bch_fs *c,
+			   struct replicas_status s,
+			   unsigned flags)
+{
+	if ((s.replicas[BCH_DATA_JOURNAL].nr_offline ||
+	     s.replicas[BCH_DATA_BTREE].nr_offline) &&
+	    !(flags & BCH_FORCE_IF_METADATA_DEGRADED))
+		return false;
+
+	if ((!s.replicas[BCH_DATA_JOURNAL].nr_online ||
+	     !s.replicas[BCH_DATA_BTREE].nr_online) &&
+	    !(flags & BCH_FORCE_IF_METADATA_LOST))
+		return false;
+
+	if (s.replicas[BCH_DATA_USER].nr_offline &&
+	    !(flags & BCH_FORCE_IF_DATA_DEGRADED))
+		return false;
+
+	if (!s.replicas[BCH_DATA_USER].nr_online &&
+	    !(flags & BCH_FORCE_IF_DATA_LOST))
+		return false;
+
+	return true;
 }
 
 unsigned bch2_replicas_online(struct bch_fs *c, bool meta)
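
The bch2_write_super() rework above writes every superblock copy it can, records per-device failures via sb_write_error, and only accepts the result if the set of devices that were actually written would itself be mountable while its complement alone would not. A standalone sketch of that acceptance test, with device sets reduced to plain bitmasks and enough_devs() standing in for bch2_have_enough_devs(); the two-replica rule inside it is an arbitrary placeholder.

#include <stdbool.h>
#include <stdio.h>

/* stand-in: "a filesystem restricted to these devices could mount" */
static bool enough_devs(unsigned mask)
{
	return __builtin_popcount(mask) >= 2;	/* placeholder: 2 replicas */
}

static bool sb_write_ok(unsigned online, unsigned write_errors)
{
	unsigned written	= online & ~write_errors;
	bool with_written	= enough_devs(written);
	bool without_written	= enough_devs(online & ~written);

	if (!written)
		return false;
	/*
	 * If we could mount without any of the devices we wrote to,
	 * we did not manage to write to enough devices:
	 */
	return !(without_written && !with_written);
}

int main(void)
{
	printf("%d\n", sb_write_ok(0x0f, 0x00));	/* all writes ok    -> 1 */
	printf("%d\n", sb_write_ok(0x0f, 0x0e));	/* only dev 0 wrote -> 0 */
	return 0;
}

Complementing the written set and re-running the same predicate is the same trick the patch uses: one degradation check answers both the "with" and the "without" question.
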
#include "eytzinger.h" #include "super_types.h" +#include "super.h" #include @@ -134,8 +135,9 @@ struct replicas_status { }; struct replicas_status __bch2_replicas_status(struct bch_fs *, - struct bch_dev *); + struct bch_devs_mask); struct replicas_status bch2_replicas_status(struct bch_fs *); +bool bch2_have_enough_devs(struct bch_fs *, struct replicas_status, unsigned); unsigned bch2_replicas_online(struct bch_fs *, bool); unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); diff --git a/libbcachefs/super.c b/libbcachefs/super.c index c4cb0b2..ad38842 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -11,6 +11,7 @@ #include "btree_cache.h" #include "btree_gc.h" #include "btree_update.h" +#include "btree_update_interior.h" #include "btree_io.h" #include "chardev.h" #include "checksum.h" @@ -416,7 +417,6 @@ static void bch2_fs_exit(struct bch_fs *c) del_timer_sync(&c->foreground_write_wakeup); cancel_delayed_work_sync(&c->pd_controllers_update); cancel_work_sync(&c->read_only_work); - cancel_work_sync(&c->read_retry_work); for (i = 0; i < c->sb.nr_devices; i++) if (c->devs[i]) @@ -519,10 +519,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mutex_init(&c->bio_bounce_pages_lock); mutex_init(&c->zlib_workspace_lock); - bio_list_init(&c->read_retry_list); - spin_lock_init(&c->read_retry_lock); - INIT_WORK(&c->read_retry_work, bch2_read_retry_work); - bio_list_init(&c->btree_write_error_list); spin_lock_init(&c->btree_write_error_lock); INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work); @@ -584,7 +580,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) mempool_init_kmalloc_pool(&c->btree_reserve_pool, 1, sizeof(struct btree_reserve)) || mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, - sizeof(struct btree_interior_update)) || + sizeof(struct btree_update)) || mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || bioset_init(&c->btree_read_bio, 1, offsetof(struct btree_read_bio, bio)) || @@ -1120,7 +1116,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) spin_lock_init(&ca->freelist_lock); bch2_dev_moving_gc_init(ca); - INIT_WORK(&ca->io_error_work, bch2_nonfatal_io_error_work); + INIT_WORK(&ca->io_error_work, bch2_io_error_work); if (bch2_fs_init_fault("dev_alloc")) goto err; @@ -1262,31 +1258,6 @@ static int __bch2_dev_online(struct bch_fs *c, struct bcache_superblock *sb) /* Device management: */ -static bool have_enough_devs(struct bch_fs *c, - struct replicas_status s, - unsigned flags) -{ - if ((s.replicas[BCH_DATA_JOURNAL].nr_offline || - s.replicas[BCH_DATA_BTREE].nr_offline) && - !(flags & BCH_FORCE_IF_METADATA_DEGRADED)) - return false; - - if ((!s.replicas[BCH_DATA_JOURNAL].nr_online || - !s.replicas[BCH_DATA_BTREE].nr_online) && - !(flags & BCH_FORCE_IF_METADATA_LOST)) - return false; - - if (s.replicas[BCH_DATA_USER].nr_offline && - !(flags & BCH_FORCE_IF_DATA_DEGRADED)) - return false; - - if (!s.replicas[BCH_DATA_USER].nr_online && - !(flags & BCH_FORCE_IF_DATA_LOST)) - return false; - - return true; -} - /* * Note: this function is also used by the error paths - when a particular * device sees an error, we call it to determine whether we can just set the @@ -1299,6 +1270,7 @@ static bool have_enough_devs(struct bch_fs *c, bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, enum bch_member_state new_state, int flags) { + struct bch_devs_mask new_online_devs; struct replicas_status s; struct bch_dev *ca2; int i, nr_rw = 0, required; 
diff --git a/libbcachefs/super.h b/libbcachefs/super.h
index e4bb583..54f60c6 100644
--- a/libbcachefs/super.h
+++ b/libbcachefs/super.h
@@ -94,6 +94,18 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
 	__for_each_online_member(ca, c, iter,				\
 		(1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO))
 
+static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
+{
+	struct bch_devs_mask devs;
+	struct bch_dev *ca;
+	unsigned i;
+
+	memset(&devs, 0, sizeof(devs));
+	for_each_online_member(ca, c, i)
+		__set_bit(ca->dev_idx, devs.d);
+	return devs;
+}
+
 struct bch_fs *bch2_bdev_to_fs(struct block_device *);
 struct bch_fs *bch2_uuid_to_fs(uuid_le);
 int bch2_congested(struct bch_fs *, int);
diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h
index 69c747d..9f79d8a 100644
--- a/libbcachefs/super_types.h
+++ b/libbcachefs/super_types.h
@@ -9,4 +9,8 @@ struct bcache_superblock {
 	fmode_t			mode;
 };
 
+struct bch_devs_mask {
+	unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)];
+};
+
 #endif /* _BCACHE_SUPER_TYPES_H */
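
struct bch_devs_mask, added just above, is a fixed-size bitmap indexed by device index; the rest of the patch drives it with __set_bit(), __clear_bit(), test_bit() and bitmap_weight(). A userspace sketch of the same idea; MAX_DEVS and the helper names are illustrative, the kernel versions live in linux/bitops.h and linux/bitmap.h.

#include <stdio.h>

#define MAX_DEVS	64
#define BITS_PER_LONG	(8 * sizeof(unsigned long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

struct devs_mask {
	unsigned long d[BITS_TO_LONGS(MAX_DEVS)];
};

static void mask_set(struct devs_mask *m, unsigned nr)
{
	m->d[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

static void mask_clear(struct devs_mask *m, unsigned nr)
{
	m->d[nr / BITS_PER_LONG] &= ~(1UL << (nr % BITS_PER_LONG));
}

static int mask_test(const struct devs_mask *m, unsigned nr)
{
	return (m->d[nr / BITS_PER_LONG] >> (nr % BITS_PER_LONG)) & 1;
}

static unsigned mask_weight(const struct devs_mask *m)
{
	unsigned w = 0;

	for (unsigned i = 0; i < BITS_TO_LONGS(MAX_DEVS); i++)
		w += __builtin_popcountl(m->d[i]);
	return w;
}

int main(void)
{
	struct devs_mask online = { { 0 } };

	mask_set(&online, 0);
	mask_set(&online, 3);
	mask_clear(&online, 0);

	printf("dev 3 online: %d, devices online: %u\n",
	       mask_test(&online, 3), mask_weight(&online));	/* 1, 1 */
	return 0;
}

Keeping the mask inside a struct rather than passing a bare array lets it be passed and returned by value, which is what bch2_online_devs() and the new __bch2_replicas_status() signature rely on above.
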
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index 1986fdd..9e9ef4c 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -232,24 +232,36 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
 
 	return scnprintf(buf, PAGE_SIZE,
 			 "capacity:\t\t%llu\n"
-			 "compressed:\n"
+			 "1 replicas:\n"
 			 "\tmeta:\t\t%llu\n"
 			 "\tdirty:\t\t%llu\n"
-			 "\tcached:\t\t%llu\n"
-			 "uncompressed:\n"
+			 "\treserved:\t%llu\n"
+			 "2 replicas:\n"
 			 "\tmeta:\t\t%llu\n"
 			 "\tdirty:\t\t%llu\n"
-			 "\tcached:\t\t%llu\n"
-			 "persistent reserved sectors:\t%llu\n"
-			 "online reserved sectors:\t%llu\n",
+			 "\treserved:\t%llu\n"
+			 "3 replicas:\n"
+			 "\tmeta:\t\t%llu\n"
+			 "\tdirty:\t\t%llu\n"
+			 "\treserved:\t%llu\n"
+			 "4 replicas:\n"
+			 "\tmeta:\t\t%llu\n"
+			 "\tdirty:\t\t%llu\n"
+			 "\treserved:\t%llu\n"
+			 "online reserved:\t%llu\n",
 			 c->capacity,
-			 stats.s[S_COMPRESSED][S_META],
-			 stats.s[S_COMPRESSED][S_DIRTY],
-			 stats.s[S_COMPRESSED][S_CACHED],
-			 stats.s[S_UNCOMPRESSED][S_META],
-			 stats.s[S_UNCOMPRESSED][S_DIRTY],
-			 stats.s[S_UNCOMPRESSED][S_CACHED],
-			 stats.persistent_reserved,
+			 stats.s[0].data[S_META],
+			 stats.s[0].data[S_DIRTY],
+			 stats.s[0].persistent_reserved,
+			 stats.s[1].data[S_META],
+			 stats.s[1].data[S_DIRTY],
+			 stats.s[1].persistent_reserved,
+			 stats.s[2].data[S_META],
+			 stats.s[2].data[S_DIRTY],
+			 stats.s[2].persistent_reserved,
+			 stats.s[3].data[S_META],
+			 stats.s[3].data[S_DIRTY],
+			 stats.s[3].persistent_reserved,
 			 stats.online_reserved);
 }
 
@@ -708,8 +720,8 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
 		fifo_used(&ca->free[RESERVE_MOVINGGC]),	ca->free[RESERVE_MOVINGGC].size,
 		fifo_used(&ca->free[RESERVE_NONE]),	ca->free[RESERVE_NONE].size,
 		stats.buckets_alloc,			ca->mi.nbuckets - ca->mi.first_bucket,
-		stats.buckets_meta,			ca->mi.nbuckets - ca->mi.first_bucket,
-		stats.buckets_dirty,			ca->mi.nbuckets - ca->mi.first_bucket,
+		stats.buckets[S_META],			ca->mi.nbuckets - ca->mi.first_bucket,
+		stats.buckets[S_DIRTY],			ca->mi.nbuckets - ca->mi.first_bucket,
 		__dev_buckets_available(ca, stats),	ca->mi.nbuckets - ca->mi.first_bucket,
 		c->freelist_wait.list.first		? "waiting" : "empty",
 		c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE,
@@ -749,11 +761,11 @@ SHOW(bch2_dev)
 
 	sysfs_hprint(dirty_data,	stats.sectors[S_DIRTY] << 9);
 	sysfs_print(dirty_bytes,	stats.sectors[S_DIRTY] << 9);
-	sysfs_print(dirty_buckets,	stats.buckets_dirty);
-	sysfs_hprint(cached_data,	stats.sectors[S_CACHED] << 9);
-	sysfs_print(cached_bytes,	stats.sectors[S_CACHED] << 9);
+	sysfs_print(dirty_buckets,	stats.buckets[S_DIRTY]);
+	sysfs_hprint(cached_data,	stats.sectors_cached << 9);
+	sysfs_print(cached_bytes,	stats.sectors_cached << 9);
 	sysfs_print(cached_buckets,	stats.buckets_cached);
-	sysfs_print(meta_buckets,	stats.buckets_meta);
+	sysfs_print(meta_buckets,	stats.buckets[S_META]);
 	sysfs_print(alloc_buckets,	stats.buckets_alloc);
 	sysfs_print(available_buckets,	dev_buckets_available(ca));
 	sysfs_print(free_buckets,	dev_buckets_free(ca));
-- 
2.39.2