From ea83a3985d28372d56ec7cea6e73907551869f63 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 13 Dec 2017 16:01:18 -0500 Subject: [PATCH] Update bcachefs sources to e57b5958cf bcachefs: fix for building in userspace --- .bcachefs_revision | 2 +- cmd_migrate.c | 2 +- include/trace/events/bcachefs.h | 226 +---- libbcachefs/alloc.c | 921 ++++++++------------ libbcachefs/alloc.h | 87 +- libbcachefs/alloc_types.h | 25 +- libbcachefs/bcachefs.h | 85 +- libbcachefs/bcachefs_format.h | 14 +- libbcachefs/bkey.c | 8 + libbcachefs/bset.h | 24 +- libbcachefs/btree_gc.c | 193 +++-- libbcachefs/btree_gc.h | 10 +- libbcachefs/btree_io.c | 8 +- libbcachefs/btree_io.h | 1 + libbcachefs/btree_locking.h | 2 +- libbcachefs/btree_types.h | 30 +- libbcachefs/btree_update_interior.c | 96 ++- libbcachefs/btree_update_leaf.c | 5 + libbcachefs/buckets.c | 350 ++++---- libbcachefs/buckets.h | 76 +- libbcachefs/buckets_types.h | 15 +- libbcachefs/checksum.c | 168 +++- libbcachefs/checksum.h | 36 +- libbcachefs/compress.c | 135 ++- libbcachefs/compress.h | 10 +- libbcachefs/extents.c | 539 ++++++------ libbcachefs/extents.h | 339 +++----- libbcachefs/extents_types.h | 27 + libbcachefs/eytzinger.h | 86 +- libbcachefs/fs-io.c | 193 ++++- libbcachefs/fs-io.h | 65 +- libbcachefs/fs.c | 26 +- libbcachefs/io.c | 1209 ++++++++++++++++----------- libbcachefs/io.h | 93 ++- libbcachefs/io_types.h | 53 +- libbcachefs/journal.c | 50 +- libbcachefs/keylist.h | 5 +- libbcachefs/migrate.c | 125 +-- libbcachefs/move.c | 466 ++++++----- libbcachefs/move.h | 80 +- libbcachefs/movinggc.c | 303 +++---- libbcachefs/movinggc.h | 28 +- libbcachefs/super-io.c | 5 + libbcachefs/super.c | 33 +- libbcachefs/super.h | 24 + libbcachefs/super_types.h | 29 + libbcachefs/sysfs.c | 80 +- libbcachefs/tier.c | 126 +-- libbcachefs/util.c | 135 ++- libbcachefs/util.h | 24 + 50 files changed, 3405 insertions(+), 3267 deletions(-) create mode 100644 libbcachefs/extents_types.h diff --git a/.bcachefs_revision b/.bcachefs_revision index 04ebc30..7724716 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -192d759a491f50d92c89c2e842639d2307c815a5 +e57b5958cf4e8530d26f7c36a6e1427fb284cc70 diff --git a/cmd_migrate.c b/cmd_migrate.c index d683a5f..58c0bb9 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -265,7 +265,7 @@ static void write_data(struct bch_fs *c, if (ret) die("error reserving space in new filesystem: %s", strerror(-ret)); - bch2_write_op_init(&op, c, res, NULL, 0, + bch2_write_op_init(&op, c, res, NULL, writepoint_hashed(0), POS(dst_inode->bi_inum, dst_offset >> 9), NULL, 0); closure_call(&op.cl, bch2_write, NULL, &cl); closure_sync(&cl); diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index 0c9f3de..bf187f5 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -98,23 +98,6 @@ DECLARE_EVENT_CLASS(bio, (unsigned long long)__entry->sector, __entry->nr_sector) ); -DECLARE_EVENT_CLASS(page_alloc_fail, - TP_PROTO(struct bch_fs *c, u64 size), - TP_ARGS(c, size), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(u64, size ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->size = size; - ), - - TP_printk("%pU size %llu", __entry->uuid, __entry->size) -); - /* io.c: */ DEFINE_EVENT(bio, read_split, @@ -137,34 +120,6 @@ DEFINE_EVENT(bio, promote, TP_ARGS(bio) ); -TRACE_EVENT(write_throttle, - TP_PROTO(struct bch_fs *c, u64 inode, struct bio *bio, u64 delay), - TP_ARGS(c, inode, bio, delay), - - TP_STRUCT__entry( - 
__array(char, uuid, 16 ) - __field(u64, inode ) - __field(sector_t, sector ) - __field(unsigned int, nr_sector ) - __array(char, rwbs, 6 ) - __field(u64, delay ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->inode = inode; - __entry->sector = bio->bi_iter.bi_sector; - __entry->nr_sector = bio->bi_iter.bi_size >> 9; - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); - __entry->delay = delay; - ), - - TP_printk("%pU inode %llu %s %llu + %u delay %llu", - __entry->uuid, __entry->inode, - __entry->rwbs, (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->delay) -); - /* Journal */ DEFINE_EVENT(bch_fs, journal_full, @@ -439,16 +394,6 @@ TRACE_EVENT(alloc_batch, __entry->uuid, __entry->free, __entry->total) ); -DEFINE_EVENT(bch_dev, prio_write_start, - TP_PROTO(struct bch_dev *ca), - TP_ARGS(ca) -); - -DEFINE_EVENT(bch_dev, prio_write_end, - TP_PROTO(struct bch_dev *ca), - TP_ARGS(ca) -); - TRACE_EVENT(invalidate, TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors), TP_ARGS(ca, offset, sectors), @@ -502,174 +447,77 @@ DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, TP_ARGS(ca, reserve) ); -TRACE_EVENT(freelist_empty_fail, - TP_PROTO(struct bch_fs *c, enum alloc_reserve reserve, - struct closure *cl), - TP_ARGS(c, reserve, cl), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(enum alloc_reserve, reserve ) - __field(struct closure *, cl ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->reserve = reserve; - __entry->cl = cl; - ), - - TP_printk("%pU reserve %d cl %p", __entry->uuid, __entry->reserve, - __entry->cl) -); - -DECLARE_EVENT_CLASS(open_bucket_alloc, - TP_PROTO(struct bch_fs *c, struct closure *cl), - TP_ARGS(c, cl), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(struct closure *, cl ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->cl = cl; - ), - - TP_printk("%pU cl %p", - __entry->uuid, __entry->cl) -); - -DEFINE_EVENT(open_bucket_alloc, open_bucket_alloc, - TP_PROTO(struct bch_fs *c, struct closure *cl), - TP_ARGS(c, cl) -); - -DEFINE_EVENT(open_bucket_alloc, open_bucket_alloc_fail, - TP_PROTO(struct bch_fs *c, struct closure *cl), - TP_ARGS(c, cl) +DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail, + TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), + TP_ARGS(ca, reserve) ); /* Moving IO */ -DECLARE_EVENT_CLASS(moving_io, - TP_PROTO(struct bkey *k), - TP_ARGS(k), - - TP_STRUCT__entry( - __field(__u32, inode ) - __field(__u64, offset ) - __field(__u32, sectors ) - ), - - TP_fast_assign( - __entry->inode = k->p.inode; - __entry->offset = k->p.offset; - __entry->sectors = k->size; - ), - - TP_printk("%u:%llu sectors %u", - __entry->inode, __entry->offset, __entry->sectors) -); - -DEFINE_EVENT(moving_io, move_read, - TP_PROTO(struct bkey *k), - TP_ARGS(k) -); - -DEFINE_EVENT(moving_io, move_read_done, - TP_PROTO(struct bkey *k), +DEFINE_EVENT(bkey, move_extent, + TP_PROTO(const struct bkey *k), TP_ARGS(k) ); -DEFINE_EVENT(moving_io, move_write, - TP_PROTO(struct bkey *k), +DEFINE_EVENT(bkey, move_alloc_fail, + TP_PROTO(const struct bkey *k), TP_ARGS(k) ); -DEFINE_EVENT(moving_io, copy_collision, - TP_PROTO(struct bkey *k), +DEFINE_EVENT(bkey, move_race, + TP_PROTO(const struct bkey *k), TP_ARGS(k) ); -/* Copy GC */ - -DEFINE_EVENT(page_alloc_fail, moving_gc_alloc_fail, - TP_PROTO(struct bch_fs *c, u64 size), - TP_ARGS(c, size) -); - -DEFINE_EVENT(bch_dev, moving_gc_start, - TP_PROTO(struct bch_dev *ca), - 
TP_ARGS(ca) -); - -TRACE_EVENT(moving_gc_end, - TP_PROTO(struct bch_dev *ca, u64 sectors_moved, u64 keys_moved, - u64 buckets_moved), - TP_ARGS(ca, sectors_moved, keys_moved, buckets_moved), +TRACE_EVENT(move_data, + TP_PROTO(struct bch_fs *c, u64 sectors_moved, + u64 keys_moved), + TP_ARGS(c, sectors_moved, keys_moved), TP_STRUCT__entry( __array(char, uuid, 16 ) __field(u64, sectors_moved ) __field(u64, keys_moved ) - __field(u64, buckets_moved ) ), TP_fast_assign( - memcpy(__entry->uuid, ca->uuid.b, 16); + memcpy(__entry->uuid, c->sb.user_uuid.b, 16); __entry->sectors_moved = sectors_moved; __entry->keys_moved = keys_moved; - __entry->buckets_moved = buckets_moved; ), - TP_printk("%pU sectors_moved %llu keys_moved %llu buckets_moved %llu", - __entry->uuid, __entry->sectors_moved, __entry->keys_moved, - __entry->buckets_moved) -); - -DEFINE_EVENT(bkey, gc_copy, - TP_PROTO(const struct bkey *k), - TP_ARGS(k) -); - -/* Tiering */ - -DEFINE_EVENT(page_alloc_fail, tiering_alloc_fail, - TP_PROTO(struct bch_fs *c, u64 size), - TP_ARGS(c, size) + TP_printk("%pU sectors_moved %llu keys_moved %llu", + __entry->uuid, __entry->sectors_moved, __entry->keys_moved) ); -DEFINE_EVENT(bch_fs, tiering_start, - TP_PROTO(struct bch_fs *c), - TP_ARGS(c) -); - -TRACE_EVENT(tiering_end, - TP_PROTO(struct bch_fs *c, u64 sectors_moved, - u64 keys_moved), - TP_ARGS(c, sectors_moved, keys_moved), +TRACE_EVENT(copygc, + TP_PROTO(struct bch_dev *ca, + u64 sectors_moved, u64 sectors_not_moved, + u64 buckets_moved, u64 buckets_not_moved), + TP_ARGS(ca, + sectors_moved, sectors_not_moved, + buckets_moved, buckets_not_moved), TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(u64, sectors_moved ) - __field(u64, keys_moved ) + __array(char, uuid, 16 ) + __field(u64, sectors_moved ) + __field(u64, sectors_not_moved ) + __field(u64, buckets_moved ) + __field(u64, buckets_not_moved ) ), TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->sectors_moved = sectors_moved; - __entry->keys_moved = keys_moved; + memcpy(__entry->uuid, ca->uuid.b, 16); + __entry->sectors_moved = sectors_moved; + __entry->sectors_not_moved = sectors_not_moved; + __entry->buckets_moved = buckets_moved; + __entry->buckets_not_moved = buckets_moved; ), - TP_printk("%pU sectors_moved %llu keys_moved %llu", - __entry->uuid, __entry->sectors_moved, __entry->keys_moved) -); - -DEFINE_EVENT(bkey, tiering_copy, - TP_PROTO(const struct bkey *k), - TP_ARGS(k) + TP_printk("%pU sectors moved %llu remain %llu buckets moved %llu remain %llu", + __entry->uuid, + __entry->sectors_moved, __entry->sectors_not_moved, + __entry->buckets_moved, __entry->buckets_not_moved) ); #endif /* _TRACE_BCACHE_H */ diff --git a/libbcachefs/alloc.c b/libbcachefs/alloc.c index dc7348f..d29d871 100644 --- a/libbcachefs/alloc.c +++ b/libbcachefs/alloc.c @@ -56,6 +56,7 @@ #include "bcachefs.h" #include "alloc.h" #include "btree_update.h" +#include "btree_gc.h" #include "buckets.h" #include "checksum.h" #include "clock.h" @@ -76,7 +77,7 @@ #include #include -static void bch2_recalc_min_prio(struct bch_dev *, int); +static void bch2_recalc_min_prio(struct bch_fs *, struct bch_dev *, int); /* Ratelimiting/PD controllers */ @@ -92,8 +93,6 @@ static void pd_controllers_update(struct work_struct *work) u64 faster_tiers_size = 0; u64 faster_tiers_dirty = 0; - u64 fastest_tier_size = 0; - u64 fastest_tier_free = 0; u64 copygc_can_free = 0; rcu_read_lock(); @@ -105,7 +104,7 @@ static void pd_controllers_update(struct work_struct *work) -1); for_each_member_device_rcu(ca, 
c, iter, &c->tiers[i].devs) { - struct bch_dev_usage stats = bch2_dev_usage_read(ca); + struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); u64 size = bucket_to_sector(ca, ca->mi.nbuckets - ca->mi.first_bucket) << 9; @@ -125,18 +124,12 @@ static void pd_controllers_update(struct work_struct *work) fragmented = max(0LL, fragmented); - bch2_pd_controller_update(&ca->moving_gc_pd, + bch2_pd_controller_update(&ca->copygc_pd, free, fragmented, -1); faster_tiers_size += size; faster_tiers_dirty += dirty; - if (!c->fastest_tier || - c->fastest_tier == &c->tiers[i]) { - fastest_tier_size += size; - fastest_tier_free += free; - } - copygc_can_free += fragmented; } } @@ -157,14 +150,6 @@ static void pd_controllers_update(struct work_struct *work) if (c->fastest_tier) copygc_can_free = U64_MAX; - bch2_pd_controller_update(&c->foreground_write_pd, - min(copygc_can_free, - div_u64(fastest_tier_size * - c->foreground_target_percent, - 100)), - fastest_tier_free, - -1); - schedule_delayed_work(&c->pd_controllers_update, c->pd_controllers_update_seconds * HZ); } @@ -295,6 +280,8 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) struct journal_replay *r; struct btree_iter iter; struct bkey_s_c k; + struct bch_dev *ca; + unsigned i; int ret; if (!c->btree_roots[BTREE_ID_ALLOC].b) @@ -318,6 +305,11 @@ int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) bch2_alloc_read_key(c, bkey_i_to_s_c(k)); } + for_each_member_device(ca, c, i) { + bch2_recalc_min_prio(c, ca, READ); + bch2_recalc_min_prio(c, ca, WRITE); + } + return 0; } @@ -436,7 +428,7 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) if (gc_count != c->gc_count) ca->inc_gen_really_needs_gc = 0; - if ((ssize_t) (dev_buckets_available(ca) - + if ((ssize_t) (dev_buckets_available(c, ca) - ca->inc_gen_really_needs_gc) >= (ssize_t) fifo_free(&ca->free_inc)) break; @@ -451,9 +443,10 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) return ret; } -static void verify_not_on_freelist(struct bch_dev *ca, size_t bucket) +static void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, + size_t bucket) { - if (expensive_debug_checks(ca->fs)) { + if (expensive_debug_checks(c)) { size_t iter; long i; unsigned j; @@ -468,9 +461,8 @@ static void verify_not_on_freelist(struct bch_dev *ca, size_t bucket) /* Bucket heap / gen */ -void bch2_recalc_min_prio(struct bch_dev *ca, int rw) +void bch2_recalc_min_prio(struct bch_fs *c, struct bch_dev *ca, int rw) { - struct bch_fs *c = ca->fs; struct prio_clock *clock = &c->prio_clock[rw]; struct bucket *g; u16 max_delta = 1; @@ -478,14 +470,14 @@ void bch2_recalc_min_prio(struct bch_dev *ca, int rw) lockdep_assert_held(&c->bucket_lock); - /* Determine min prio for this particular cache */ + /* Determine min prio for this particular device */ for_each_bucket(g, ca) max_delta = max(max_delta, (u16) (clock->hand - g->prio[rw])); ca->min_prio[rw] = clock->hand - max_delta; /* - * This may possibly increase the min prio for the whole cache, check + * This may possibly increase the min prio for the whole device, check * that as well. 
*/ max_delta = 1; @@ -511,7 +503,7 @@ static void bch2_rescale_prios(struct bch_fs *c, int rw) g->prio[rw] = clock->hand - (clock->hand - g->prio[rw]) / 2; - bch2_recalc_min_prio(ca, rw); + bch2_recalc_min_prio(c, ca, rw); } } @@ -588,20 +580,20 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, struct bucket *g, return can_inc_bucket_gen(ca, g); } -static void bch2_invalidate_one_bucket(struct bch_dev *ca, struct bucket *g) +static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, + struct bucket *g) { - struct bch_fs *c = ca->fs; struct bucket_mark m; - spin_lock(&ca->freelist_lock); - if (!bch2_invalidate_bucket(ca, g, &m)) { - spin_unlock(&ca->freelist_lock); + spin_lock(&c->freelist_lock); + if (!bch2_invalidate_bucket(c, ca, g, &m)) { + spin_unlock(&c->freelist_lock); return; } - verify_not_on_freelist(ca, g - ca->buckets); + verify_not_on_freelist(c, ca, g - ca->buckets); BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets)); - spin_unlock(&ca->freelist_lock); + spin_unlock(&c->freelist_lock); g->prio[READ] = c->prio_clock[READ].hand; g->prio[WRITE] = c->prio_clock[WRITE].hand; @@ -641,9 +633,8 @@ static void bch2_invalidate_one_bucket(struct bch_dev *ca, struct bucket *g) * number wraparound. */ -static unsigned long bucket_sort_key(struct bch_dev *ca, - struct bucket *g, - struct bucket_mark m) +static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, + struct bucket *g, struct bucket_mark m) { /* * Time since last read, scaled to [0, 8) where larger value indicates @@ -651,14 +642,14 @@ static unsigned long bucket_sort_key(struct bch_dev *ca, */ unsigned long hotness = (g->prio[READ] - ca->min_prio[READ]) * 7 / - (ca->fs->prio_clock[READ].hand - ca->min_prio[READ]); + (c->prio_clock[READ].hand - ca->min_prio[READ]); /* How much we want to keep the data in this bucket: */ unsigned long data_wantness = (hotness + 1) * bucket_sectors_used(m); unsigned long needs_journal_commit = - bucket_needs_journal_commit(m, ca->fs->journal.last_seq_ondisk); + bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); return (data_wantness << 9) | (needs_journal_commit << 8) | @@ -672,16 +663,16 @@ static inline int bucket_alloc_cmp(alloc_heap *h, return (l.key > r.key) - (l.key < r.key); } -static void invalidate_buckets_lru(struct bch_dev *ca) +static void invalidate_buckets_lru(struct bch_fs *c, struct bch_dev *ca) { struct alloc_heap_entry e; struct bucket *g; ca->alloc_heap.used = 0; - mutex_lock(&ca->fs->bucket_lock); - bch2_recalc_min_prio(ca, READ); - bch2_recalc_min_prio(ca, WRITE); + mutex_lock(&c->bucket_lock); + bch2_recalc_min_prio(c, ca, READ); + bch2_recalc_min_prio(c, ca, WRITE); /* * Find buckets with lowest read priority, by building a maxheap sorted @@ -696,7 +687,7 @@ static void invalidate_buckets_lru(struct bch_dev *ca) e = (struct alloc_heap_entry) { .bucket = g - ca->buckets, - .key = bucket_sort_key(ca, g, m) + .key = bucket_sort_key(c, ca, g, m) }; heap_add_or_replace(&ca->alloc_heap, e, -bucket_alloc_cmp); @@ -710,12 +701,12 @@ static void invalidate_buckets_lru(struct bch_dev *ca) */ while (!fifo_full(&ca->free_inc) && heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp)) - bch2_invalidate_one_bucket(ca, &ca->buckets[e.bucket]); + bch2_invalidate_one_bucket(c, ca, &ca->buckets[e.bucket]); - mutex_unlock(&ca->fs->bucket_lock); + mutex_unlock(&c->bucket_lock); } -static void invalidate_buckets_fifo(struct bch_dev *ca) +static void invalidate_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) { struct bucket_mark m; struct bucket 
*g; @@ -730,14 +721,14 @@ static void invalidate_buckets_fifo(struct bch_dev *ca) m = READ_ONCE(g->mark); if (bch2_can_invalidate_bucket(ca, g, m)) - bch2_invalidate_one_bucket(ca, g); + bch2_invalidate_one_bucket(c, ca, g); if (++checked >= ca->mi.nbuckets) return; } } -static void invalidate_buckets_random(struct bch_dev *ca) +static void invalidate_buckets_random(struct bch_fs *c, struct bch_dev *ca) { struct bucket_mark m; struct bucket *g; @@ -752,27 +743,27 @@ static void invalidate_buckets_random(struct bch_dev *ca) m = READ_ONCE(g->mark); if (bch2_can_invalidate_bucket(ca, g, m)) - bch2_invalidate_one_bucket(ca, g); + bch2_invalidate_one_bucket(c, ca, g); if (++checked >= ca->mi.nbuckets / 2) return; } } -static void invalidate_buckets(struct bch_dev *ca) +static void invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) { ca->inc_gen_needs_gc = 0; ca->inc_gen_really_needs_gc = 0; switch (ca->mi.replacement) { case CACHE_REPLACEMENT_LRU: - invalidate_buckets_lru(ca); + invalidate_buckets_lru(c, ca); break; case CACHE_REPLACEMENT_FIFO: - invalidate_buckets_fifo(ca); + invalidate_buckets_fifo(c, ca); break; case CACHE_REPLACEMENT_RANDOM: - invalidate_buckets_random(ca); + invalidate_buckets_random(c, ca); break; } } @@ -812,7 +803,8 @@ static int bch2_invalidate_free_inc(struct bch_fs *c, struct bch_dev *ca, * Given an invalidated, ready to use bucket: issue a discard to it if enabled, * then add it to the freelist, waiting until there's room if necessary: */ -static void discard_invalidated_bucket(struct bch_dev *ca, long bucket) +static void discard_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, + long bucket) { if (ca->mi.discard && blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) @@ -830,15 +822,15 @@ static void discard_invalidated_bucket(struct bch_dev *ca, long bucket) * Don't remove from free_inc until after it's added to * freelist, so gc can find it: */ - spin_lock(&ca->freelist_lock); + spin_lock(&c->freelist_lock); for (i = 0; i < RESERVE_NR; i++) if (fifo_push(&ca->free[i], bucket)) { fifo_pop(&ca->free_inc, bucket); - closure_wake_up(&ca->fs->freelist_wait); + closure_wake_up(&c->freelist_wait); pushed = true; break; } - spin_unlock(&ca->freelist_lock); + spin_unlock(&c->freelist_lock); if (pushed) break; @@ -877,7 +869,7 @@ static int bch2_allocator_thread(void *arg) BUG_ON(fifo_empty(&ca->free_inc)); bucket = fifo_peek(&ca->free_inc); - discard_invalidated_bucket(ca, bucket); + discard_invalidated_bucket(c, ca, bucket); if (kthread_should_stop()) return 0; --ca->nr_invalidated; @@ -924,7 +916,7 @@ static int bch2_allocator_thread(void *arg) * another cache tier */ - invalidate_buckets(ca); + invalidate_buckets(c, ca); trace_alloc_batch(ca, fifo_used(&ca->free_inc), ca->free_inc.size); @@ -949,12 +941,12 @@ static int bch2_allocator_thread(void *arg) BUG_ON(ca->free_inc.front); - spin_lock(&ca->freelist_lock); + spin_lock(&c->freelist_lock); sort(ca->free_inc.data, ca->free_inc.back, sizeof(ca->free_inc.data[0]), size_t_cmp, NULL); - spin_unlock(&ca->freelist_lock); + spin_unlock(&c->freelist_lock); /* * free_inc is now full of newly-invalidated buckets: next, @@ -965,6 +957,55 @@ static int bch2_allocator_thread(void *arg) /* Allocation */ +/* + * Open buckets represent a bucket that's currently being allocated from. 
They + * serve two purposes: + * + * - They track buckets that have been partially allocated, allowing for + * sub-bucket sized allocations - they're used by the sector allocator below + * + * - They provide a reference to the buckets they own that mark and sweep GC + * can find, until the new allocation has a pointer to it inserted into the + * btree + * + * When allocating some space with the sector allocator, the allocation comes + * with a reference to an open bucket - the caller is required to put that + * reference _after_ doing the index update that makes its allocation reachable. + */ + +void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) +{ + struct bch_dev *ca = c->devs[ob->ptr.dev]; + + spin_lock(&ob->lock); + bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), false, + gc_pos_alloc(c, ob), 0); + ob->valid = false; + spin_unlock(&ob->lock); + + spin_lock(&c->freelist_lock); + ob->freelist = c->open_buckets_freelist; + c->open_buckets_freelist = ob - c->open_buckets; + c->open_buckets_nr_free++; + spin_unlock(&c->freelist_lock); + + closure_wake_up(&c->open_buckets_wait); +} + +static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) +{ + struct open_bucket *ob; + + BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free); + + ob = c->open_buckets + c->open_buckets_freelist; + c->open_buckets_freelist = ob->freelist; + atomic_set(&ob->pin, 1); + + c->open_buckets_nr_free--; + return ob; +} + /* * XXX: allocation on startup is still sketchy. There is insufficient * synchronization for bch2_bucket_alloc_startup() to work correctly after @@ -994,7 +1035,7 @@ static long bch2_bucket_alloc_startup(struct bch_fs *c, struct bch_dev *ca) for_each_bucket(g, ca) if (!g->mark.touched_this_mount && is_available_bucket(g->mark) && - bch2_mark_alloc_bucket_startup(ca, g)) { + bch2_mark_alloc_bucket_startup(c, ca, g)) { r = g - ca->buckets; set_bit(r, ca->bucket_dirty); break; @@ -1004,69 +1045,105 @@ out: return r; } +static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) +{ + switch (reserve) { + case RESERVE_ALLOC: + return 0; + case RESERVE_BTREE: + return BTREE_NODE_RESERVE / 2; + default: + return BTREE_NODE_RESERVE; + } +} + /** * bch_bucket_alloc - allocate a single bucket from a specific device * * Returns index of bucket on success, 0 on failure * */ -long bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, - enum alloc_reserve reserve) +int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, + enum alloc_reserve reserve, + bool may_alloc_partial, + struct closure *cl) { - size_t r; + struct open_bucket *ob; + long bucket; + + spin_lock(&c->freelist_lock); + if (may_alloc_partial && + ca->open_buckets_partial_nr) { + int ret = ca->open_buckets_partial[--ca->open_buckets_partial_nr]; + c->open_buckets[ret].on_partial_list = false; + spin_unlock(&c->freelist_lock); + return ret; + } + + if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { + if (cl) + closure_wait(&c->open_buckets_wait, cl); + spin_unlock(&c->freelist_lock); + trace_open_bucket_alloc_fail(ca, reserve); + return OPEN_BUCKETS_EMPTY; + } - spin_lock(&ca->freelist_lock); - if (likely(fifo_pop(&ca->free[RESERVE_NONE], r))) + if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket))) goto out; switch (reserve) { case RESERVE_ALLOC: - if (fifo_pop(&ca->free[RESERVE_BTREE], r)) + if (fifo_pop(&ca->free[RESERVE_BTREE], bucket)) goto out; break; case RESERVE_BTREE: if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >= ca->free[RESERVE_BTREE].size && - 
fifo_pop(&ca->free[RESERVE_BTREE], r)) + fifo_pop(&ca->free[RESERVE_BTREE], bucket)) goto out; break; case RESERVE_MOVINGGC: - if (fifo_pop(&ca->free[RESERVE_MOVINGGC], r)) + if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket)) goto out; break; default: break; } - spin_unlock(&ca->freelist_lock); - if (unlikely(!ca->alloc_thread_started) && (reserve == RESERVE_ALLOC) && - (r = bch2_bucket_alloc_startup(c, ca)) >= 0) { - verify_not_on_freelist(ca, r); - goto out2; - } + (bucket = bch2_bucket_alloc_startup(c, ca)) >= 0) + goto out; + + spin_unlock(&c->freelist_lock); trace_bucket_alloc_fail(ca, reserve); - return -1; + return FREELIST_EMPTY; out: - verify_not_on_freelist(ca, r); - spin_unlock(&ca->freelist_lock); + verify_not_on_freelist(c, ca, bucket); + + ob = bch2_open_bucket_alloc(c); + + spin_lock(&ob->lock); + ob->valid = true; + ob->sectors_free = ca->mi.bucket_size; + ob->ptr = (struct bch_extent_ptr) { + .gen = ca->buckets[bucket].mark.gen, + .offset = bucket_to_sector(ca, bucket), + .dev = ca->dev_idx, + }; + spin_unlock(&ob->lock); + + spin_unlock(&c->freelist_lock); bch2_wake_allocator(ca); -out2: - ca->buckets[r].prio[READ] = c->prio_clock[READ].hand; - ca->buckets[r].prio[WRITE] = c->prio_clock[WRITE].hand; + + ca->buckets[bucket].prio[READ] = c->prio_clock[READ].hand; + ca->buckets[bucket].prio[WRITE] = c->prio_clock[WRITE].hand; trace_bucket_alloc(ca, reserve); - return r; + return ob - c->open_buckets; } -enum bucket_alloc_ret { - ALLOC_SUCCESS, - NO_DEVICES, /* -EROFS */ - FREELIST_EMPTY, /* Allocator thread not keeping up */ -}; - struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c, struct write_point *wp, struct bch_devs_mask *devs) @@ -1091,11 +1168,7 @@ struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c, break; } - memmove(&ret.devs[j + 1], - &ret.devs[j], - sizeof(ret.devs[0]) * (ret.nr - j)); - ret.nr++; - ret.devs[j] = i; + array_insert_item(ret.devs, ret.nr, j, i); } return ret; @@ -1112,63 +1185,46 @@ void bch2_wp_rescale(struct bch_fs *c, struct bch_dev *ca, static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp, - struct open_bucket *ob, unsigned nr_replicas, enum alloc_reserve reserve, - struct bch_devs_mask *devs) + struct bch_devs_mask *devs, + struct closure *cl) { enum bucket_alloc_ret ret = NO_DEVICES; struct dev_alloc_list devs_sorted; u64 buckets_free; unsigned i; - BUG_ON(nr_replicas > ARRAY_SIZE(ob->ptrs)); + BUG_ON(nr_replicas > ARRAY_SIZE(wp->ptrs)); - if (ob->nr_ptrs >= nr_replicas) + if (wp->nr_ptrs >= nr_replicas) return ALLOC_SUCCESS; rcu_read_lock(); devs_sorted = bch2_wp_alloc_list(c, wp, devs); - spin_lock(&ob->lock); for (i = 0; i < devs_sorted.nr; i++) { struct bch_dev *ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); - struct open_bucket_ptr ptr; + int ob; if (!ca) continue; - if (wp->type == BCH_DATA_USER && - ca->open_buckets_partial_nr) { - ptr = ca->open_buckets_partial[--ca->open_buckets_partial_nr]; - } else { - long bucket = bch2_bucket_alloc(c, ca, reserve); - if (bucket < 0) { - ret = FREELIST_EMPTY; - continue; - } - - ptr = (struct open_bucket_ptr) { - .ptr.gen = ca->buckets[bucket].mark.gen, - .ptr.offset = bucket_to_sector(ca, bucket), - .ptr.dev = ca->dev_idx, - .sectors_free = ca->mi.bucket_size, - }; + ob = bch2_bucket_alloc(c, ca, reserve, + wp->type == BCH_DATA_USER, cl); + if (ob < 0) { + ret = ob; + if (ret == OPEN_BUCKETS_EMPTY) + break; + continue; } - /* - * open_bucket_add_buckets expects new pointers at the head of - * the list: - */ - BUG_ON(ob->nr_ptrs >= 
ARRAY_SIZE(ob->ptrs)); - memmove(&ob->ptrs[1], - &ob->ptrs[0], - ob->nr_ptrs * sizeof(ob->ptrs[0])); - ob->nr_ptrs++; - ob->ptrs[0] = ptr; - - buckets_free = U64_MAX, dev_buckets_free(ca); + BUG_ON(ob <= 0 || ob > U8_MAX); + BUG_ON(wp->nr_ptrs >= ARRAY_SIZE(wp->ptrs)); + wp->ptrs[wp->nr_ptrs++] = c->open_buckets + ob; + + buckets_free = U64_MAX, dev_buckets_free(c, ca); if (buckets_free) wp->next_alloc[ca->dev_idx] += div64_u64(U64_MAX, buckets_free * @@ -1179,20 +1235,21 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c, __clear_bit(ca->dev_idx, devs->d); - if (ob->nr_ptrs == nr_replicas) { + if (wp->nr_ptrs == nr_replicas) { ret = ALLOC_SUCCESS; break; } } - EBUG_ON(ret != ALLOC_SUCCESS && reserve == RESERVE_MOVINGGC); - spin_unlock(&ob->lock); + EBUG_ON(reserve == RESERVE_MOVINGGC && + ret != ALLOC_SUCCESS && + ret != OPEN_BUCKETS_EMPTY); rcu_read_unlock(); return ret; } static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp, - struct open_bucket *ob, unsigned nr_replicas, + unsigned nr_replicas, enum alloc_reserve reserve, struct bch_devs_mask *devs, struct closure *cl) @@ -1200,8 +1257,8 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp, bool waiting = false; while (1) { - switch (__bch2_bucket_alloc_set(c, wp, ob, nr_replicas, - reserve, devs)) { + switch (__bch2_bucket_alloc_set(c, wp, nr_replicas, + reserve, devs, cl)) { case ALLOC_SUCCESS: if (waiting) closure_wake_up(&c->freelist_wait); @@ -1214,10 +1271,6 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp, return -EROFS; case FREELIST_EMPTY: - if (!cl || waiting) - trace_freelist_empty_fail(c, - reserve, cl); - if (!cl) return -ENOSPC; @@ -1228,226 +1281,89 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp, closure_wait(&c->freelist_wait, cl); waiting = true; break; + case OPEN_BUCKETS_EMPTY: + return cl ? -EAGAIN : -ENOSPC; default: BUG(); } } } -/* Open buckets: */ - -/* - * Open buckets represent one or more buckets (on multiple devices) that are - * currently being allocated from. They serve two purposes: - * - * - They track buckets that have been partially allocated, allowing for - * sub-bucket sized allocations - they're used by the sector allocator below - * - * - They provide a reference to the buckets they own that mark and sweep GC - * can find, until the new allocation has a pointer to it inserted into the - * btree - * - * When allocating some space with the sector allocator, the allocation comes - * with a reference to an open bucket - the caller is required to put that - * reference _after_ doing the index update that makes its allocation reachable. 
- */ +/* Sector allocator */ -void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) +static void writepoint_drop_ptrs(struct bch_fs *c, + struct write_point *wp, + struct bch_devs_mask *devs, + unsigned nr_ptrs_dislike) { - const struct open_bucket_ptr *ptr; - u8 new_ob; + int i; - if (!atomic_dec_and_test(&ob->pin)) + if (!nr_ptrs_dislike) return; - down_read(&c->alloc_gc_lock); - spin_lock(&ob->lock); - - open_bucket_for_each_ptr(ob, ptr) { - struct bch_dev *ca = c->devs[ptr->ptr.dev]; + for (i = wp->nr_ptrs - 1; i >= 0; --i) { + struct open_bucket *ob = wp->ptrs[i]; + struct bch_dev *ca = c->devs[ob->ptr.dev]; - if (ptr->sectors_free) { - /* - * This is a ptr to a bucket that still has free space, - * but we don't want to use it - */ + if (nr_ptrs_dislike && !test_bit(ob->ptr.dev, devs->d)) { BUG_ON(ca->open_buckets_partial_nr >= ARRAY_SIZE(ca->open_buckets_partial)); - spin_lock(&ca->freelist_lock); - ca->open_buckets_partial[ca->open_buckets_partial_nr++] - = *ptr; - spin_unlock(&ca->freelist_lock); - } else { - bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), false); - } - } - ob->nr_ptrs = 0; - - spin_unlock(&ob->lock); - up_read(&c->alloc_gc_lock); - - new_ob = ob->new_ob; - ob->new_ob = 0; - - spin_lock(&c->open_buckets_lock); - ob->freelist = c->open_buckets_freelist; - c->open_buckets_freelist = ob - c->open_buckets; - c->open_buckets_nr_free++; - spin_unlock(&c->open_buckets_lock); - - closure_wake_up(&c->open_buckets_wait); - - if (new_ob) - bch2_open_bucket_put(c, c->open_buckets + new_ob); -} - -static struct open_bucket *bch2_open_bucket_get(struct bch_fs *c, - unsigned nr_reserved, - struct closure *cl) -{ - struct open_bucket *ret; - - spin_lock(&c->open_buckets_lock); - - if (c->open_buckets_nr_free > nr_reserved) { - BUG_ON(!c->open_buckets_freelist); - - ret = c->open_buckets + c->open_buckets_freelist; - c->open_buckets_freelist = ret->freelist; - atomic_set(&ret->pin, 1); /* XXX */ + spin_lock(&c->freelist_lock); + ob->on_partial_list = true; + ca->open_buckets_partial[ca->open_buckets_partial_nr++] = + ob - c->open_buckets; + spin_unlock(&c->freelist_lock); - BUG_ON(ret->new_ob); - BUG_ON(ret->nr_ptrs); - - c->open_buckets_nr_free--; - trace_open_bucket_alloc(c, cl); - } else { - trace_open_bucket_alloc_fail(c, cl); - - if (cl) { - closure_wait(&c->open_buckets_wait, cl); - ret = ERR_PTR(-EAGAIN); - } else - ret = ERR_PTR(-ENOSPC); - } - - spin_unlock(&c->open_buckets_lock); - - return ret; -} - -static unsigned open_bucket_sectors_free(struct bch_fs *c, - struct open_bucket *ob, - unsigned nr_replicas) -{ - unsigned sectors_free = UINT_MAX; - struct open_bucket_ptr *ptr; - - open_bucket_for_each_ptr(ob, ptr) - sectors_free = min(sectors_free, ptr->sectors_free); - - return sectors_free != UINT_MAX ? 
sectors_free : 0; -} - -static void open_bucket_move_ptrs(struct bch_fs *c, - struct open_bucket *dst, - struct open_bucket *src, - struct bch_devs_mask *devs, - unsigned nr_ptrs_dislike) -{ - bool moved_ptr = false; - int i; - - down_read(&c->alloc_gc_lock); - - if (dst < src) { - spin_lock(&dst->lock); - spin_lock_nested(&src->lock, 1); - } else { - spin_lock(&src->lock); - spin_lock_nested(&dst->lock, 1); - } + closure_wake_up(&c->open_buckets_wait); + closure_wake_up(&c->freelist_wait); - for (i = src->nr_ptrs - 1; i >= 0; --i) { - if (!src->ptrs[i].sectors_free) { - /* - * Don't do anything: leave the ptr on the old - * open_bucket for gc to find - */ - } else if (nr_ptrs_dislike && - !test_bit(src->ptrs[i].ptr.dev, devs->d)) { - /* - * We don't want this pointer; bch2_open_bucket_put() - * will stick it on ca->open_buckets_partial to be - * reused - */ + array_remove_item(wp->ptrs, wp->nr_ptrs, i); --nr_ptrs_dislike; - } else { - BUG_ON(dst->nr_ptrs >= ARRAY_SIZE(dst->ptrs)); - - dst->ptrs[dst->nr_ptrs++] = src->ptrs[i]; - - src->nr_ptrs--; - memmove(&src->ptrs[i], - &src->ptrs[i + 1], - (src->nr_ptrs - i) * sizeof(src->ptrs[0])); - - moved_ptr = true; } } - - if (moved_ptr) { - BUG_ON(src->new_ob); - - atomic_inc(&dst->pin); - src->new_ob = dst - c->open_buckets; - } - - spin_unlock(&dst->lock); - spin_unlock(&src->lock); - up_read(&c->alloc_gc_lock); } -static void verify_not_stale(struct bch_fs *c, const struct open_bucket *ob) +static void verify_not_stale(struct bch_fs *c, const struct write_point *wp) { #ifdef CONFIG_BCACHEFS_DEBUG - const struct open_bucket_ptr *ptr; + struct open_bucket *ob; + unsigned i; - open_bucket_for_each_ptr(ob, ptr) { - struct bch_dev *ca = c->devs[ptr->ptr.dev]; + writepoint_for_each_ptr(wp, ob, i) { + struct bch_dev *ca = c->devs[ob->ptr.dev]; - BUG_ON(ptr_stale(ca, &ptr->ptr)); + BUG_ON(ptr_stale(ca, &ob->ptr)); } #endif } -/* Sector allocator */ - static int open_bucket_add_buckets(struct bch_fs *c, - struct write_point *wp, struct bch_devs_mask *_devs, - struct open_bucket *ob, + struct write_point *wp, + struct bch_devs_list *devs_have, unsigned nr_replicas, enum alloc_reserve reserve, struct closure *cl) { struct bch_devs_mask devs = c->rw_devs[wp->type]; - struct open_bucket_ptr *ptr; + struct open_bucket *ob; + unsigned i; - if (ob->nr_ptrs >= nr_replicas) + if (wp->nr_ptrs >= nr_replicas) return 0; + /* Don't allocate from devices we already have pointers to: */ + for (i = 0; i < devs_have->nr; i++) + __clear_bit(devs_have->devs[i], devs.d); + + writepoint_for_each_ptr(wp, ob, i) + __clear_bit(ob->ptr.dev, devs.d); + if (_devs) bitmap_and(devs.d, devs.d, _devs->d, BCH_SB_MEMBERS_MAX); - /* Don't allocate from devices we already have pointers to: */ - open_bucket_for_each_ptr(ob, ptr) - if (ptr->sectors_free) - __clear_bit(ptr->ptr.dev, devs.d); - - return bch2_bucket_alloc_set(c, wp, ob, nr_replicas, - reserve, &devs, cl); + return bch2_bucket_alloc_set(c, wp, nr_replicas, reserve, &devs, cl); } static struct write_point *__writepoint_find(struct hlist_head *head, @@ -1455,15 +1371,9 @@ static struct write_point *__writepoint_find(struct hlist_head *head, { struct write_point *wp; - hlist_for_each_entry_rcu(wp, head, node) { - if (wp->write_point == write_point) - continue; - - mutex_lock(&wp->lock); + hlist_for_each_entry_rcu(wp, head, node) if (wp->write_point == write_point) return wp; - mutex_unlock(&wp->lock); - } return NULL; } @@ -1478,47 +1388,49 @@ static struct hlist_head *writepoint_hash(struct bch_fs *c, } static struct 
write_point *writepoint_find(struct bch_fs *c, - enum bch_data_type data_type, unsigned long write_point) { - struct write_point *wp, *oldest = NULL; + struct write_point *wp, *oldest; struct hlist_head *head; - switch (data_type) { - case BCH_DATA_BTREE: - wp = &c->btree_write_point; + if (!(write_point & 1UL)) { + wp = (struct write_point *) write_point; mutex_lock(&wp->lock); return wp; - case BCH_DATA_USER: - break; - default: - BUG(); } head = writepoint_hash(c, write_point); +restart_find: wp = __writepoint_find(head, write_point); - if (wp) - goto out; - - mutex_lock(&c->write_points_hash_lock); - wp = __writepoint_find(head, write_point); - if (wp) - goto out_unlock; + if (wp) { +lock_wp: + mutex_lock(&wp->lock); + if (wp->write_point == write_point) + goto out; + mutex_unlock(&wp->lock); + goto restart_find; + } + oldest = NULL; for (wp = c->write_points; wp < c->write_points + ARRAY_SIZE(c->write_points); wp++) if (!oldest || time_before64(wp->last_used, oldest->last_used)) oldest = wp; - wp = oldest; - BUG_ON(!wp); + mutex_lock(&oldest->lock); + mutex_lock(&c->write_points_hash_lock); + wp = __writepoint_find(head, write_point); + if (wp && wp != oldest) { + mutex_unlock(&c->write_points_hash_lock); + mutex_unlock(&oldest->lock); + goto lock_wp; + } - mutex_lock(&wp->lock); + wp = oldest; hlist_del_rcu(&wp->node); wp->write_point = write_point; hlist_add_head_rcu(&wp->node, head); -out_unlock: mutex_unlock(&c->write_points_hash_lock); out: wp->last_used = sched_clock(); @@ -1529,97 +1441,81 @@ out: * Get us an open_bucket we can allocate from, return with it locked: */ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, - enum bch_data_type data_type, - struct bch_devs_mask *devs, - unsigned long write_point, - unsigned nr_replicas, - unsigned nr_replicas_required, - enum alloc_reserve reserve, - unsigned flags, - struct closure *cl) + struct bch_devs_mask *devs, + struct write_point_specifier write_point, + struct bch_devs_list *devs_have, + unsigned nr_replicas, + unsigned nr_replicas_required, + enum alloc_reserve reserve, + unsigned flags, + struct closure *cl) { - struct open_bucket *ob; struct write_point *wp; - struct open_bucket_ptr *ptr; - unsigned open_buckets_reserved = data_type == BCH_DATA_BTREE - ? 0 : BTREE_NODE_RESERVE; - unsigned nr_ptrs_empty = 0, nr_ptrs_dislike = 0; + struct open_bucket *ob; + unsigned i, nr_ptrs_dislike = 0, nr_ptrs_have = 0; int ret; - BUG_ON(!nr_replicas); + BUG_ON(!nr_replicas || !nr_replicas_required); - wp = writepoint_find(c, data_type, write_point); - BUG_ON(wp->type != data_type); - - wp->last_used = sched_clock(); - - ob = wp->ob; + wp = writepoint_find(c, write_point.v); /* does ob have ptrs we don't need? */ - open_bucket_for_each_ptr(ob, ptr) { - if (!ptr->sectors_free) - nr_ptrs_empty++; - else if (devs && !test_bit(ptr->ptr.dev, devs->d)) + writepoint_for_each_ptr(wp, ob, i) + if (bch2_dev_list_has_dev(*devs_have, ob->ptr.dev)) + nr_ptrs_have++; + else if (devs && !test_bit(ob->ptr.dev, devs->d)) nr_ptrs_dislike++; - } - ret = open_bucket_add_buckets(c, wp, devs, ob, - nr_replicas + nr_ptrs_empty + nr_ptrs_dislike, + ret = open_bucket_add_buckets(c, devs, wp, devs_have, + nr_replicas + nr_ptrs_have + nr_ptrs_dislike, reserve, cl); if (ret && ret != -EROFS) goto err; - if (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) - goto alloc_done; - - /* - * XXX: - * Should this allocation be _forced_ to used the specified device (e.g. - * internal migration), or should we fall back to allocating from all - * devices? 
- */ - ret = open_bucket_add_buckets(c, wp, NULL, ob, - nr_replicas + nr_ptrs_empty, - reserve, cl); - if (ret && ret != -EROFS) - goto err; -alloc_done: - if (ob->nr_ptrs - nr_ptrs_empty - - ((flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) ? nr_ptrs_dislike : 0) - < nr_replicas_required) { + if (wp->nr_ptrs < + nr_ptrs_have + nr_ptrs_dislike + nr_replicas_required) { ret = -EROFS; goto err; } + if ((int) wp->nr_ptrs - nr_ptrs_dislike < nr_replicas) + nr_ptrs_dislike = clamp_t(int, wp->nr_ptrs - nr_replicas, + 0, nr_ptrs_dislike); + + /* Remove pointers we don't want to use: */ + writepoint_drop_ptrs(c, wp, devs, nr_ptrs_dislike); + /* - * If ob->sectors_free == 0, one or more of the buckets ob points to is - * full. We can't drop pointers from an open bucket - garbage collection - * still needs to find them; instead, we must allocate a new open bucket - * and copy any pointers to non-full buckets into the new open bucket. + * Move pointers to devices we already have to end of open bucket + * pointer list - note that removing pointers we don't want to use might + * have changed nr_ptrs_have: */ - BUG_ON(ob->nr_ptrs - nr_ptrs_empty - nr_replicas > nr_ptrs_dislike); - nr_ptrs_dislike = ob->nr_ptrs - nr_ptrs_empty - nr_replicas; - - if (nr_ptrs_empty || nr_ptrs_dislike) { - ob = bch2_open_bucket_get(c, open_buckets_reserved, cl); - if (IS_ERR(ob)) { - ret = PTR_ERR(ob); - goto err; - } + if (nr_ptrs_have) { + i = nr_ptrs_have = 0; + while (i < wp->nr_ptrs - nr_ptrs_have) + if (bch2_dev_list_has_dev(*devs_have, wp->ptrs[i]->ptr.dev)) { + nr_ptrs_have++; + swap(wp->ptrs[i], wp->ptrs[wp->nr_ptrs - nr_ptrs_have]); + } else { + i++; + } + } - /* Remove pointers we don't want to use: */ + wp->nr_ptrs_can_use = + min_t(unsigned, nr_replicas, wp->nr_ptrs - nr_ptrs_have); - open_bucket_move_ptrs(c, ob, wp->ob, devs, nr_ptrs_dislike); - bch2_open_bucket_put(c, wp->ob); - wp->ob = ob; - } + BUG_ON(wp->nr_ptrs_can_use < nr_replicas_required || + wp->nr_ptrs_can_use > wp->nr_ptrs); + + wp->sectors_free = UINT_MAX; - BUG_ON(ob->nr_ptrs < nr_replicas_required); + for (i = 0; i < wp->nr_ptrs_can_use; i++) + wp->sectors_free = min(wp->sectors_free, + wp->ptrs[i]->sectors_free); - wp->sectors_free = open_bucket_sectors_free(c, ob, nr_replicas); + BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); - BUG_ON(!wp->sectors_free); - verify_not_stale(c, ob); + verify_not_stale(c, wp); return wp; err: @@ -1631,31 +1527,27 @@ err: * Append pointers to the space we just allocated to @k, and mark @sectors space * as allocated out of @ob */ -void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e, - unsigned nr_replicas, struct open_bucket *ob, - unsigned sectors) +void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, + struct bkey_i_extent *e, unsigned sectors) { - struct bch_extent_ptr tmp; - struct open_bucket_ptr *ptr; + unsigned i; - /* - * We're keeping any existing pointer k has, and appending new pointers: - * __bch2_write() will only write to the pointers we add here: - */ + BUG_ON(sectors > wp->sectors_free); + wp->sectors_free -= sectors; - for (ptr = ob->ptrs; - ptr < ob->ptrs + min_t(u8, ob->nr_ptrs, nr_replicas); ptr++) { - struct bch_dev *ca = c->devs[ptr->ptr.dev]; + for (i = 0; i < wp->nr_ptrs_can_use; i++) { + struct open_bucket *ob = wp->ptrs[i]; + struct bch_dev *ca = c->devs[ob->ptr.dev]; + struct bch_extent_ptr tmp = ob->ptr; - EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ptr->ptr.dev)); + EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), 
ob->ptr.dev)); - tmp = ptr->ptr; tmp.cached = bkey_extent_is_cached(&e->k); - tmp.offset += ca->mi.bucket_size - ptr->sectors_free; + tmp.offset += ca->mi.bucket_size - ob->sectors_free; extent_ptr_append(e, tmp); - BUG_ON(sectors > ptr->sectors_free); - ptr->sectors_free -= sectors; + BUG_ON(sectors > ob->sectors_free); + ob->sectors_free -= sectors; } } @@ -1665,76 +1557,20 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct bkey_i_extent *e, */ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) { - struct open_bucket *ob = wp->ob, *new_ob = NULL; - struct open_bucket_ptr *ptr; - bool empty = false; - - open_bucket_for_each_ptr(ob, ptr) - empty |= !ptr->sectors_free; + int i; - if (empty) - new_ob = bch2_open_bucket_get(c, 0, NULL); + for (i = wp->nr_ptrs - 1; i >= 0; --i) { + struct open_bucket *ob = wp->ptrs[i]; - if (!IS_ERR_OR_NULL(new_ob)) { - /* writepoint's ref becomes our ref: */ - wp->ob = new_ob; - open_bucket_move_ptrs(c, new_ob, ob, 0, 0); - } else { - atomic_inc(&ob->pin); + if (!ob->sectors_free) { + array_remove_item(wp->ptrs, wp->nr_ptrs, i); + bch2_open_bucket_put(c, ob); + } } mutex_unlock(&wp->lock); } -/* - * Allocates some space in the cache to write to, and k to point to the newly - * allocated space, and updates k->size and k->offset (to point to the - * end of the newly allocated space). - * - * May allocate fewer sectors than @sectors, k->size indicates how many - * sectors were actually allocated. - * - * Return codes: - * - -EAGAIN: closure was added to waitlist - * - -ENOSPC: out of space and no closure provided - * - * @c - filesystem. - * @wp - write point to use for allocating sectors. - * @k - key to return the allocated space information. - * @cl - closure to wait for a bucket - */ -struct open_bucket *bch2_alloc_sectors(struct bch_fs *c, - enum bch_data_type data_type, - struct bch_devs_mask *devs, - unsigned long write_point, - struct bkey_i_extent *e, - unsigned nr_replicas, - unsigned nr_replicas_required, - enum alloc_reserve reserve, - unsigned flags, - struct closure *cl) -{ - struct write_point *wp; - struct open_bucket *ob; - - wp = bch2_alloc_sectors_start(c, data_type, devs, write_point, - nr_replicas, nr_replicas_required, - reserve, flags, cl); - if (IS_ERR_OR_NULL(wp)) - return ERR_CAST(wp); - - ob = wp->ob; - - if (e->k.size > wp->sectors_free) - bch2_key_resize(&e->k, wp->sectors_free); - - bch2_alloc_sectors_append_ptrs(c, e, nr_replicas, ob, e->k.size); - - bch2_alloc_sectors_done(c, wp); - - return ob; -} - /* Startup/shutdown (ro/rw): */ void bch2_recalc_capacity(struct bch_fs *c) @@ -1839,46 +1675,15 @@ set_capacity: closure_wake_up(&c->freelist_wait); } -static bool open_bucket_has_device(struct open_bucket *ob, - struct bch_dev *ca) -{ - struct open_bucket_ptr *ptr; - bool ret = false; - - spin_lock(&ob->lock); - open_bucket_for_each_ptr(ob, ptr) - ret |= ptr->ptr.dev == ca->dev_idx; - spin_unlock(&ob->lock); - - return ret; -} - static void bch2_stop_write_point(struct bch_fs *c, struct bch_dev *ca, struct write_point *wp) { - struct open_bucket *ob; - struct closure cl; + struct bch_devs_mask not_self; - closure_init_stack(&cl); -retry: - mutex_lock(&wp->lock); - if (!open_bucket_has_device(wp->ob, ca)) { - mutex_unlock(&wp->lock); - return; - } - - ob = bch2_open_bucket_get(c, 0, &cl); - if (IS_ERR(ob)) { - mutex_unlock(&wp->lock); - closure_sync(&cl); - goto retry; - - } - - open_bucket_move_ptrs(c, ob, wp->ob, &ca->self, ob->nr_ptrs); - bch2_open_bucket_put(c, wp->ob); - wp->ob = ob; + 
bitmap_complement(not_self.d, ca->self.d, BCH_SB_MEMBERS_MAX); + mutex_lock(&wp->lock); + writepoint_drop_ptrs(c, wp, ¬_self, wp->nr_ptrs); mutex_unlock(&wp->lock); } @@ -1889,9 +1694,13 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) for (ob = c->open_buckets; ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); - ob++) - if (atomic_read(&ob->pin)) - ret |= open_bucket_has_device(ob, ca); + ob++) { + spin_lock(&ob->lock); + if (ob->valid && !ob->on_partial_list && + ob->ptr.dev == ca->dev_idx) + ret = true; + spin_unlock(&ob->lock); + } return ret; } @@ -1899,13 +1708,10 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) /* device goes ro: */ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) { - struct closure cl; unsigned i; BUG_ON(ca->alloc_thread); - closure_init_stack(&cl); - /* First, remove device from allocation groups: */ clear_bit(ca->dev_idx, c->tiers[ca->mi.tier].devs.d); @@ -1920,6 +1726,9 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) /* Next, close write points that point to this device... */ for (i = 0; i < ARRAY_SIZE(c->write_points); i++) bch2_stop_write_point(c, ca, &c->write_points[i]); + + bch2_stop_write_point(c, ca, &ca->copygc_write_point); + bch2_stop_write_point(c, ca, &c->tiers[ca->mi.tier].wp); bch2_stop_write_point(c, ca, &c->btree_write_point); mutex_lock(&c->btree_reserve_cache_lock); @@ -1927,7 +1736,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) struct btree_alloc *a = &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - bch2_open_bucket_put(c, a->ob); + bch2_open_bucket_put_refs(c, &a->ob.nr, a->ob.refs); } mutex_unlock(&c->btree_reserve_cache_lock); @@ -1945,16 +1754,8 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) /* Now wait for any in flight writes: */ - while (1) { - closure_wait(&c->open_buckets_wait, &cl); - - if (!bch2_dev_has_open_write_point(c, ca)) { - closure_wake_up(&c->open_buckets_wait); - break; - } - - closure_sync(&cl); - } + closure_wait_event(&c->open_buckets_wait, + !bch2_dev_has_open_write_point(c, ca)); } /* device goes rw: */ @@ -2015,10 +1816,10 @@ void bch2_fs_allocator_init(struct bch_fs *c) { struct open_bucket *ob; struct write_point *wp; + unsigned i; mutex_init(&c->write_points_hash_lock); - init_rwsem(&c->alloc_gc_lock); - spin_lock_init(&c->open_buckets_lock); + spin_lock_init(&c->freelist_lock); bch2_prio_timer_init(c, READ); bch2_prio_timer_init(c, WRITE); @@ -2034,40 +1835,20 @@ void bch2_fs_allocator_init(struct bch_fs *c) c->open_buckets_freelist = ob - c->open_buckets; } - mutex_init(&c->btree_write_point.lock); - c->btree_write_point.type = BCH_DATA_BTREE; - c->btree_write_point.ob = bch2_open_bucket_get(c, 0, NULL); - BUG_ON(IS_ERR(c->btree_write_point.ob)); + writepoint_init(&c->btree_write_point, BCH_DATA_BTREE); + + for (i = 0; i < ARRAY_SIZE(c->tiers); i++) + writepoint_init(&c->tiers[i].wp, BCH_DATA_USER); for (wp = c->write_points; wp < c->write_points + ARRAY_SIZE(c->write_points); wp++) { - mutex_init(&wp->lock); - wp->type = BCH_DATA_USER; - wp->ob = bch2_open_bucket_get(c, 0, NULL); - wp->last_used = sched_clock(); + writepoint_init(wp, BCH_DATA_USER); + wp->last_used = sched_clock(); wp->write_point = (unsigned long) wp; hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); - - BUG_ON(IS_ERR(wp->ob)); } c->pd_controllers_update_seconds = 5; INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); - - 
spin_lock_init(&c->foreground_write_pd_lock); - bch2_pd_controller_init(&c->foreground_write_pd); - /* - * We do not want the write rate to have an effect on the computed - * rate, for two reasons: - * - * We do not call bch2_ratelimit_delay() at all if the write rate - * exceeds 1GB/s. In this case, the PD controller will think we are - * not "keeping up" and not change the rate. - */ - c->foreground_write_pd.backpressure = 0; - init_timer(&c->foreground_write_wakeup); - - c->foreground_write_wakeup.data = (unsigned long) c; - c->foreground_write_wakeup.function = bch2_wake_delayed_writes; } diff --git a/libbcachefs/alloc.h b/libbcachefs/alloc.h index 1ea747d..8dffb86 100644 --- a/libbcachefs/alloc.h +++ b/libbcachefs/alloc.h @@ -8,7 +8,7 @@ struct bkey; struct bucket; struct bch_dev; struct bch_fs; -struct dev_group; +struct bch_devs_List; struct dev_alloc_list { unsigned nr; @@ -24,33 +24,61 @@ void bch2_wp_rescale(struct bch_fs *, struct bch_dev *, int bch2_alloc_read(struct bch_fs *, struct list_head *); int bch2_alloc_replay_key(struct bch_fs *, struct bpos); -long bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve); +enum bucket_alloc_ret { + ALLOC_SUCCESS = 0, + OPEN_BUCKETS_EMPTY = -1, + FREELIST_EMPTY = -2, /* Allocator thread not keeping up */ + NO_DEVICES = -3, /* -EROFS */ +}; + +int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool, + struct closure *); + +void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); + +static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) +{ + if (atomic_dec_and_test(&ob->pin)) + __bch2_open_bucket_put(c, ob); +} + +static inline void bch2_open_bucket_put_refs(struct bch_fs *c, u8 *nr, u8 *refs) +{ + unsigned i; + + for (i = 0; i < *nr; i++) + bch2_open_bucket_put(c, c->open_buckets + refs[i]); + + *nr = 0; +} + +static inline void bch2_open_bucket_get(struct bch_fs *c, + struct write_point *wp, + u8 *nr, u8 *refs) +{ + unsigned i; -void bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); + for (i = 0; i < wp->nr_ptrs_can_use; i++) { + struct open_bucket *ob = wp->ptrs[i]; + + atomic_inc(&ob->pin); + refs[(*nr)++] = ob - c->open_buckets; + } +} struct write_point *bch2_alloc_sectors_start(struct bch_fs *, - enum bch_data_type, struct bch_devs_mask *, - unsigned long, + struct write_point_specifier, + struct bch_devs_list *, unsigned, unsigned, enum alloc_reserve, unsigned, struct closure *); -void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct bkey_i_extent *, - unsigned, struct open_bucket *, unsigned); +void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, + struct bkey_i_extent *, unsigned); void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); -struct open_bucket *bch2_alloc_sectors(struct bch_fs *, - enum bch_data_type, - struct bch_devs_mask *, - unsigned long, - struct bkey_i_extent *, - unsigned, unsigned, - enum alloc_reserve, - unsigned, - struct closure *); - static inline void bch2_wake_allocator(struct bch_dev *ca) { struct task_struct *p; @@ -61,10 +89,20 @@ static inline void bch2_wake_allocator(struct bch_dev *ca) rcu_read_unlock(); } -#define open_bucket_for_each_ptr(_ob, _ptr) \ - for ((_ptr) = (_ob)->ptrs; \ - (_ptr) < (_ob)->ptrs + (_ob)->nr_ptrs; \ - (_ptr)++) +#define writepoint_for_each_ptr(_wp, _ob, _i) \ + for ((_i) = 0; \ + (_i) < (_wp)->nr_ptrs && ((_ob) = (_wp)->ptrs[_i], true); \ + (_i)++) + +static inline struct write_point_specifier writepoint_hashed(unsigned long v) +{ + 
return (struct write_point_specifier) { .v = v | 1 }; +} + +static inline struct write_point_specifier writepoint_ptr(struct write_point *wp) +{ + return (struct write_point_specifier) { .v = (unsigned long) wp }; +} void bch2_recalc_capacity(struct bch_fs *); @@ -74,6 +112,13 @@ void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_stop(struct bch_dev *); int bch2_dev_allocator_start(struct bch_dev *); +static inline void writepoint_init(struct write_point *wp, + enum bch_data_type type) +{ + mutex_init(&wp->lock); + wp->type = type; +} + void bch2_fs_allocator_init(struct bch_fs *); extern const struct bkey_ops bch2_bkey_alloc_ops; diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h index c48d0aa..90123ff 100644 --- a/libbcachefs/alloc_types.h +++ b/libbcachefs/alloc_types.h @@ -47,19 +47,14 @@ enum alloc_reserve { #define OPEN_BUCKETS_COUNT 256 #define WRITE_POINT_COUNT 32 -struct open_bucket_ptr { - struct bch_extent_ptr ptr; - unsigned sectors_free; -}; - struct open_bucket { spinlock_t lock; atomic_t pin; u8 freelist; - u8 new_ob; - u8 nr_ptrs; - - struct open_bucket_ptr ptrs[BCH_REPLICAS_MAX * 2]; + bool valid; + bool on_partial_list; + unsigned sectors_free; + struct bch_extent_ptr ptr; }; struct write_point { @@ -69,13 +64,23 @@ struct write_point { unsigned long write_point; enum bch_data_type type; + u8 nr_ptrs; + /* + * number of pointers in @ob we can't use, because we already had + * pointers to those devices: + */ + u8 nr_ptrs_can_use; /* calculated based on how many pointers we're actually going to use: */ unsigned sectors_free; - struct open_bucket *ob; + struct open_bucket *ptrs[BCH_REPLICAS_MAX * 2]; u64 next_alloc[BCH_SB_MEMBERS_MAX]; }; +struct write_point_specifier { + unsigned long v; +}; + struct alloc_heap_entry { size_t bucket; unsigned long key; diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index 58d4723..b679dd1 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -251,9 +251,6 @@ do { \ BCH_DEBUG_PARAM(debug_check_bkeys, \ "Run bkey_debugcheck (primarily checking GC/allocation "\ "information) when iterating over keys") \ - BCH_DEBUG_PARAM(version_stress_test, \ - "Assigns random version numbers to newly written " \ - "extents, to test overlapping extent cases") \ BCH_DEBUG_PARAM(verify_btree_ondisk, \ "Reread btree nodes at various points to verify the " \ "mergesort in the read path against modifications " \ @@ -310,8 +307,9 @@ struct crypto_blkcipher; struct crypto_ahash; enum gc_phase { - GC_PHASE_SB_METADATA = BTREE_ID_NR + 1, + GC_PHASE_SB = BTREE_ID_NR + 1, GC_PHASE_PENDING_DELETE, + GC_PHASE_ALLOC, GC_PHASE_DONE }; @@ -321,30 +319,6 @@ struct gc_pos { unsigned level; }; -struct bch_member_cpu { - u64 nbuckets; /* device size */ - u16 first_bucket; /* index of first bucket used */ - u16 bucket_size; /* sectors */ - u8 state; - u8 tier; - u8 replacement; - u8 discard; - u8 data_allowed; - u8 valid; -}; - -struct bch_replicas_cpu_entry { - u8 data_type; - u8 devs[BCH_SB_MEMBERS_MAX / 8]; -}; - -struct bch_replicas_cpu { - struct rcu_head rcu; - unsigned nr; - unsigned entry_size; - struct bch_replicas_cpu_entry entries[]; -}; - struct io_count { u64 sectors[2][BCH_DATA_NR]; }; @@ -372,7 +346,7 @@ struct bch_dev { struct bch_devs_mask self; - /* biosets used in cloned bios for replicas and moving_gc */ + /* biosets used in cloned bios for writing multiple replicas */ struct bio_set replica_set; struct task_struct *alloc_thread; @@ -392,7 +366,7 @@ struct bch_dev { unsigned 
nr_invalidated; bool alloc_thread_started; - struct open_bucket_ptr open_buckets_partial[BCH_REPLICAS_MAX * WRITE_POINT_COUNT]; + u8 open_buckets_partial[OPEN_BUCKETS_COUNT]; unsigned open_buckets_partial_nr; size_t fifo_last_bucket; @@ -422,18 +396,20 @@ struct bch_dev { bool allocator_invalidating_data; alloc_heap alloc_heap; - bucket_heap copygc_heap; - /* Moving GC: */ - struct task_struct *moving_gc_read; - - struct bch_pd_controller moving_gc_pd; + /* Copying GC: */ + struct task_struct *copygc_thread; + copygc_heap copygc_heap; + struct bch_pd_controller copygc_pd; + struct write_point copygc_write_point; struct journal_device journal; struct work_struct io_error_work; /* The rest of this all shows up in sysfs */ + atomic_t latency[2]; + struct io_count __percpu *io_done; }; @@ -473,6 +449,7 @@ struct bch_tier { struct bch_pd_controller pd; struct bch_devs_mask devs; + struct write_point wp; }; enum bch_fs_state { @@ -557,10 +534,7 @@ struct bch_fs { * when allocating btree reserves fail halfway through) - instead, we * can stick them here: */ - struct btree_alloc { - struct open_bucket *ob; - BKEY_PADDED(k); - } btree_reserve_cache[BTREE_NODE_RESERVE * 2]; + struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2]; unsigned btree_reserve_cache_nr; struct mutex btree_reserve_cache_lock; @@ -573,15 +547,9 @@ struct bch_fs { struct workqueue_struct *copygc_wq; /* ALLOCATION */ - struct rw_semaphore alloc_gc_lock; - struct bch_pd_controller foreground_write_pd; struct delayed_work pd_controllers_update; unsigned pd_controllers_update_seconds; - spinlock_t foreground_write_pd_lock; - struct bch_write_op *write_wait_head; - struct bch_write_op *write_wait_tail; - struct timer_list foreground_write_wakeup; /* * These contain all r/w devices - i.e. devices we can currently @@ -622,8 +590,8 @@ struct bch_fs { struct io_clock io_clock[2]; - /* SECTOR ALLOCATOR */ - spinlock_t open_buckets_lock; + /* ALLOCATOR */ + spinlock_t freelist_lock; u8 open_buckets_freelist; u8 open_buckets_nr_free; struct closure_waitlist open_buckets_wait; @@ -635,15 +603,6 @@ struct bch_fs { struct hlist_head write_points_hash[WRITE_POINT_COUNT]; struct mutex write_points_hash_lock; - /* - * This write point is used for migrating data off a device - * and can point to any other device. - * We can't use the normal write points because those will - * gang up n replicas, and for migration we want only one new - * replica. 
- */ - struct write_point migration_write_point; - /* GARBAGE COLLECTION */ struct task_struct *gc_thread; atomic_t kick_gc; @@ -688,6 +647,11 @@ struct bch_fs { atomic64_t key_version; + /* VFS IO PATH - fs-io.c */ + struct bio_set writepage_bioset; + struct bio_set dio_write_bioset; + struct bio_set dio_read_bioset; + struct bio_list btree_write_error_list; struct work_struct btree_write_error_work; spinlock_t btree_write_error_lock; @@ -728,19 +692,14 @@ struct bch_fs { /* The rest of this all shows up in sysfs */ atomic_long_t read_realloc_races; + atomic_long_t extent_migrate_done; + atomic_long_t extent_migrate_raced; unsigned btree_gc_periodic:1; - unsigned foreground_write_ratelimit_enabled:1; unsigned copy_gc_enabled:1; unsigned tiering_enabled:1; unsigned tiering_percent; - /* - * foreground writes will be throttled when the number of free - * buckets is below this percentage - */ - unsigned foreground_target_percent; - #define BCH_DEBUG_PARAM(name, description) bool name; BCH_DEBUG_PARAMS_ALL() #undef BCH_DEBUG_PARAM diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 16a1edd..2dc9a7e 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -344,11 +344,13 @@ struct bch_csum { enum bch_csum_type { BCH_CSUM_NONE = 0, - BCH_CSUM_CRC32C = 1, - BCH_CSUM_CRC64 = 2, + BCH_CSUM_CRC32C_NONZERO = 1, + BCH_CSUM_CRC64_NONZERO = 2, BCH_CSUM_CHACHA20_POLY1305_80 = 3, BCH_CSUM_CHACHA20_POLY1305_128 = 4, - BCH_CSUM_NR = 5, + BCH_CSUM_CRC32C = 5, + BCH_CSUM_CRC64 = 6, + BCH_CSUM_NR = 7, }; static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) @@ -550,7 +552,7 @@ BKEY_VAL_TYPE(reservation, BCH_RESERVATION); /* Maximum possible size of an entire extent value: */ /* There's a hack in the keylist code that needs to be fixed.. */ #define BKEY_EXTENT_VAL_U64s_MAX \ - (BKEY_EXTENT_PTR_U64s_MAX * BCH_REPLICAS_MAX) + (BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) /* * Maximum possible size of an entire extent, key + value: */ #define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) @@ -734,11 +736,13 @@ BKEY_VAL_TYPE(alloc, BCH_ALLOC); /* * Version 8: BCH_SB_ENCODED_EXTENT_MAX_BITS * BCH_MEMBER_DATA_ALLOWED + * Version 9: incompatible extent nonce change */ #define BCH_SB_VERSION_MIN 7 #define BCH_SB_VERSION_EXTENT_MAX 8 -#define BCH_SB_VERSION_MAX 8 +#define BCH_SB_VERSION_EXTENT_NONCE_V1 9 +#define BCH_SB_VERSION_MAX 9 #define BCH_SB_SECTOR 8 #define BCH_SB_LABEL_SIZE 32 diff --git a/libbcachefs/bkey.c b/libbcachefs/bkey.c index d33bc4e..73089a9 100644 --- a/libbcachefs/bkey.c +++ b/libbcachefs/bkey.c @@ -4,6 +4,14 @@ #include "bset.h" #include "util.h" +#undef EBUG_ON + +#ifdef DEBUG_BKEYS +#define EBUG_ON(cond) BUG_ON(cond) +#else +#define EBUG_ON(cond) +#endif + const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h index a1337bf..c195cd9 100644 --- a/libbcachefs/bset.h +++ b/libbcachefs/bset.h @@ -146,6 +146,17 @@ * first key in that range of bytes again. 
*/ +extern bool bch2_expensive_debug_checks; + +static inline bool btree_keys_expensive_checks(const struct btree *b) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + return bch2_expensive_debug_checks || *b->expensive_debug_checks; +#else + return false; +#endif +} + struct btree_node_iter; struct btree_node_iter_set; @@ -188,7 +199,7 @@ bkey_unpack_key_format_checked(const struct btree *b, compiled_unpack_fn unpack_fn = b->aux_data; unpack_fn(&dst, src); - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + if (btree_keys_expensive_checks(b)) { struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); /* @@ -260,17 +271,6 @@ static inline struct bkey_s __bkey_disassemble(struct btree *b, #define for_each_bset(_b, _t) \ for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) -extern bool bch2_expensive_debug_checks; - -static inline bool btree_keys_expensive_checks(struct btree *b) -{ -#ifdef CONFIG_BCACHEFS_DEBUG - return bch2_expensive_debug_checks || *b->expensive_debug_checks; -#else - return false; -#endif -} - static inline bool bset_has_ro_aux_tree(struct bset_tree *t) { return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index b090196..1198fe3 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -111,19 +112,35 @@ u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *c, struct bkey_s_c k) /* * For runtime mark and sweep: */ -static u8 bch2_btree_mark_key(struct bch_fs *c, enum bkey_type type, - struct bkey_s_c k, unsigned flags) +static u8 bch2_gc_mark_key(struct bch_fs *c, enum bkey_type type, + struct bkey_s_c k, unsigned flags) { + struct gc_pos pos = { 0 }; + struct bch_fs_usage *stats; + u8 ret = 0; + + preempt_disable(); + stats = this_cpu_ptr(c->usage_percpu); switch (type) { case BKEY_TYPE_BTREE: - bch2_gc_mark_key(c, k, c->opts.btree_node_size, true, flags); - return 0; + bch2_mark_key(c, k, c->opts.btree_node_size, true, pos, stats, + 0, flags| + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); + break; case BKEY_TYPE_EXTENTS: - bch2_gc_mark_key(c, k, k.k->size, false, flags); - return bch2_btree_key_recalc_oldest_gen(c, k); + bch2_mark_key(c, k, k.k->size, false, pos, stats, + 0, flags| + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); + ret = bch2_btree_key_recalc_oldest_gen(c, k); + break; default: BUG(); } + preempt_enable(); + + return ret; } int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, @@ -182,7 +199,7 @@ int bch2_btree_mark_key_initial(struct bch_fs *c, enum bkey_type type, max_t(u64, k.k->version.lo, atomic64_read(&c->key_version))); - bch2_btree_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC); + bch2_gc_mark_key(c, type, k, BCH_BUCKET_MARK_NOATOMIC); fsck_err: return ret; } @@ -200,7 +217,7 @@ static unsigned btree_gc_mark_node(struct bch_fs *c, struct btree *b) btree_node_is_extents(b), &unpacked) { bch2_bkey_debugcheck(c, b, k); - stale = max(stale, bch2_btree_mark_key(c, type, k, 0)); + stale = max(stale, bch2_gc_mark_key(c, type, k, 0)); } return stale; @@ -267,123 +284,79 @@ static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id) mutex_lock(&c->btree_root_lock); b = c->btree_roots[btree_id].b; - bch2_btree_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0); + bch2_gc_mark_key(c, BKEY_TYPE_BTREE, bkey_i_to_s_c(&b->key), 0); gc_pos_set(c, gc_pos_btree_root(b->btree_id)); mutex_unlock(&c->btree_root_lock); return 0; } -static void 
bch2_mark_allocator_buckets(struct bch_fs *c) -{ - struct bch_dev *ca; - struct open_bucket *ob; - const struct open_bucket_ptr *ptr; - size_t i, j, iter; - unsigned ci; - - down_write(&c->alloc_gc_lock); - - for_each_member_device(ca, c, ci) { - spin_lock(&ca->freelist_lock); - - fifo_for_each_entry(i, &ca->free_inc, iter) - bch2_mark_alloc_bucket(ca, &ca->buckets[i], true); - - for (j = 0; j < RESERVE_NR; j++) - fifo_for_each_entry(i, &ca->free[j], iter) - bch2_mark_alloc_bucket(ca, &ca->buckets[i], true); - - for (ptr = ca->open_buckets_partial; - ptr < ca->open_buckets_partial + ca->open_buckets_partial_nr; - ptr++) - bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), true); - - spin_unlock(&ca->freelist_lock); - } - - for (ob = c->open_buckets; - ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); - ob++) { - spin_lock(&ob->lock); - open_bucket_for_each_ptr(ob, ptr) { - ca = c->devs[ptr->ptr.dev]; - bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), true); - } - spin_unlock(&ob->lock); - } - - up_write(&c->alloc_gc_lock); -} - -static void mark_metadata_sectors(struct bch_dev *ca, u64 start, u64 end, - enum bucket_data_type type) +static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, + u64 start, u64 end, + enum bucket_data_type type, + unsigned flags) { u64 b = sector_to_bucket(ca, start); do { - bch2_mark_metadata_bucket(ca, ca->buckets + b, type, true); + bch2_mark_metadata_bucket(c, ca, ca->buckets + b, type, + gc_phase(GC_PHASE_SB), flags); b++; } while (b < sector_to_bucket(ca, end)); } -static void bch2_dev_mark_superblocks(struct bch_dev *ca) +void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, + unsigned flags) { struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; unsigned i; + u64 b; + + lockdep_assert_held(&c->sb_lock); for (i = 0; i < layout->nr_superblocks; i++) { if (layout->sb_offset[i] == BCH_SB_SECTOR) - mark_metadata_sectors(ca, 0, BCH_SB_SECTOR, - BUCKET_SB); + mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, + BUCKET_SB, flags); - mark_metadata_sectors(ca, + mark_metadata_sectors(c, ca, layout->sb_offset[i], layout->sb_offset[i] + (1 << layout->sb_max_size_bits), - BUCKET_SB); + BUCKET_SB, flags); } -} - -/* - * Mark non btree metadata - prios, journal - */ -void bch2_mark_dev_metadata(struct bch_fs *c, struct bch_dev *ca) -{ - unsigned i; - u64 b; - - lockdep_assert_held(&c->sb_lock); - - bch2_dev_mark_superblocks(ca); spin_lock(&c->journal.lock); for (i = 0; i < ca->journal.nr; i++) { b = ca->journal.buckets[i]; - bch2_mark_metadata_bucket(ca, ca->buckets + b, - BUCKET_JOURNAL, true); + bch2_mark_metadata_bucket(c, ca, ca->buckets + b, + BUCKET_JOURNAL, + gc_phase(GC_PHASE_SB), flags); } spin_unlock(&c->journal.lock); } -static void bch2_mark_metadata(struct bch_fs *c) +static void bch2_mark_superblocks(struct bch_fs *c) { struct bch_dev *ca; unsigned i; mutex_lock(&c->sb_lock); - gc_pos_set(c, gc_phase(GC_PHASE_SB_METADATA)); + gc_pos_set(c, gc_phase(GC_PHASE_SB)); for_each_online_member(ca, c, i) - bch2_mark_dev_metadata(c, ca); + bch2_mark_dev_superblock(c, ca, + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); mutex_unlock(&c->sb_lock); } /* Also see bch2_pending_btree_node_free_insert_done() */ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) { + struct gc_pos pos = { 0 }; struct bch_fs_usage stats = { 0 }; struct btree_update *as; struct pending_btree_node_free *d; @@ -393,10 +366,11 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) 
for_each_pending_btree_node_free(c, as, d) if (d->index_update_done) - __bch2_mark_key(c, bkey_i_to_s_c(&d->key), - c->opts.btree_node_size, true, - &stats, 0, - BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); + bch2_mark_key(c, bkey_i_to_s_c(&d->key), + c->opts.btree_node_size, true, pos, + &stats, 0, + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); /* * Don't apply stats - pending deletes aren't tracked in * bch_alloc_stats: @@ -405,6 +379,51 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) mutex_unlock(&c->btree_interior_update_lock); } +static void bch2_mark_allocator_buckets(struct bch_fs *c) +{ + struct bch_dev *ca; + struct open_bucket *ob; + size_t i, j, iter; + unsigned ci; + + spin_lock(&c->freelist_lock); + gc_pos_set(c, gc_pos_alloc(c, NULL)); + + for_each_member_device(ca, c, ci) { + fifo_for_each_entry(i, &ca->free_inc, iter) + bch2_mark_alloc_bucket(c, ca, &ca->buckets[i], true, + gc_pos_alloc(c, NULL), + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); + + + + for (j = 0; j < RESERVE_NR; j++) + fifo_for_each_entry(i, &ca->free[j], iter) + bch2_mark_alloc_bucket(c, ca, &ca->buckets[i], true, + gc_pos_alloc(c, NULL), + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); + } + + spin_unlock(&c->freelist_lock); + + for (ob = c->open_buckets; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); + ob++) { + spin_lock(&ob->lock); + if (ob->valid) { + gc_pos_set(c, gc_pos_alloc(c, ob)); + ca = c->devs[ob->ptr.dev]; + bch2_mark_alloc_bucket(c, ca, PTR_BUCKET(ca, &ob->ptr), true, + gc_pos_alloc(c, ob), + BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE| + BCH_BUCKET_MARK_GC_LOCK_HELD); + } + spin_unlock(&ob->lock); + } +} + void bch2_gc_start(struct bch_fs *c) { struct bch_dev *ca; @@ -495,9 +514,6 @@ void bch2_gc(struct bch_fs *c) bch2_gc_start(c); - /* Walk allocator's references: */ - bch2_mark_allocator_buckets(c); - /* Walk btree: */ while (c->gc_pos.phase < (int) BTREE_ID_NR) { int ret = c->btree_roots[c->gc_pos.phase].b @@ -513,8 +529,9 @@ void bch2_gc(struct bch_fs *c) gc_pos_set(c, gc_phase(c->gc_pos.phase + 1)); } - bch2_mark_metadata(c); + bch2_mark_superblocks(c); bch2_mark_pending_btree_node_frees(c); + bch2_mark_allocator_buckets(c); for_each_member_device(ca, c, i) atomic_long_set(&ca->saturated_count, 0); @@ -570,7 +587,7 @@ static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, struct bkey_format new_format; memset(new_nodes, 0, sizeof(new_nodes)); - bch2_keylist_init(&keylist, NULL, 0); + bch2_keylist_init(&keylist, NULL); /* Count keys that are not deleted */ for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++) @@ -1023,8 +1040,6 @@ again: if (ret) return ret; - bch2_mark_metadata(c); - if (test_bit(BCH_FS_FIXED_GENS, &c->flags)) { if (iter++ > 2) { bch_info(c, "Unable to fix bucket gens, looping"); @@ -1043,6 +1058,8 @@ again: if (c->sb.encryption_type) atomic64_add(1 << 16, &c->key_version); + bch2_mark_superblocks(c); + gc_pos_set(c, gc_phase(GC_PHASE_DONE)); set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); diff --git a/libbcachefs/btree_gc.h b/libbcachefs/btree_gc.h index 27dcc06..4d1ab9d 100644 --- a/libbcachefs/btree_gc.h +++ b/libbcachefs/btree_gc.h @@ -13,7 +13,7 @@ int bch2_initial_gc(struct bch_fs *, struct list_head *); u8 bch2_btree_key_recalc_oldest_gen(struct bch_fs *, struct bkey_s_c); int bch2_btree_mark_key_initial(struct bch_fs *, enum bkey_type, struct bkey_s_c); -void bch2_mark_dev_metadata(struct bch_fs *, struct bch_dev *); +void bch2_mark_dev_superblock(struct 
bch_fs *, struct bch_dev *, unsigned); /* * For concurrent mark and sweep (with other index updates), we define a total @@ -88,6 +88,14 @@ static inline struct gc_pos gc_pos_btree_root(enum btree_id id) }; } +static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob) +{ + return (struct gc_pos) { + .phase = GC_PHASE_ALLOC, + .pos = POS(ob ? ob - c->open_buckets : 0, 0), + }; +} + static inline bool gc_will_visit(struct bch_fs *c, struct gc_pos pos) { unsigned seq; diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c index d50e9e8..38c373c 100644 --- a/libbcachefs/btree_io.c +++ b/libbcachefs/btree_io.c @@ -146,9 +146,7 @@ static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) BUG_ON(iter->data->k > iter->data->end); if (iter->data->k == iter->data->end) - memmove(&iter->data[0], - &iter->data[1], - sizeof(iter->data[0]) * --iter->used); + array_remove_item(iter->data, iter->used, 0); else sort_iter_sift(iter, cmp); } @@ -1307,6 +1305,8 @@ static void btree_node_read_endio(struct bio *bio) struct btree_read_bio *rb = container_of(bio, struct btree_read_bio, bio); + bch2_latency_acct(rb->pick.ca, rb->start_time >> 10, READ); + INIT_WORK(&rb->work, btree_node_read_work); schedule_work(&rb->work); } @@ -1471,6 +1471,8 @@ static void btree_node_write_endio(struct bio *bio) struct bch_fs *c = wbio->c; struct bch_dev *ca = wbio->ca; + bch2_latency_acct(ca, wbio->submit_time_us, WRITE); + if (bch2_dev_io_err_on(bio->bi_error, ca, "btree write") || bch2_meta_write_fault("btree")) set_bit(wbio->ptr_idx, (unsigned long *) &orig->replicas_failed); diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h index f3290f9..61165a6 100644 --- a/libbcachefs/btree_io.h +++ b/libbcachefs/btree_io.h @@ -10,6 +10,7 @@ struct btree_iter; struct btree_read_bio { struct bch_fs *c; + unsigned submit_time_us; u64 start_time; struct extent_pick_ptr pick; struct work_struct work; diff --git a/libbcachefs/btree_locking.h b/libbcachefs/btree_locking.h index 0c174e4..c271189 100644 --- a/libbcachefs/btree_locking.h +++ b/libbcachefs/btree_locking.h @@ -91,7 +91,7 @@ static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) { int lock_type = btree_node_locked_type(iter, level); - EBUG_ON(iter->flags & BTREE_ITER_UPTODATE); + EBUG_ON(!level && iter->flags & BTREE_ITER_UPTODATE); if (lock_type != BTREE_NODE_UNLOCKED) six_unlock_type(&iter->nodes[level]->lock, lock_type); diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index 8b4df03..f1e06a3 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -55,6 +55,16 @@ struct btree_write { struct closure_waitlist wait; }; +struct btree_ob_ref { + u8 nr; + u8 refs[BCH_REPLICAS_MAX]; +}; + +struct btree_alloc { + struct btree_ob_ref ob; + BKEY_PADDED(k); +}; + struct btree { /* Hottest entries first */ struct rhash_head hash; @@ -118,7 +128,7 @@ struct btree { */ struct btree_update *will_make_reachable; - struct open_bucket *ob; + struct btree_ob_ref ob; /* lru list */ struct list_head list; @@ -317,18 +327,6 @@ struct btree_root { struct btree_iter; struct btree_node_iter; -enum extent_insert_hook_ret { - BTREE_HOOK_DO_INSERT, - BTREE_HOOK_NO_INSERT, - BTREE_HOOK_RESTART_TRANS, -}; - -struct extent_insert_hook { - enum extent_insert_hook_ret - (*fn)(struct extent_insert_hook *, struct bpos, struct bpos, - struct bkey_s_c, const struct bkey_i *); -}; - enum btree_insert_ret { BTREE_INSERT_OK, /* extent spanned multiple leaf nodes: have to traverse to next node: */ @@ 
-342,6 +340,12 @@ enum btree_insert_ret { BTREE_INSERT_NEED_GC_LOCK, }; +struct extent_insert_hook { + enum btree_insert_ret + (*fn)(struct extent_insert_hook *, struct bpos, struct bpos, + struct bkey_s_c, const struct bkey_i *); +}; + enum btree_gc_coalesce_fail_reason { BTREE_GC_COALESCE_FAIL_RESERVE_GET, BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index 2efb01c..1fe8fff 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -211,7 +211,7 @@ found: -c->opts.btree_node_size, true, b ? gc_pos_btree_node(b) : gc_pos_btree_root(as->btree_id), - &tmp, 0); + &tmp, 0, 0); /* * Don't apply tmp - pending deletes aren't tracked in * bch_alloc_stats: @@ -229,7 +229,7 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b, BUG_ON(btree_node_dirty(b)); BUG_ON(btree_node_need_write(b)); BUG_ON(b == btree_node_root(c, b)); - BUG_ON(b->ob); + BUG_ON(b->ob.nr); BUG_ON(!list_empty(&b->write_blocked)); BUG_ON(b->will_make_reachable); @@ -254,17 +254,17 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b, void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) { - struct open_bucket *ob = b->ob; + struct btree_ob_ref ob = b->ob; btree_update_drop_new_node(c, b); - b->ob = NULL; + b->ob.nr = 0; clear_btree_node_dirty(b); __btree_node_free(c, b, NULL); - bch2_open_bucket_put(c, ob); + bch2_open_bucket_put_refs(c, &ob.nr, ob.refs); } void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, @@ -287,7 +287,7 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c, bch2_mark_key(c, bkey_i_to_s_c(&pending->key), -c->opts.btree_node_size, true, gc_phase(GC_PHASE_PENDING_DELETE), - &stats, 0); + &stats, 0, 0); /* * Don't apply stats - pending deletes aren't tracked in * bch_alloc_stats: @@ -296,8 +296,7 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c, void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *b) { - bch2_open_bucket_put(c, b->ob); - b->ob = NULL; + bch2_open_bucket_put_refs(c, &b->ob.nr, b->ob.refs); } static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, @@ -305,9 +304,12 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, struct closure *cl, unsigned flags) { - BKEY_PADDED(k) tmp; - struct open_bucket *ob; + struct write_point *wp; struct btree *b; + BKEY_PADDED(k) tmp; + struct bkey_i_extent *e; + struct btree_ob_ref ob; + struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; unsigned nr_reserve; enum alloc_reserve alloc_reserve; @@ -335,31 +337,41 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, mutex_unlock(&c->btree_reserve_cache_lock); retry: - /* alloc_sectors is weird, I suppose */ - bkey_extent_init(&tmp.k); - tmp.k.k.size = c->opts.btree_node_size, - - ob = bch2_alloc_sectors(c, BCH_DATA_BTREE, 0, 0, - bkey_i_to_extent(&tmp.k), - res->nr_replicas, - c->opts.metadata_replicas_required, - alloc_reserve, 0, cl); - if (IS_ERR(ob)) - return ERR_CAST(ob); - - if (tmp.k.k.size < c->opts.btree_node_size) { - bch2_open_bucket_put(c, ob); + wp = bch2_alloc_sectors_start(c, NULL, + writepoint_ptr(&c->btree_write_point), + &devs_have, + res->nr_replicas, + c->opts.metadata_replicas_required, + alloc_reserve, 0, cl); + if (IS_ERR(wp)) + return ERR_CAST(wp); + + if (wp->sectors_free < c->opts.btree_node_size) { + struct open_bucket *ob; + unsigned i; + + writepoint_for_each_ptr(wp, ob, i) + if (ob->sectors_free < c->opts.btree_node_size) + ob->sectors_free 
= 0; + + bch2_alloc_sectors_done(c, wp); goto retry; } + + e = bkey_extent_init(&tmp.k); + bch2_alloc_sectors_append_ptrs(c, wp, e, c->opts.btree_node_size); + + ob.nr = 0; + bch2_open_bucket_get(c, wp, &ob.nr, ob.refs); + bch2_alloc_sectors_done(c, wp); mem_alloc: b = bch2_btree_node_mem_alloc(c); /* we hold cannibalize_lock: */ BUG_ON(IS_ERR(b)); - BUG_ON(b->ob); + BUG_ON(b->ob.nr); bkey_copy(&b->key, &tmp.k); - b->key.k.size = 0; b->ob = ob; return b; @@ -466,11 +478,10 @@ static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reser &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; a->ob = b->ob; - b->ob = NULL; + b->ob.nr = 0; bkey_copy(&a->k, &b->key); } else { - bch2_open_bucket_put(c, b->ob); - b->ob = NULL; + bch2_btree_open_bucket_put(c, b); } __btree_node_free(c, b, NULL); @@ -857,10 +868,7 @@ static void __btree_interior_update_drop_new_node(struct btree *b) BUG(); found: - as->nr_new_nodes--; - memmove(&as->new_nodes[i], - &as->new_nodes[i + 1], - sizeof(struct btree *) * (as->nr_new_nodes - i)); + array_remove_item(as->new_nodes, as->nr_new_nodes, i); b->will_make_reachable = NULL; } @@ -1000,8 +1008,7 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id, as->reserve = reserve; INIT_LIST_HEAD(&as->write_blocked_list); - bch2_keylist_init(&as->parent_keys, as->inline_keys, - ARRAY_SIZE(as->inline_keys)); + bch2_keylist_init(&as->parent_keys, as->inline_keys); mutex_lock(&c->btree_interior_update_lock); list_add(&as->list, &c->btree_interior_update_list); @@ -1037,7 +1044,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) bch2_mark_key(c, bkey_i_to_s_c(&b->key), c->opts.btree_node_size, true, gc_pos_btree_root(b->btree_id), - &stats, 0); + &stats, 0, 0); if (old) bch2_btree_node_free_index(as, NULL, @@ -1121,7 +1128,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b if (bkey_extent_is_data(&insert->k)) bch2_mark_key(c, bkey_i_to_s_c(insert), c->opts.btree_node_size, true, - gc_pos_btree_node(b), &stats, 0); + gc_pos_btree_node(b), &stats, 0, 0); while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && !btree_iter_pos_cmp_packed(b, &insert->k.p, k, false)) @@ -1479,6 +1486,13 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, struct closure cl; int ret = 0; + /* + * We already have a disk reservation and open buckets pinned; this + * allocation must not block: + */ + if (iter->btree_id == BTREE_ID_EXTENTS) + btree_reserve_flags |= BTREE_INSERT_USE_RESERVE; + closure_init_stack(&cl); /* Hack, because gc and splitting nodes doesn't mix yet: */ @@ -1519,6 +1533,7 @@ int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, bch2_btree_iter_set_locks_want(iter, 1); out: up_read(&c->gc_lock); + closure_sync(&cl); return ret; } @@ -1904,7 +1919,7 @@ retry: bch2_mark_key(c, bkey_i_to_s_c(&new_key->k_i), c->opts.btree_node_size, true, gc_pos_btree_root(b->btree_id), - &stats, 0); + &stats, 0, 0); bch2_btree_node_free_index(as, NULL, bkey_i_to_s_c(&b->key), &stats); @@ -1928,6 +1943,7 @@ out: } bch2_btree_iter_unlock(&iter); up_read(&c->gc_lock); + closure_sync(&cl); return ret; err: if (as) @@ -1965,13 +1981,13 @@ int bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id, BTREE_INSERT_USE_RESERVE| BTREE_INSERT_USE_ALLOC_RESERVE, &cl); + closure_sync(&cl); + if (!IS_ERR(as)) break; if (PTR_ERR(as) == -ENOSPC) return PTR_ERR(as); - - closure_sync(&cl); } b = __btree_root_alloc(as, 0); diff --git a/libbcachefs/btree_update_leaf.c 
b/libbcachefs/btree_update_leaf.c index 6c490dd..e62e0d2 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -355,6 +355,11 @@ retry: multi_lock_write(c, trans); + if (race_fault()) { + ret = -EINTR; + goto unlock; + } + u64s = 0; trans_for_each_entry(trans, i) { /* Multiple inserts might go to same leaf: */ diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 6fdbb46..b73002d 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -101,9 +101,41 @@ static void bch2_fs_stats_verify(struct bch_fs *c) stats.online_reserved); } +static void bch2_dev_stats_verify(struct bch_dev *ca) +{ + struct bch_dev_usage stats = + __bch2_dev_usage_read(ca); + u64 n = ca->mi.nbuckets - ca->mi.first_bucket; + + BUG_ON(stats.buckets[S_META] > n); + BUG_ON(stats.buckets[S_DIRTY] > n); + BUG_ON(stats.buckets_cached > n); + BUG_ON(stats.buckets_alloc > n); + BUG_ON(stats.buckets_unavailable > n); +} + +static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) +{ + if (!(flags & BCH_DISK_RESERVATION_NOFAIL)) { + u64 used = __bch2_fs_sectors_used(c); + u64 cached = 0; + u64 avail = atomic64_read(&c->sectors_available); + int cpu; + + for_each_possible_cpu(cpu) + cached += per_cpu_ptr(c->usage_percpu, cpu)->available_cache; + + if (used + avail + cached > c->capacity) + panic("used %llu avail %llu cached %llu capacity %llu\n", + used, avail, cached, c->capacity); + } +} + #else static void bch2_fs_stats_verify(struct bch_fs *c) {} +static void bch2_dev_stats_verify(struct bch_dev *ca) {} +static void bch2_disk_reservations_verify(struct bch_fs *c, int flags) {} #endif @@ -171,11 +203,9 @@ struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *ca) return bch2_usage_read_raw(ca->usage_percpu); } -struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) +struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca) { - return bch2_usage_read_cached(ca->fs, - ca->usage_cached, - ca->usage_percpu); + return bch2_usage_read_cached(c, ca->usage_cached, ca->usage_percpu); } struct bch_fs_usage @@ -208,6 +238,11 @@ static inline int is_cached_bucket(struct bucket_mark m) !m.dirty_sectors && !!m.cached_sectors; } +static inline int is_unavailable_bucket(struct bucket_mark m) +{ + return !is_available_bucket(m); +} + static inline enum s_alloc bucket_type(struct bucket_mark m) { return is_meta_bucket(m) ? 
S_META : S_DIRTY; @@ -256,12 +291,15 @@ void bch2_fs_usage_apply(struct bch_fs *c, memset(stats, 0, sizeof(*stats)); } -static void bch2_dev_usage_update(struct bch_dev *ca, - struct bucket_mark old, struct bucket_mark new) +static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + struct bucket *g, struct bucket_mark old, + struct bucket_mark new) { - struct bch_fs *c = ca->fs; struct bch_dev_usage *dev_usage; + BUG_ON((g - ca->buckets) < ca->mi.first_bucket || + (g - ca->buckets) >= ca->mi.nbuckets); + bch2_fs_inconsistent_on(old.data_type && new.data_type && old.data_type != new.data_type, c, "different types of metadata in same bucket: %u, %u", @@ -270,38 +308,44 @@ static void bch2_dev_usage_update(struct bch_dev *ca, preempt_disable(); dev_usage = this_cpu_ptr(ca->usage_percpu); - dev_usage->sectors_cached += - (int) new.cached_sectors - (int) old.cached_sectors; - - dev_usage->sectors[bucket_type(old)] -= old.dirty_sectors; - dev_usage->sectors[bucket_type(new)] += new.dirty_sectors; - + dev_usage->buckets[S_META] += + is_meta_bucket(new) - is_meta_bucket(old); + dev_usage->buckets[S_DIRTY] += + is_dirty_bucket(new) - is_dirty_bucket(old); + dev_usage->buckets_cached += + is_cached_bucket(new) - is_cached_bucket(old); dev_usage->buckets_alloc += (int) new.owned_by_allocator - (int) old.owned_by_allocator; + dev_usage->buckets_unavailable += + is_unavailable_bucket(new) - is_unavailable_bucket(old); - dev_usage->buckets[S_META] += is_meta_bucket(new) - is_meta_bucket(old); - dev_usage->buckets[S_DIRTY] += is_dirty_bucket(new) - is_dirty_bucket(old); - dev_usage->buckets_cached += is_cached_bucket(new) - is_cached_bucket(old); + dev_usage->sectors[bucket_type(old)] -= old.dirty_sectors; + dev_usage->sectors[bucket_type(new)] += new.dirty_sectors; + dev_usage->sectors_cached += + (int) new.cached_sectors - (int) old.cached_sectors; preempt_enable(); if (!is_available_bucket(old) && is_available_bucket(new)) bch2_wake_allocator(ca); + + bch2_dev_stats_verify(ca); } -#define bucket_data_cmpxchg(ca, g, new, expr) \ +#define bucket_data_cmpxchg(c, ca, g, new, expr) \ ({ \ struct bucket_mark _old = bucket_cmpxchg(g, new, expr); \ \ - bch2_dev_usage_update(ca, _old, new); \ + bch2_dev_usage_update(c, ca, g, _old, new); \ _old; \ }) -bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g, - struct bucket_mark *old) +bool bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, + struct bucket *g, struct bucket_mark *old) { struct bucket_mark new; - *old = bucket_data_cmpxchg(ca, g, new, ({ + lg_local_lock(&c->usage_lock); + *old = bucket_data_cmpxchg(c, ca, g, new, ({ if (!is_available_bucket(new)) return false; @@ -312,6 +356,7 @@ bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g, new.dirty_sectors = 0; new.gen++; })); + lg_local_unlock(&c->usage_lock); if (!old->owned_by_allocator && old->cached_sectors) trace_invalidate(ca, bucket_to_sector(ca, g - ca->buckets), @@ -319,11 +364,13 @@ bool bch2_invalidate_bucket(struct bch_dev *ca, struct bucket *g, return true; } -bool bch2_mark_alloc_bucket_startup(struct bch_dev *ca, struct bucket *g) +bool bch2_mark_alloc_bucket_startup(struct bch_fs *c, struct bch_dev *ca, + struct bucket *g) { struct bucket_mark new, old; - old = bucket_data_cmpxchg(ca, g, new, ({ + lg_local_lock(&c->usage_lock); + old = bucket_data_cmpxchg(c, ca, g, new, ({ if (new.touched_this_mount || !is_available_bucket(new)) return false; @@ -331,37 +378,32 @@ bool bch2_mark_alloc_bucket_startup(struct bch_dev *ca, struct bucket *g) 
new.owned_by_allocator = 1; new.touched_this_mount = 1; })); + lg_local_unlock(&c->usage_lock); return true; } -void bch2_mark_free_bucket(struct bch_dev *ca, struct bucket *g) +void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + struct bucket *g, bool owned_by_allocator, + struct gc_pos pos, unsigned flags) { struct bucket_mark old, new; - old = bucket_data_cmpxchg(ca, g, new, ({ - new.touched_this_mount = 1; - new.owned_by_allocator = 0; - new.data_type = 0; - new.cached_sectors = 0; - new.dirty_sectors = 0; - })); - - BUG_ON(bucket_became_unavailable(ca->fs, old, new)); -} - -void bch2_mark_alloc_bucket(struct bch_dev *ca, struct bucket *g, - bool owned_by_allocator) -{ - struct bucket_mark old, new; + lg_local_lock(&c->usage_lock); + if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && + gc_will_visit(c, pos)) { + lg_local_unlock(&c->usage_lock); + return; + } - old = bucket_data_cmpxchg(ca, g, new, ({ + old = bucket_data_cmpxchg(c, ca, g, new, ({ new.touched_this_mount = 1; new.owned_by_allocator = owned_by_allocator; })); + lg_local_unlock(&c->usage_lock); BUG_ON(!owned_by_allocator && !old.owned_by_allocator && - ca->fs->gc_pos.phase == GC_PHASE_DONE); + c->gc_pos.phase == GC_PHASE_DONE); } #define saturated_add(ca, dst, src, max) \ @@ -377,41 +419,49 @@ do { \ } \ } while (0) -void bch2_mark_metadata_bucket(struct bch_dev *ca, struct bucket *g, - enum bucket_data_type type, - bool may_make_unavailable) +void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + struct bucket *g, enum bucket_data_type type, + struct gc_pos pos, unsigned flags) { struct bucket_mark old, new; BUG_ON(!type); - old = bucket_data_cmpxchg(ca, g, new, ({ + lg_local_lock(&c->usage_lock); + if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && + gc_will_visit(c, pos)) { + lg_local_unlock(&c->usage_lock); + return; + } + + old = bucket_data_cmpxchg(c, ca, g, new, ({ saturated_add(ca, new.dirty_sectors, ca->mi.bucket_size, GC_MAX_SECTORS_USED); new.data_type = type; new.touched_this_mount = 1; })); + lg_local_unlock(&c->usage_lock); if (old.data_type != type && (old.data_type || old.cached_sectors || old.dirty_sectors)) - bch_err(ca->fs, "bucket %zu has multiple types of data (%u, %u)", + bch_err(c, "bucket %zu has multiple types of data (%u, %u)", g - ca->buckets, old.data_type, new.data_type); - BUG_ON(!may_make_unavailable && - bucket_became_unavailable(ca->fs, old, new)); + BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && + bucket_became_unavailable(c, old, new)); } /* Reverting this until the copygc + compression issue is fixed: */ -static int __disk_sectors(const union bch_extent_crc *crc, unsigned sectors) +static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors) { if (!sectors) return 0; - return max(1U, DIV_ROUND_UP(sectors * crc_compressed_size(NULL, crc), - crc_uncompressed_size(NULL, crc))); + return max(1U, DIV_ROUND_UP(sectors * crc.compressed_size, + crc.uncompressed_size)); } /* @@ -420,12 +470,12 @@ static int __disk_sectors(const union bch_extent_crc *crc, unsigned sectors) * that with the gc pos seqlock held. 
*/ static void bch2_mark_pointer(struct bch_fs *c, - struct bkey_s_c_extent e, - const union bch_extent_crc *crc, - const struct bch_extent_ptr *ptr, - s64 sectors, enum s_alloc type, - struct bch_fs_usage *stats, - u64 journal_seq, unsigned flags) + struct bkey_s_c_extent e, + const struct bch_extent_ptr *ptr, + struct bch_extent_crc_unpacked crc, + s64 sectors, enum s_alloc type, + struct bch_fs_usage *stats, + u64 journal_seq, unsigned flags) { struct bucket_mark old, new; unsigned saturated; @@ -435,7 +485,7 @@ static void bch2_mark_pointer(struct bch_fs *c, ? BUCKET_BTREE : BUCKET_DATA; u64 v; - if (crc_compression_type(crc)) { + if (crc.compression_type) { unsigned old_sectors, new_sectors; if (sectors > 0) { @@ -512,13 +562,13 @@ static void bch2_mark_pointer(struct bch_fs *c, old.counter, new.counter)) != old.counter); - bch2_dev_usage_update(ca, old, new); + bch2_dev_usage_update(c, ca, g, old, new); if (old.data_type != data_type && (old.data_type || old.cached_sectors || old.dirty_sectors)) - bch_err(ca->fs, "bucket %zu has multiple types of data (%u, %u)", + bch_err(c, "bucket %zu has multiple types of data (%u, %u)", g - ca->buckets, old.data_type, new.data_type); BUG_ON(!(flags & BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE) && @@ -535,71 +585,12 @@ static void bch2_mark_pointer(struct bch_fs *c, } } -static void bch2_mark_extent(struct bch_fs *c, struct bkey_s_c_extent e, - s64 sectors, bool metadata, - struct bch_fs_usage *stats, - u64 journal_seq, unsigned flags) -{ - const struct bch_extent_ptr *ptr; - const union bch_extent_crc *crc; - enum s_alloc type = metadata ? S_META : S_DIRTY; - unsigned replicas = 0; - - BUG_ON(metadata && bkey_extent_is_cached(e.k)); - BUG_ON(!sectors); - - extent_for_each_ptr_crc(e, ptr, crc) { - bch2_mark_pointer(c, e, crc, ptr, sectors, type, - stats, journal_seq, flags); - replicas += !ptr->cached; - } - - BUG_ON(replicas >= BCH_REPLICAS_MAX); - - if (replicas) - stats->s[replicas - 1].data[type] += sectors; -} - -void __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, - s64 sectors, bool metadata, - struct bch_fs_usage *stats, - u64 journal_seq, unsigned flags) -{ - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - bch2_mark_extent(c, bkey_s_c_to_extent(k), sectors, metadata, - stats, journal_seq, flags); - break; - case BCH_RESERVATION: { - struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); - - if (r.v->nr_replicas) - stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors; - break; - } - } -} - -void bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, - s64 sectors, bool metadata, unsigned flags) -{ - struct bch_fs_usage stats = { 0 }; - - __bch2_mark_key(c, k, sectors, metadata, &stats, 0, - flags|BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE); - - preempt_disable(); - bch2_usage_add(this_cpu_ptr(c->usage_percpu), &stats); - preempt_enable(); -} - void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, - s64 sectors, bool metadata, struct gc_pos gc_pos, - struct bch_fs_usage *stats, u64 journal_seq) + s64 sectors, bool metadata, + struct gc_pos pos, + struct bch_fs_usage *stats, + u64 journal_seq, unsigned flags) { - unsigned flags = gc_will_visit(c, gc_pos) - ? BCH_BUCKET_MARK_GC_WILL_VISIT : 0; /* * synchronization w.r.t. GC: * @@ -614,69 +605,104 @@ void bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, * To know whether we should mark a given reference (GC either isn't * running, or has already marked references at this position) we * construct a total order for everything GC walks. 
Then, we can simply - * compare the position of the reference we're marking - @gc_pos - with + * compare the position of the reference we're marking - @pos - with * GC's current position. If GC is going to mark this reference, GC's - * current position will be less than @gc_pos; if GC's current position - * is greater than @gc_pos GC has either already walked this position, - * or isn't running. + * current position will be less than @pos; if GC's current position is + * greater than @pos GC has either already walked this position, or + * isn't running. * * To avoid racing with GC's position changing, we have to deal with * - GC's position being set to GC_POS_MIN when GC starts: * usage_lock guards against this - * - GC's position overtaking @gc_pos: we guard against this with + * - GC's position overtaking @pos: we guard against this with * whatever lock protects the data structure the reference lives in * (e.g. the btree node lock, or the relevant allocator lock). */ + lg_local_lock(&c->usage_lock); - __bch2_mark_key(c, k, sectors, metadata, stats, journal_seq, flags); - bch2_fs_stats_verify(c); + if (!(flags & BCH_BUCKET_MARK_GC_LOCK_HELD) && + gc_will_visit(c, pos)) + flags |= BCH_BUCKET_MARK_GC_WILL_VISIT; + + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + enum s_alloc type = metadata ? S_META : S_DIRTY; + unsigned replicas = 0; + + BUG_ON(metadata && bkey_extent_is_cached(e.k)); + BUG_ON(!sectors); + + extent_for_each_ptr_crc(e, ptr, crc) { + bch2_mark_pointer(c, e, ptr, crc, sectors, type, + stats, journal_seq, flags); + replicas += !ptr->cached; + } + + BUG_ON(replicas >= BCH_REPLICAS_MAX); + + if (replicas) + stats->s[replicas - 1].data[type] += sectors; + break; + } + case BCH_RESERVATION: { + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + + if (r.v->nr_replicas) + stats->s[r.v->nr_replicas - 1].persistent_reserved += sectors; + break; + } + } lg_local_unlock(&c->usage_lock); } -static u64 __recalc_sectors_available(struct bch_fs *c) -{ - return c->capacity - bch2_fs_sectors_used(c); -} +/* Disk reservations: */ -/* Used by gc when it's starting: */ -void bch2_recalc_sectors_available(struct bch_fs *c) +static u64 __recalc_sectors_available(struct bch_fs *c) { + u64 avail; int cpu; - lg_global_lock(&c->usage_lock); - for_each_possible_cpu(cpu) per_cpu_ptr(c->usage_percpu, cpu)->available_cache = 0; - atomic64_set(&c->sectors_available, - __recalc_sectors_available(c)); + avail = c->capacity - bch2_fs_sectors_used(c); + avail <<= RESERVE_FACTOR; + avail /= (1 << RESERVE_FACTOR) + 1; + return avail; +} + +/* Used by gc when it's starting: */ +void bch2_recalc_sectors_available(struct bch_fs *c) +{ + lg_global_lock(&c->usage_lock); + atomic64_set(&c->sectors_available, __recalc_sectors_available(c)); lg_global_unlock(&c->usage_lock); } -void bch2_disk_reservation_put(struct bch_fs *c, - struct disk_reservation *res) +void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) { - if (res->sectors) { - lg_local_lock(&c->usage_lock); - this_cpu_sub(c->usage_percpu->online_reserved, - res->sectors); + lg_local_lock(&c->usage_lock); + this_cpu_sub(c->usage_percpu->online_reserved, + res->sectors); - bch2_fs_stats_verify(c); - lg_local_unlock(&c->usage_lock); + bch2_fs_stats_verify(c); + lg_local_unlock(&c->usage_lock); - res->sectors = 0; - } + res->sectors = 0; } #define SECTORS_CACHE 1024 -int 
bch2_disk_reservation_add(struct bch_fs *c, - struct disk_reservation *res, - unsigned sectors, int flags) +int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, + unsigned sectors, int flags) { struct bch_fs_usage *stats; - u64 old, new, v; + u64 old, v, get; s64 sectors_available; int ret; @@ -685,27 +711,29 @@ int bch2_disk_reservation_add(struct bch_fs *c, lg_local_lock(&c->usage_lock); stats = this_cpu_ptr(c->usage_percpu); - if (sectors >= stats->available_cache) + if (sectors <= stats->available_cache) goto out; v = atomic64_read(&c->sectors_available); do { old = v; - if (old < sectors) { + get = min((u64) sectors + SECTORS_CACHE, old); + + if (get < sectors) { lg_local_unlock(&c->usage_lock); goto recalculate; } - - new = max_t(s64, 0, old - sectors - SECTORS_CACHE); } while ((v = atomic64_cmpxchg(&c->sectors_available, - old, new)) != old); + old, old - get)) != old); + + stats->available_cache += get; - stats->available_cache += old - new; out: stats->available_cache -= sectors; stats->online_reserved += sectors; res->sectors += sectors; + bch2_disk_reservations_verify(c, flags); bch2_fs_stats_verify(c); lg_local_unlock(&c->usage_lock); return 0; @@ -738,6 +766,8 @@ recalculate: stats->online_reserved += sectors; res->sectors += sectors; ret = 0; + + bch2_disk_reservations_verify(c, flags); } else { atomic64_set(&c->sectors_available, sectors_available); ret = -ENOSPC; diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 141aa4a..7d2b08c 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -95,37 +95,39 @@ static inline bool bucket_unused(struct bucket_mark mark) /* Per device stats: */ struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *); -struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); +struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *); static inline u64 __dev_buckets_available(struct bch_dev *ca, struct bch_dev_usage stats) { - return max_t(s64, 0, - ca->mi.nbuckets - ca->mi.first_bucket - - stats.buckets[S_META] - - stats.buckets[S_DIRTY] - - stats.buckets_alloc); + u64 total = ca->mi.nbuckets - ca->mi.first_bucket; + + if (WARN_ONCE(stats.buckets_unavailable > total, + "buckets_unavailable overflow\n")) + return 0; + + return total - stats.buckets_unavailable; } /* * Number of reclaimable buckets - only for use by the allocator thread: */ -static inline u64 dev_buckets_available(struct bch_dev *ca) +static inline u64 dev_buckets_available(struct bch_fs *c, struct bch_dev *ca) { - return __dev_buckets_available(ca, bch2_dev_usage_read(ca)); + return __dev_buckets_available(ca, bch2_dev_usage_read(c, ca)); } static inline u64 __dev_buckets_free(struct bch_dev *ca, - struct bch_dev_usage stats) + struct bch_dev_usage stats) { return __dev_buckets_available(ca, stats) + fifo_used(&ca->free[RESERVE_NONE]) + fifo_used(&ca->free_inc); } -static inline u64 dev_buckets_free(struct bch_dev *ca) +static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca) { - return __dev_buckets_free(ca, bch2_dev_usage_read(ca)); + return __dev_buckets_free(ca, bch2_dev_usage_read(c, ca)); } /* Cache set stats: */ @@ -133,7 +135,7 @@ static inline u64 dev_buckets_free(struct bch_dev *ca) struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *); struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *); void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, - struct disk_reservation *, struct gc_pos); + struct disk_reservation *, struct gc_pos); struct fs_usage_sum { u64 data; @@ 
-155,11 +157,18 @@ static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats) return sum; } +#define RESERVE_FACTOR 6 + +static u64 reserve_factor(u64 r) +{ + return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); +} + static inline u64 __bch2_fs_sectors_used(struct bch_fs *c) { struct fs_usage_sum sum = __fs_usage_sum(__bch2_fs_usage_read(c)); - return sum.data + sum.reserved + (sum.reserved >> 7); + return sum.data + reserve_factor(sum.reserved); } static inline u64 bch2_fs_sectors_used(struct bch_fs *c) @@ -184,30 +193,35 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m, void bch2_bucket_seq_cleanup(struct bch_fs *); -bool bch2_invalidate_bucket(struct bch_dev *, struct bucket *, - struct bucket_mark *); -bool bch2_mark_alloc_bucket_startup(struct bch_dev *, struct bucket *); -void bch2_mark_free_bucket(struct bch_dev *, struct bucket *); -void bch2_mark_alloc_bucket(struct bch_dev *, struct bucket *, bool); -void bch2_mark_metadata_bucket(struct bch_dev *, struct bucket *, - enum bucket_data_type, bool); +bool bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, + struct bucket *, struct bucket_mark *); +bool bch2_mark_alloc_bucket_startup(struct bch_fs *, struct bch_dev *, + struct bucket *); +void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, + struct bucket *, bool, + struct gc_pos, unsigned); +void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, + struct bucket *, enum bucket_data_type, + struct gc_pos, unsigned); #define BCH_BUCKET_MARK_NOATOMIC (1 << 0) -#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 1) -#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 2) - -void __bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, - struct bch_fs_usage *, u64, unsigned); +#define BCH_BUCKET_MARK_MAY_MAKE_UNAVAILABLE (1 << 1) +#define BCH_BUCKET_MARK_GC_WILL_VISIT (1 << 2) +#define BCH_BUCKET_MARK_GC_LOCK_HELD (1 << 3) -void bch2_gc_mark_key(struct bch_fs *, struct bkey_s_c, - s64, bool, unsigned); -void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, - struct gc_pos, struct bch_fs_usage *, u64); +void bch2_mark_key(struct bch_fs *, struct bkey_s_c, s64, bool, struct gc_pos, + struct bch_fs_usage *, u64, unsigned); void bch2_recalc_sectors_available(struct bch_fs *); -void bch2_disk_reservation_put(struct bch_fs *, - struct disk_reservation *); +void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); + +static inline void bch2_disk_reservation_put(struct bch_fs *c, + struct disk_reservation *res) +{ + if (res->sectors) + __bch2_disk_reservation_put(c, res); +} #define BCH_DISK_RESERVATION_NOFAIL (1 << 0) #define BCH_DISK_RESERVATION_METADATA (1 << 1) diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 63f1b27..0bd8d2d 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -59,6 +59,7 @@ struct bch_dev_usage { u64 buckets[S_ALLOC_NR]; u64 buckets_cached; u64 buckets_alloc; + u64 buckets_unavailable; /* _compressed_ sectors: */ u64 sectors[S_ALLOC_NR]; @@ -79,13 +80,6 @@ struct bch_fs_usage { u64 available_cache; }; -struct bucket_heap_entry { - size_t bucket; - struct bucket_mark mark; -}; - -typedef HEAP(struct bucket_heap_entry) bucket_heap; - /* * A reservation for space on disk: */ @@ -95,4 +89,11 @@ struct disk_reservation { unsigned nr_replicas; }; +struct copygc_heap_entry { + u64 offset; + struct bucket_mark mark; +}; + +typedef HEAP(struct copygc_heap_entry) copygc_heap; + #endif /* _BUCKETS_TYPES_H */ diff --git 
a/libbcachefs/checksum.c b/libbcachefs/checksum.c index 01bdc86..0875585 100644 --- a/libbcachefs/checksum.c +++ b/libbcachefs/checksum.c @@ -141,10 +141,14 @@ static u64 bch2_checksum_init(unsigned type) switch (type) { case BCH_CSUM_NONE: return 0; - case BCH_CSUM_CRC32C: + case BCH_CSUM_CRC32C_NONZERO: return U32_MAX; - case BCH_CSUM_CRC64: + case BCH_CSUM_CRC64_NONZERO: return U64_MAX; + case BCH_CSUM_CRC32C: + return 0; + case BCH_CSUM_CRC64: + return 0; default: BUG(); } @@ -155,10 +159,14 @@ static u64 bch2_checksum_final(unsigned type, u64 crc) switch (type) { case BCH_CSUM_NONE: return 0; - case BCH_CSUM_CRC32C: + case BCH_CSUM_CRC32C_NONZERO: return crc ^ U32_MAX; - case BCH_CSUM_CRC64: + case BCH_CSUM_CRC64_NONZERO: return crc ^ U64_MAX; + case BCH_CSUM_CRC32C: + return crc; + case BCH_CSUM_CRC64: + return crc; default: BUG(); } @@ -169,8 +177,10 @@ static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t switch (type) { case BCH_CSUM_NONE: return 0; + case BCH_CSUM_CRC32C_NONZERO: case BCH_CSUM_CRC32C: return crc32c(crc, data, len); + case BCH_CSUM_CRC64_NONZERO: case BCH_CSUM_CRC64: return bch2_crc64_update(crc, data, len); default: @@ -243,6 +253,8 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, { switch (type) { case BCH_CSUM_NONE: + case BCH_CSUM_CRC32C_NONZERO: + case BCH_CSUM_CRC64_NONZERO: case BCH_CSUM_CRC32C: case BCH_CSUM_CRC64: { u64 crc = bch2_checksum_init(type); @@ -250,7 +262,7 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, crc = bch2_checksum_update(type, crc, data, len); crc = bch2_checksum_final(type, crc); - return (struct bch_csum) { .lo = crc }; + return (struct bch_csum) { .lo = cpu_to_le64(crc) }; } case BCH_CSUM_CHACHA20_POLY1305_80: @@ -281,28 +293,36 @@ void bch2_encrypt(struct bch_fs *c, unsigned type, do_encrypt(c->chacha20, nonce, data, len); } -struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, - struct nonce nonce, struct bio *bio) +static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, + struct nonce nonce, struct bio *bio, + struct bvec_iter *iter) { struct bio_vec bv; - struct bvec_iter iter; switch (type) { case BCH_CSUM_NONE: return (struct bch_csum) { 0 }; + case BCH_CSUM_CRC32C_NONZERO: + case BCH_CSUM_CRC64_NONZERO: case BCH_CSUM_CRC32C: case BCH_CSUM_CRC64: { u64 crc = bch2_checksum_init(type); - bio_for_each_contig_segment(bv, bio, iter) { +#ifdef CONFIG_HIGHMEM + __bio_for_each_segment(bv, bio, *iter, *iter) { void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; crc = bch2_checksum_update(type, crc, p, bv.bv_len); kunmap_atomic(p); } - +#else + __bio_for_each_contig_segment(bv, bio, *iter, *iter) + crc = bch2_checksum_update(type, crc, + page_address(bv.bv_page) + bv.bv_offset, + bv.bv_len); +#endif crc = bch2_checksum_final(type, crc); - return (struct bch_csum) { .lo = crc }; + return (struct bch_csum) { .lo = cpu_to_le64(crc) }; } case BCH_CSUM_CHACHA20_POLY1305_80: @@ -313,13 +333,19 @@ struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, gen_poly_key(c, desc, nonce); - bio_for_each_contig_segment(bv, bio, iter) { +#ifdef CONFIG_HIGHMEM + __bio_for_each_segment(bv, bio, *iter, *iter) { void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; crypto_shash_update(desc, p, bv.bv_len); kunmap_atomic(p); } - +#else + __bio_for_each_contig_segment(bv, bio, *iter, *iter) + crypto_shash_update(desc, + page_address(bv.bv_page) + bv.bv_offset, + bv.bv_len); +#endif crypto_shash_final(desc, digest); memcpy(&ret, digest, bch_crc_bytes[type]); 
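/*
 * [Editor's illustrative sketch - not part of the patch.] The hunks above
 * re-seed BCH_CSUM_CRC32C/CRC64 with 0 and drop the final inversion (the old
 * behaviour survives as the *_NONZERO variants), and the hunks below add
 * bch2_checksum_merge(), which combines two such checksums by extending the
 * first over len(B) zero bytes and xoring in the second. The standalone
 * userspace program below demonstrates why that works: with a zero seed and
 * no final xor a CRC is a linear function of its input, so
 * crc(A||B) == crc_extend(crc(A), zeros(len B)) ^ crc(B). The helper
 * crc32c_update() is a minimal bitwise stand-in for the kernel's crc32c();
 * main() and the sample strings are invented purely for the demonstration.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t crc32c_update(uint32_t crc, const void *data, size_t len)
{
	const uint8_t *p = data;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

int main(void)
{
	const char a[] = "extent part A", b[] = "extent part B";
	uint8_t zeros[sizeof(b) - 1] = { 0 };

	uint32_t crc_a = crc32c_update(0, a, sizeof(a) - 1);
	uint32_t crc_b = crc32c_update(0, b, sizeof(b) - 1);

	/* merge as bch2_checksum_merge() does: extend crc_a over zeroes, xor in crc_b */
	uint32_t merged = crc32c_update(crc_a, zeros, sizeof(zeros)) ^ crc_b;

	/* checksum of the concatenation A||B, computed by chaining in one pass */
	uint32_t whole = crc32c_update(crc_a, b, sizeof(b) - 1);

	assert(merged == whole);
	printf("merged %08x == whole %08x\n", (unsigned) merged, (unsigned) whole);
	return 0;
}
/*
 * This linearity is what lets bch2_rechecksum_bio() below verify and split an
 * extent's checksum with a single pass over the data whenever the old and new
 * checksum types match and are mergeable.
 */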
@@ -330,6 +356,14 @@ struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, } } +struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, + struct nonce nonce, struct bio *bio) +{ + struct bvec_iter iter = bio->bi_iter; + + return __bch2_checksum_bio(c, type, nonce, bio, &iter); +} + void bch2_encrypt_bio(struct bch_fs *c, unsigned type, struct nonce nonce, struct bio *bio) { @@ -343,12 +377,12 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type, sg_init_table(sgl, ARRAY_SIZE(sgl)); - bio_for_each_contig_segment(bv, bio, iter) { + bio_for_each_segment(bv, bio, iter) { if (sg == sgl + ARRAY_SIZE(sgl)) { sg_mark_end(sg - 1); do_encrypt_sg(c->chacha20, nonce, sgl, bytes); - le32_add_cpu(nonce.d, bytes / CHACHA20_BLOCK_SIZE); + nonce = nonce_add(nonce, bytes); bytes = 0; sg_init_table(sgl, ARRAY_SIZE(sgl)); @@ -357,13 +391,115 @@ void bch2_encrypt_bio(struct bch_fs *c, unsigned type, sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset); bytes += bv.bv_len; - } sg_mark_end(sg - 1); do_encrypt_sg(c->chacha20, nonce, sgl, bytes); } +static inline bool bch2_checksum_mergeable(unsigned type) +{ + + switch (type) { + case BCH_CSUM_NONE: + case BCH_CSUM_CRC32C: + case BCH_CSUM_CRC64: + return true; + default: + return false; + } +} + +static struct bch_csum bch2_checksum_merge(unsigned type, + struct bch_csum a, + struct bch_csum b, size_t b_len) +{ + BUG_ON(!bch2_checksum_mergeable(type)); + + while (b_len) { + unsigned b = min(b_len, PAGE_SIZE); + + a.lo = bch2_checksum_update(type, a.lo, + page_address(ZERO_PAGE(0)), b); + b_len -= b; + } + + a.lo ^= b.lo; + a.hi ^= b.hi; + return a; +} + +int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, + struct bversion version, + struct bch_extent_crc_unpacked crc_old, + struct bch_extent_crc_unpacked *crc_a, + struct bch_extent_crc_unpacked *crc_b, + unsigned len_a, unsigned len_b, + unsigned new_csum_type) +{ + struct bvec_iter iter = bio->bi_iter; + struct nonce nonce = extent_nonce(version, crc_old); + struct bch_csum merged = { 0 }; + struct crc_split { + struct bch_extent_crc_unpacked *crc; + unsigned len; + unsigned csum_type; + struct bch_csum csum; + } splits[3] = { + { crc_a, len_a, new_csum_type }, + { crc_b, len_b, new_csum_type }, + { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type }, + }, *i; + bool mergeable = crc_old.csum_type == new_csum_type && + bch2_checksum_mergeable(new_csum_type); + unsigned crc_nonce = crc_old.nonce; + + BUG_ON(len_a + len_b > bio_sectors(bio)); + BUG_ON(crc_old.uncompressed_size != bio_sectors(bio)); + BUG_ON(crc_old.compression_type); + BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) != + bch2_csum_type_is_encryption(new_csum_type)); + + for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { + iter.bi_size = i->len << 9; + if (mergeable || i->crc) + i->csum = __bch2_checksum_bio(c, i->csum_type, + nonce, bio, &iter); + else + bio_advance_iter(bio, &iter, i->len << 9); + nonce = nonce_add(nonce, i->len << 9); + } + + if (mergeable) + for (i = splits; i < splits + ARRAY_SIZE(splits); i++) + merged = bch2_checksum_merge(new_csum_type, merged, + i->csum, i->len << 9); + else + merged = bch2_checksum_bio(c, crc_old.csum_type, + extent_nonce(version, crc_old), bio); + + if (bch2_crc_cmp(merged, crc_old.csum)) + return -EIO; + + for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { + if (i->crc) + *i->crc = (struct bch_extent_crc_unpacked) { + .csum_type = i->csum_type, + .compressed_size = i->len, + .uncompressed_size = i->len, + .offset = 0, + .live_size = i->len, + 
.nonce = crc_nonce, + .csum = i->csum, + }; + + if (bch2_csum_type_is_encryption(new_csum_type)) + crc_nonce += i->len; + } + + return 0; +} + #ifdef __KERNEL__ int bch2_request_key(struct bch_sb *sb, struct bch_key *key) { diff --git a/libbcachefs/checksum.h b/libbcachefs/checksum.h index e8f6ef4..1a08941 100644 --- a/libbcachefs/checksum.h +++ b/libbcachefs/checksum.h @@ -2,6 +2,7 @@ #define _BCACHEFS_CHECKSUM_H #include "bcachefs.h" +#include "extents_types.h" #include "super-io.h" #include @@ -36,7 +37,14 @@ void bch2_encrypt(struct bch_fs *, unsigned, struct nonce, void *data, size_t); struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, - struct nonce, struct bio *); + struct nonce, struct bio *); + +int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion, + struct bch_extent_crc_unpacked, + struct bch_extent_crc_unpacked *, + struct bch_extent_crc_unpacked *, + unsigned, unsigned, unsigned); + void bch2_encrypt_bio(struct bch_fs *, unsigned, struct nonce, struct bio *); @@ -49,15 +57,16 @@ int bch2_enable_encryption(struct bch_fs *, bool); void bch2_fs_encryption_exit(struct bch_fs *); int bch2_fs_encryption_init(struct bch_fs *); -static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type) +static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, + bool data) { switch (type) { case BCH_CSUM_OPT_NONE: return BCH_CSUM_NONE; case BCH_CSUM_OPT_CRC32C: - return BCH_CSUM_CRC32C; + return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO; case BCH_CSUM_OPT_CRC64: - return BCH_CSUM_CRC64; + return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO; default: BUG(); } @@ -70,7 +79,7 @@ static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c) ? BCH_CSUM_CHACHA20_POLY1305_128 : BCH_CSUM_CHACHA20_POLY1305_80; - return bch2_csum_opt_to_type(c->opts.data_checksum); + return bch2_csum_opt_to_type(c->opts.data_checksum, true); } static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) @@ -78,7 +87,7 @@ static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) if (c->sb.encryption_type) return BCH_CSUM_CHACHA20_POLY1305_128; - return bch2_csum_opt_to_type(c->opts.metadata_checksum); + return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); } static inline enum bch_compression_type @@ -134,6 +143,21 @@ static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) return nonce; } +static inline struct nonce extent_nonce(struct bversion version, + struct bch_extent_crc_unpacked crc) +{ + unsigned size = crc.compression_type ? 
crc.uncompressed_size : 0; + struct nonce nonce = (struct nonce) {{ + [0] = cpu_to_le32(size << 22), + [1] = cpu_to_le32(version.lo), + [2] = cpu_to_le32(version.lo >> 32), + [3] = cpu_to_le32(version.hi| + (crc.compression_type << 24))^BCH_NONCE_EXTENT, + }}; + + return nonce_add(nonce, crc.nonce << 9); +} + static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key) { return le64_to_cpu(key->magic) != BCH_KEY_MAGIC; diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c index 7b45bb7..6407998 100644 --- a/libbcachefs/compress.c +++ b/libbcachefs/compress.c @@ -1,4 +1,5 @@ #include "bcachefs.h" +#include "checksum.h" #include "compress.h" #include "extents.h" #include "io.h" @@ -145,11 +146,11 @@ static inline void zlib_set_workspace(z_stream *strm, void *workspace) } static int __bio_uncompress(struct bch_fs *c, struct bio *src, - void *dst_data, struct bch_extent_crc128 crc) + void *dst_data, struct bch_extent_crc_unpacked crc) { struct bbuf src_data = { NULL }; size_t src_len = src->bi_iter.bi_size; - size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9; + size_t dst_len = crc.uncompressed_size << 9; int ret; src_data = bio_map_or_bounce(c, src, READ); @@ -212,65 +213,58 @@ err: } int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, - unsigned live_data_sectors, - struct bch_extent_crc128 crc) + struct bch_extent_crc_unpacked *crc) { - struct bbuf dst_data = { NULL }; - size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9; - int ret = -ENOMEM; + struct bbuf data = { NULL }; + size_t dst_len = crc->uncompressed_size << 9; - BUG_ON(DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS) > bio->bi_max_vecs); + /* bio must own its pages: */ + BUG_ON(!bio->bi_vcnt); + BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); - if (crc_uncompressed_size(NULL, &crc) > c->sb.encoded_extent_max || - crc_compressed_size(NULL, &crc) > c->sb.encoded_extent_max) + if (crc->uncompressed_size > c->sb.encoded_extent_max || + crc->compressed_size > c->sb.encoded_extent_max) { + bch_err(c, "error rewriting existing data: extent too big"); return -EIO; + } - dst_data = __bounce_alloc(c, dst_len, WRITE); - - ret = __bio_uncompress(c, bio, dst_data.b, crc); - if (ret) - goto err; - - while (bio->bi_vcnt < DIV_ROUND_UP(live_data_sectors, PAGE_SECTORS)) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; - - bv->bv_page = alloc_page(GFP_NOIO); - if (!bv->bv_page) - goto use_mempool; + data = __bounce_alloc(c, dst_len, WRITE); - bv->bv_len = PAGE_SIZE; - bv->bv_offset = 0; - bio->bi_vcnt++; + if (__bio_uncompress(c, bio, data.b, *crc)) { + bch_err(c, "error rewriting existing data: decompression error"); + bio_unmap_or_unbounce(c, data); + return -EIO; } - bio->bi_iter.bi_size = live_data_sectors << 9; -copy_data: - memcpy_to_bio(bio, bio->bi_iter, dst_data.b + (crc.offset << 9)); -err: - bio_unmap_or_unbounce(c, dst_data); - return ret; -use_mempool: /* - * We already allocated from mempool, we can't allocate from it again - * without freeing the pages we already allocated or else we could - * deadlock: + * might have to free existing pages and retry allocation from mempool - + * do this _after_ decompressing: */ + bch2_bio_alloc_more_pages_pool(c, bio, crc->live_size << 9); + + memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9)); - bch2_bio_free_pages_pool(c, bio); - bch2_bio_alloc_pages_pool(c, bio, live_data_sectors << 9); - goto copy_data; + crc->csum_type = 0; + crc->compression_type = 0; + crc->compressed_size = crc->live_size; + 
crc->uncompressed_size = crc->live_size; + crc->offset = 0; + crc->csum = (struct bch_csum) { 0, 0 }; + + bio_unmap_or_unbounce(c, data); + return 0; } int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, struct bio *dst, struct bvec_iter dst_iter, - struct bch_extent_crc128 crc) + struct bch_extent_crc_unpacked crc) { struct bbuf dst_data = { NULL }; - size_t dst_len = crc_uncompressed_size(NULL, &crc) << 9; + size_t dst_len = crc.uncompressed_size << 9; int ret = -ENOMEM; - if (crc_uncompressed_size(NULL, &crc) > c->sb.encoded_extent_max || - crc_compressed_size(NULL, &crc) > c->sb.encoded_extent_max) + if (crc.uncompressed_size > c->sb.encoded_extent_max || + crc.compressed_size > c->sb.encoded_extent_max) return -EIO; dst_data = dst_len == dst_iter.bi_size @@ -288,21 +282,25 @@ err: return ret; } -static int __bio_compress(struct bch_fs *c, - struct bio *dst, size_t *dst_len, - struct bio *src, size_t *src_len, - unsigned *compression_type) +static unsigned __bio_compress(struct bch_fs *c, + struct bio *dst, size_t *dst_len, + struct bio *src, size_t *src_len, + unsigned compression_type) { struct bbuf src_data = { NULL }, dst_data = { NULL }; unsigned pad; int ret = 0; + /* If it's only one block, don't bother trying to compress: */ + if (bio_sectors(src) <= c->opts.block_size) + goto err; + dst_data = bio_map_or_bounce(c, dst, WRITE); src_data = bio_map_or_bounce(c, src, READ); - switch (*compression_type) { + switch (compression_type) { case BCH_COMPRESSION_LZ4_OLD: - *compression_type = BCH_COMPRESSION_LZ4; + compression_type = BCH_COMPRESSION_LZ4; case BCH_COMPRESSION_LZ4: { void *workspace; @@ -403,19 +401,24 @@ zlib_err: if (dst_data.type != BB_NONE) memcpy_to_bio(dst, dst->bi_iter, dst_data.b); + + BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size); + BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size); + BUG_ON(*dst_len & (block_bytes(c) - 1)); + BUG_ON(*src_len & (block_bytes(c) - 1)); out: bio_unmap_or_unbounce(c, src_data); bio_unmap_or_unbounce(c, dst_data); - return ret; + return compression_type; err: - ret = -1; + compression_type = 0; goto out; } -void bch2_bio_compress(struct bch_fs *c, - struct bio *dst, size_t *dst_len, - struct bio *src, size_t *src_len, - unsigned *compression_type) +unsigned bch2_bio_compress(struct bch_fs *c, + struct bio *dst, size_t *dst_len, + struct bio *src, size_t *src_len, + unsigned compression_type) { unsigned orig_dst = dst->bi_iter.bi_size; unsigned orig_src = src->bi_iter.bi_size; @@ -423,29 +426,15 @@ void bch2_bio_compress(struct bch_fs *c, /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, c->sb.encoded_extent_max << 9); - /* Don't generate a bigger output than input: */ - dst->bi_iter.bi_size = - min(dst->bi_iter.bi_size, src->bi_iter.bi_size); + dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); + + compression_type = + __bio_compress(c, dst, dst_len, src, src_len, compression_type); - /* If it's only one block, don't bother trying to compress: */ - if (*compression_type != BCH_COMPRESSION_NONE && - bio_sectors(src) > c->opts.block_size && - !__bio_compress(c, dst, dst_len, src, src_len, compression_type)) - goto out; - - /* If compressing failed (didn't get smaller), just copy: */ - *compression_type = BCH_COMPRESSION_NONE; - *dst_len = *src_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); - bio_copy_data(dst, src); -out: dst->bi_iter.bi_size = orig_dst; src->bi_iter.bi_size = orig_src; - - BUG_ON(!*dst_len || *dst_len 
> dst->bi_iter.bi_size); - BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size); - BUG_ON(*dst_len & (block_bytes(c) - 1)); - BUG_ON(*src_len & (block_bytes(c) - 1)); + return compression_type; } /* doesn't write superblock: */ diff --git a/libbcachefs/compress.h b/libbcachefs/compress.h index ad1ba25..06fff6a 100644 --- a/libbcachefs/compress.h +++ b/libbcachefs/compress.h @@ -1,12 +1,14 @@ #ifndef _BCACHEFS_COMPRESS_H #define _BCACHEFS_COMPRESS_H +#include "extents_types.h" + int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, - unsigned, struct bch_extent_crc128); + struct bch_extent_crc_unpacked *); int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, - struct bvec_iter, struct bch_extent_crc128); -void bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, - struct bio *, size_t *, unsigned *); + struct bvec_iter, struct bch_extent_crc_unpacked); +unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, + struct bio *, size_t *, unsigned); int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned); void bch2_fs_compress_exit(struct bch_fs *); diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 7d2f5cc..6e79f49 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -19,6 +19,7 @@ #include "inode.h" #include "journal.h" #include "super-io.h" +#include "util.h" #include "xattr.h" #include @@ -155,6 +156,44 @@ unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c k) return nr_ptrs; } +unsigned bch2_extent_is_compressed(struct bkey_s_c k) +{ + struct bkey_s_c_extent e; + const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + unsigned ret = 0; + + switch (k.k->type) { + case BCH_EXTENT: + case BCH_EXTENT_CACHED: + e = bkey_s_c_to_extent(k); + + extent_for_each_ptr_crc(e, ptr, crc) + if (!ptr->cached && + crc.compression_type != BCH_COMPRESSION_NONE && + crc.compressed_size < crc.live_size) + ret = max_t(unsigned, ret, crc.compressed_size); + } + + return ret; +} + +bool bch2_extent_matches_ptr(struct bch_fs *c, struct bkey_s_c_extent e, + struct bch_extent_ptr m, u64 offset) +{ + const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + + extent_for_each_ptr_crc(e, ptr, crc) + if (ptr->dev == m.dev && + ptr->gen == m.gen && + (s64) ptr->offset + crc.offset - bkey_start_offset(e.k) == + (s64) m.offset - offset) + return ptr; + + return NULL; +} + /* Doesn't cleanup redundant crcs */ void __bch2_extent_drop_ptr(struct bkey_s_extent e, struct bch_extent_ptr *ptr) { @@ -186,24 +225,30 @@ found: bch2_extent_drop_ptr(e, ptr); } -/* returns true if equal */ -static bool crc_cmp(union bch_extent_crc *l, union bch_extent_crc *r) +static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, + struct bch_extent_crc_unpacked n) { - return extent_crc_type(l) == extent_crc_type(r) && - !memcmp(l, r, extent_entry_bytes(to_entry(l))); + return !u.compression_type && + u.csum_type && + u.uncompressed_size > u.live_size && + bch2_csum_type_is_encryption(u.csum_type) == + bch2_csum_type_is_encryption(n.csum_type); } -/* Increment pointers after @crc by crc's offset until the next crc entry: */ -void bch2_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_crc *crc) +bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent e, + struct bch_extent_crc_unpacked n) { - union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *i; - extent_for_each_entry_from(e, entry, extent_entry_next(to_entry(crc))) { - if (!extent_entry_is_ptr(entry)) - return; 
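Note on the compress.c hunks above: bch2_bio_compress() now returns the compression type it actually used instead of quietly falling back to a copy, a return of 0 (BCH_COMPRESSION_NONE) means the extent is stored uncompressed, and the single-block early-out has moved into __bio_compress(). The same policy as a standalone toy, with zlib standing in for the real LZ4/gzip paths and without the partial-source-consumption handling the real code has (build with -lz):

#include <stdio.h>
#include <string.h>
#include <zlib.h>

#define BLOCK_SIZE 512

/* Try to compress; if the result is not strictly smaller after rounding
 * up to the block size, report 0 so the caller stores the data as is. */
static unsigned try_compress(const unsigned char *src, size_t src_len,
			     unsigned char *dst, size_t *dst_len)
{
	uLongf out_len = *dst_len;

	if (src_len <= BLOCK_SIZE)	/* single block: don't bother */
		return 0;

	if (compress2(dst, &out_len, src, src_len,
		      Z_DEFAULT_COMPRESSION) != Z_OK)
		return 0;

	out_len = (out_len + BLOCK_SIZE - 1) & ~(uLongf) (BLOCK_SIZE - 1);
	if (out_len >= src_len)		/* didn't shrink */
		return 0;

	*dst_len = out_len;
	return 1;			/* "compressed" in this toy scheme */
}

int main(void)
{
	unsigned char src[8192], dst[16384];
	size_t dst_len = sizeof(dst);

	memset(src, 'x', sizeof(src));	/* highly compressible */

	unsigned type = try_compress(src, sizeof(src), dst, &dst_len);

	if (type)
		printf("compressed %zu -> %zu bytes (type %u)\n",
		       sizeof(src), dst_len, type);
	else
		printf("stored uncompressed\n");
	return 0;
}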
+ if (!n.csum_type) + return false; - entry->ptr.offset += crc_offset(crc); - } + extent_for_each_crc(e, crc, i) + if (can_narrow_crc(crc, n)) + return true; + + return false; } /* @@ -214,96 +259,50 @@ void bch2_extent_crc_narrow_pointers(struct bkey_s_extent e, union bch_extent_cr * not compressed, we can modify them to point to only the data that is * currently live (so that readers won't have to bounce) while we've got the * checksum we need: - * - * XXX: to guard against data being corrupted while in memory, instead of - * recomputing the checksum here, it would be better in the read path to instead - * of computing the checksum of the entire extent: - * - * | extent | - * - * compute the checksums of the live and dead data separately - * | dead data || live data || dead data | - * - * and then verify that crc_dead1 + crc_live + crc_dead2 == orig_crc, and then - * use crc_live here (that we verified was correct earlier) - * - * note: doesn't work with encryption */ -void bch2_extent_narrow_crcs(struct bkey_s_extent e) +bool bch2_extent_narrow_crcs(struct bkey_i_extent *e, + struct bch_extent_crc_unpacked n) { - union bch_extent_crc *crc; - bool have_wide = false, have_narrow = false; - struct bch_csum csum = { 0 }; - unsigned csum_type = 0; - - extent_for_each_crc(e, crc) { - if (crc_compression_type(crc) || - bch2_csum_type_is_encryption(crc_csum_type(crc))) - continue; - - if (crc_uncompressed_size(e.k, crc) != e.k->size) { - have_wide = true; - } else { - have_narrow = true; - csum = crc_csum(crc); - csum_type = crc_csum_type(crc); - } - } - - if (!have_wide || !have_narrow) - return; - - extent_for_each_crc(e, crc) { - if (crc_compression_type(crc)) - continue; - - if (crc_uncompressed_size(e.k, crc) != e.k->size) { - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: - BUG(); - case BCH_EXTENT_CRC32: - if (bch_crc_bytes[csum_type] > 4) - continue; - - bch2_extent_crc_narrow_pointers(e, crc); - crc->crc32._compressed_size = e.k->size - 1; - crc->crc32._uncompressed_size = e.k->size - 1; - crc->crc32.offset = 0; - crc->crc32.csum_type = csum_type; - crc->crc32.csum = csum.lo; + struct bch_extent_crc_unpacked u; + struct bch_extent_ptr *ptr; + union bch_extent_entry *i; + + /* Find a checksum entry that covers only live data: */ + if (!n.csum_type) + extent_for_each_crc(extent_i_to_s(e), u, i) + if (!u.compression_type && + u.csum_type && + u.live_size == u.uncompressed_size) { + n = u; break; - case BCH_EXTENT_CRC64: - if (bch_crc_bytes[csum_type] > 10) - continue; + } - bch2_extent_crc_narrow_pointers(e, crc); - crc->crc64._compressed_size = e.k->size - 1; - crc->crc64._uncompressed_size = e.k->size - 1; - crc->crc64.offset = 0; - crc->crc64.csum_type = csum_type; - crc->crc64.csum_lo = csum.lo; - crc->crc64.csum_hi = csum.hi; - break; - case BCH_EXTENT_CRC128: - if (bch_crc_bytes[csum_type] > 16) - continue; + if (!bch2_can_narrow_extent_crcs(extent_i_to_s_c(e), n)) + return false; - bch2_extent_crc_narrow_pointers(e, crc); - crc->crc128._compressed_size = e.k->size - 1; - crc->crc128._uncompressed_size = e.k->size - 1; - crc->crc128.offset = 0; - crc->crc128.csum_type = csum_type; - crc->crc128.csum = csum; - break; - } + BUG_ON(n.compression_type); + BUG_ON(n.offset); + BUG_ON(n.live_size != e->k.size); + + bch2_extent_crc_append(e, n); +restart_narrow_pointers: + extent_for_each_ptr_crc(extent_i_to_s(e), ptr, u) + if (can_narrow_crc(u, n)) { + ptr->offset += u.offset; + extent_ptr_append(e, *ptr); + __bch2_extent_drop_ptr(extent_i_to_s(e), ptr); + goto 
restart_narrow_pointers; } - } + + bch2_extent_drop_redundant_crcs(extent_i_to_s(e)); + return true; } void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e) { union bch_extent_entry *entry = e.v->start; union bch_extent_crc *crc, *prev = NULL; + struct bch_extent_crc_unpacked u, prev_u; while (entry != extent_entry_last(e)) { union bch_extent_entry *next = extent_entry_next(entry); @@ -313,6 +312,7 @@ void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e) goto next; crc = entry_to_crc(entry); + u = bch2_extent_crc_unpack(e.k, crc); if (next == extent_entry_last(e)) { /* crc entry with no pointers after it: */ @@ -324,20 +324,28 @@ void bch2_extent_drop_redundant_crcs(struct bkey_s_extent e) goto drop; } - if (prev && crc_cmp(crc, prev)) { + if (prev && !memcmp(&u, &prev_u, sizeof(u))) { /* identical to previous crc entry: */ goto drop; } if (!prev && - !crc_csum_type(crc) && - !crc_compression_type(crc)) { + !u.csum_type && + !u.compression_type) { /* null crc entry: */ - bch2_extent_crc_narrow_pointers(e, crc); + union bch_extent_entry *e2; + + extent_for_each_entry_from(e, e2, extent_entry_next(entry)) { + if (!extent_entry_is_ptr(e2)) + break; + + e2->ptr.offset += u.offset; + } goto drop; } prev = crc; + prev_u = u; next: entry = next; continue; @@ -453,7 +461,7 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf, { char *out = buf, *end = buf + size; const union bch_extent_entry *entry; - const union bch_extent_crc *crc; + struct bch_extent_crc_unpacked crc; const struct bch_extent_ptr *ptr; struct bch_dev *ca; bool first = true; @@ -468,13 +476,14 @@ static size_t extent_print_ptrs(struct bch_fs *c, char *buf, case BCH_EXTENT_ENTRY_crc32: case BCH_EXTENT_ENTRY_crc64: case BCH_EXTENT_ENTRY_crc128: - crc = entry_to_crc(entry); - - p("crc: c_size %u size %u offset %u csum %u compress %u", - crc_compressed_size(e.k, crc), - crc_uncompressed_size(e.k, crc), - crc_offset(crc), crc_csum_type(crc), - crc_compression_type(crc)); + crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry)); + + p("crc: c_size %u size %u offset %u nonce %u csum %u compress %u", + crc.compressed_size, + crc.uncompressed_size, + crc.offset, crc.nonce, + crc.csum_type, + crc.compression_type); break; case BCH_EXTENT_ENTRY_ptr: ptr = entry_to_ptr(entry); @@ -499,13 +508,24 @@ out: return out - buf; } +static inline bool dev_latency_better(struct bch_dev *dev1, + struct bch_dev *dev2) +{ + unsigned l1 = atomic_read(&dev1->latency[READ]); + unsigned l2 = atomic_read(&dev2->latency[READ]); + + /* Pick at random, biased in favor of the faster device: */ + + return bch2_rand_range(l1 + l2) > l1; +} + static void extent_pick_read_device(struct bch_fs *c, struct bkey_s_c_extent e, struct bch_devs_mask *avoid, struct extent_pick_ptr *pick) { - const union bch_extent_crc *crc; const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; extent_for_each_ptr_crc(e, ptr, crc) { struct bch_dev *ca = c->devs[ptr->dev]; @@ -516,12 +536,18 @@ static void extent_pick_read_device(struct bch_fs *c, if (ca->mi.state == BCH_MEMBER_STATE_FAILED) continue; - if (avoid && test_bit(ca->dev_idx, avoid->d)) - continue; + if (avoid) { + if (test_bit(ca->dev_idx, avoid->d)) + continue; - if (pick->ca && pick->ca->mi.tier < ca->mi.tier) - continue; + if (pick->ca && + test_bit(pick->ca->dev_idx, avoid->d)) + goto use; + } + if (pick->ca && !dev_latency_better(ca, pick->ca)) + continue; +use: if (!percpu_ref_tryget(&ca->io_ref)) continue; @@ -530,11 +556,9 @@ static void extent_pick_read_device(struct bch_fs *c, 
*pick = (struct extent_pick_ptr) { .ptr = *ptr, + .crc = crc, .ca = ca, }; - - if (e.k->size) - pick->crc = crc_to_128(e.k, crc); } } @@ -557,14 +581,17 @@ static const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const union bch_extent_entry *entry; const struct bch_extent_ptr *ptr; - const union bch_extent_crc *crc; const char *reason; - extent_for_each_entry(e, entry) + extent_for_each_entry(e, entry) { if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) return "invalid extent entry type"; - extent_for_each_ptr_crc(e, ptr, crc) { + if (extent_entry_is_crc(entry)) + return "has crc field"; + } + + extent_for_each_ptr(e, ptr) { reason = extent_ptr_invalid(c, e, ptr, c->opts.btree_node_size, true); @@ -572,9 +599,6 @@ static const char *bch2_btree_ptr_invalid(const struct bch_fs *c, return reason; } - if (crc) - return "has crc field"; - return NULL; } @@ -699,28 +723,28 @@ static bool __bch2_cut_front(struct bpos where, struct bkey_s k) __set_bkey_deleted(k.k); else if (bkey_extent_is_data(k.k)) { struct bkey_s_extent e = bkey_s_to_extent(k); - struct bch_extent_ptr *ptr; - union bch_extent_crc *crc, *prev_crc = NULL; + union bch_extent_entry *entry; + bool seen_crc = false; - extent_for_each_ptr_crc(e, ptr, crc) { - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: - ptr->offset += e.k->size - len; + extent_for_each_entry(e, entry) { + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + if (!seen_crc) + entry->ptr.offset += e.k->size - len; break; - case BCH_EXTENT_CRC32: - if (prev_crc != crc) - crc->crc32.offset += e.k->size - len; + case BCH_EXTENT_ENTRY_crc32: + entry->crc32.offset += e.k->size - len; break; - case BCH_EXTENT_CRC64: - if (prev_crc != crc) - crc->crc64.offset += e.k->size - len; + case BCH_EXTENT_ENTRY_crc64: + entry->crc64.offset += e.k->size - len; break; - case BCH_EXTENT_CRC128: - if (prev_crc != crc) - crc->crc128.offset += e.k->size - len; + case BCH_EXTENT_ENTRY_crc128: + entry->crc128.offset += e.k->size - len; break; } - prev_crc = crc; + + if (extent_entry_is_crc(entry)) + seen_crc = true; } } @@ -989,7 +1013,7 @@ static void bch2_add_sectors(struct extent_insert_state *s, return; bch2_mark_key(c, k, sectors, false, gc_pos_btree_node(b), - &s->stats, s->trans->journal_res.seq); + &s->stats, s->trans->journal_res.seq, 0); } static void bch2_subtract_sectors(struct extent_insert_state *s, @@ -1123,7 +1147,7 @@ static void extent_insert_committed(struct extent_insert_state *s) if (!(s->trans->flags & BTREE_INSERT_JOURNAL_REPLAY) && bkey_cmp(s->committed, insert->k.p) && - bkey_extent_is_compressed(bkey_i_to_s_c(insert))) { + bch2_extent_is_compressed(bkey_i_to_s_c(insert))) { /* XXX: possibly need to increase our reservation? */ bch2_cut_subtract_back(s, s->committed, bkey_i_to_s(&split.k)); @@ -1152,46 +1176,24 @@ done: s->trans->did_work = true; } -static enum extent_insert_hook_ret +static enum btree_insert_ret __extent_insert_advance_pos(struct extent_insert_state *s, struct bpos next_pos, struct bkey_s_c k) { struct extent_insert_hook *hook = s->trans->hook; - enum extent_insert_hook_ret ret; -#if 0 - /* - * Currently disabled for encryption - broken with fcollapse. 
Will have - * to reenable when versions are exposed for send/receive - versions - * will have to be monotonic then: - */ - if (k.k && k.k->size && - !bversion_zero(s->insert->k->k.version) && - bversion_cmp(k.k->version, s->insert->k->k.version) > 0) { - ret = BTREE_HOOK_NO_INSERT; - } else -#endif + enum btree_insert_ret ret; + if (hook) ret = hook->fn(hook, s->committed, next_pos, k, s->insert->k); else - ret = BTREE_HOOK_DO_INSERT; + ret = BTREE_INSERT_OK; EBUG_ON(bkey_deleted(&s->insert->k->k) || !s->insert->k->k.size); - switch (ret) { - case BTREE_HOOK_DO_INSERT: - break; - case BTREE_HOOK_NO_INSERT: - extent_insert_committed(s); - bch2_cut_subtract_front(s, next_pos, bkey_i_to_s(s->insert->k)); - - bch2_btree_iter_set_pos_same_leaf(s->insert->iter, next_pos); - break; - case BTREE_HOOK_RESTART_TRANS: - return ret; - } + if (ret == BTREE_INSERT_OK) + s->committed = next_pos; - s->committed = next_pos; return ret; } @@ -1199,39 +1201,28 @@ __extent_insert_advance_pos(struct extent_insert_state *s, * Update iter->pos, marking how much of @insert we've processed, and call hook * fn: */ -static enum extent_insert_hook_ret +static enum btree_insert_ret extent_insert_advance_pos(struct extent_insert_state *s, struct bkey_s_c k) { struct btree *b = s->insert->iter->nodes[0]; struct bpos next_pos = bpos_min(s->insert->k->k.p, k.k ? k.k->p : b->key.k.p); + enum btree_insert_ret ret; + + if (race_fault()) + return BTREE_INSERT_NEED_TRAVERSE; /* hole? */ if (k.k && bkey_cmp(s->committed, bkey_start_pos(k.k)) < 0) { - bool have_uncommitted = bkey_cmp(s->committed, - bkey_start_pos(&s->insert->k->k)) > 0; - - switch (__extent_insert_advance_pos(s, bkey_start_pos(k.k), - bkey_s_c_null)) { - case BTREE_HOOK_DO_INSERT: - break; - case BTREE_HOOK_NO_INSERT: - /* - * we had to split @insert and insert the committed - * part - need to bail out and recheck journal - * reservation/btree node before we advance pos past @k: - */ - if (have_uncommitted) - return BTREE_HOOK_NO_INSERT; - break; - case BTREE_HOOK_RESTART_TRANS: - return BTREE_HOOK_RESTART_TRANS; - } + ret = __extent_insert_advance_pos(s, bkey_start_pos(k.k), + bkey_s_c_null); + if (ret != BTREE_INSERT_OK) + return ret; } /* avoid redundant calls to hook fn: */ if (!bkey_cmp(s->committed, next_pos)) - return BTREE_HOOK_DO_INSERT; + return BTREE_INSERT_OK; return __extent_insert_advance_pos(s, next_pos, k); } @@ -1245,7 +1236,7 @@ extent_insert_check_split_compressed(struct extent_insert_state *s, unsigned sectors; if (overlap == BCH_EXTENT_OVERLAP_MIDDLE && - (sectors = bkey_extent_is_compressed(k))) { + (sectors = bch2_extent_is_compressed(k))) { int flags = BCH_DISK_RESERVATION_BTREE_LOCKS_HELD; if (s->trans->flags & BTREE_INSERT_NOFAIL) @@ -1277,6 +1268,7 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, struct btree_iter *iter = s->insert->iter; struct btree *b = iter->nodes[0]; struct btree_node_iter *node_iter = &iter->node_iters[0]; + enum btree_insert_ret ret; switch (overlap) { case BCH_EXTENT_OVERLAP_FRONT: @@ -1322,9 +1314,9 @@ extent_squash(struct extent_insert_state *s, struct bkey_i *insert, k.k->p = orig_pos; extent_save(b, node_iter, _k, k.k); - if (extent_insert_advance_pos(s, k.s_c) == - BTREE_HOOK_RESTART_TRANS) - return BTREE_INSERT_NEED_TRAVERSE; + ret = extent_insert_advance_pos(s, k.s_c); + if (ret != BTREE_INSERT_OK) + return ret; extent_insert_committed(s); /* @@ -1420,15 +1412,9 @@ bch2_delete_fixup_extent(struct extent_insert_state *s) if (ret != BTREE_INSERT_OK) goto stop; - switch 
(extent_insert_advance_pos(s, k.s_c)) { - case BTREE_HOOK_DO_INSERT: - break; - case BTREE_HOOK_NO_INSERT: - continue; - case BTREE_HOOK_RESTART_TRANS: - ret = BTREE_INSERT_NEED_TRAVERSE; + ret = extent_insert_advance_pos(s, k.s_c); + if (ret) goto stop; - } s->do_journal = true; @@ -1469,10 +1455,9 @@ next: bch2_btree_iter_set_pos_same_leaf(iter, s->committed); } - if (bkey_cmp(s->committed, insert->k.p) < 0 && - ret == BTREE_INSERT_OK && - extent_insert_advance_pos(s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS) - ret = BTREE_INSERT_NEED_TRAVERSE; + if (ret == BTREE_INSERT_OK && + bkey_cmp(s->committed, insert->k.p) < 0) + ret = extent_insert_advance_pos(s, bkey_s_c_null); stop: extent_insert_committed(s); @@ -1594,18 +1579,10 @@ bch2_insert_fixup_extent(struct btree_insert *trans, /* * Only call advance pos & call hook for nonzero size extents: - * If hook returned BTREE_HOOK_NO_INSERT, @insert->k no longer - * overlaps with @k: */ - switch (extent_insert_advance_pos(&s, k.s_c)) { - case BTREE_HOOK_DO_INSERT: - break; - case BTREE_HOOK_NO_INSERT: - continue; - case BTREE_HOOK_RESTART_TRANS: - ret = BTREE_INSERT_NEED_TRAVERSE; + ret = extent_insert_advance_pos(&s, k.s_c); + if (ret != BTREE_INSERT_OK) goto stop; - } if (k.k->size && (k.k->needs_whiteout || bset_written(b, bset(b, t)))) @@ -1623,10 +1600,9 @@ squash: goto stop; } - if (bkey_cmp(s.committed, insert->k->k.p) < 0 && - ret == BTREE_INSERT_OK && - extent_insert_advance_pos(&s, bkey_s_c_null) == BTREE_HOOK_RESTART_TRANS) - ret = BTREE_INSERT_NEED_TRAVERSE; + if (ret == BTREE_INSERT_OK && + bkey_cmp(s.committed, insert->k->k.p) < 0) + ret = extent_insert_advance_pos(&s, bkey_s_c_null); stop: extent_insert_committed(&s); /* @@ -1669,29 +1645,37 @@ static const char *bch2_extent_invalid(const struct bch_fs *c, case BCH_EXTENT_CACHED: { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const union bch_extent_entry *entry; - const union bch_extent_crc *crc; + struct bch_extent_crc_unpacked crc; const struct bch_extent_ptr *ptr; unsigned size_ondisk = e.k->size; const char *reason; + unsigned nonce = UINT_MAX; extent_for_each_entry(e, entry) { if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) return "invalid extent entry type"; if (extent_entry_is_crc(entry)) { - crc = entry_to_crc(entry); + crc = bch2_extent_crc_unpack(e.k, entry_to_crc(entry)); - if (crc_offset(crc) + e.k->size > - crc_uncompressed_size(e.k, crc)) + if (crc.offset + e.k->size > + crc.uncompressed_size) return "checksum offset + key size > uncompressed size"; - size_ondisk = crc_compressed_size(e.k, crc); + size_ondisk = crc.compressed_size; - if (!bch2_checksum_type_valid(c, crc_csum_type(crc))) + if (!bch2_checksum_type_valid(c, crc.csum_type)) return "invalid checksum type"; - if (crc_compression_type(crc) >= BCH_COMPRESSION_NR) + if (crc.compression_type >= BCH_COMPRESSION_NR) return "invalid compression type"; + + if (bch2_csum_type_is_encryption(crc.csum_type)) { + if (nonce == UINT_MAX) + nonce = crc.offset + crc.nonce; + else if (nonce != crc.offset + crc.nonce) + return "incorrect nonce"; + } } else { ptr = entry_to_ptr(entry); @@ -1864,102 +1848,75 @@ static unsigned PTR_TIER(struct bch_fs *c, } static void bch2_extent_crc_init(union bch_extent_crc *crc, - unsigned compressed_size, - unsigned uncompressed_size, - unsigned compression_type, - unsigned nonce, - struct bch_csum csum, unsigned csum_type) -{ - if (bch_crc_bytes[csum_type] <= 4 && - uncompressed_size <= CRC32_SIZE_MAX && - nonce <= CRC32_NONCE_MAX) { + struct bch_extent_crc_unpacked new) +{ 
+#define common_fields(_crc) \ + .csum_type = _crc.csum_type, \ + .compression_type = _crc.compression_type, \ + ._compressed_size = _crc.compressed_size - 1, \ + ._uncompressed_size = _crc.uncompressed_size - 1, \ + .offset = _crc.offset + + if (bch_crc_bytes[new.csum_type] <= 4 && + new.uncompressed_size <= CRC32_SIZE_MAX && + new.nonce <= CRC32_NONCE_MAX) { crc->crc32 = (struct bch_extent_crc32) { .type = 1 << BCH_EXTENT_ENTRY_crc32, - ._compressed_size = compressed_size - 1, - ._uncompressed_size = uncompressed_size - 1, - .offset = 0, - .compression_type = compression_type, - .csum_type = csum_type, - .csum = *((__le32 *) &csum.lo), + common_fields(new), + .csum = *((__le32 *) &new.csum.lo), }; return; } - if (bch_crc_bytes[csum_type] <= 10 && - uncompressed_size <= CRC64_SIZE_MAX && - nonce <= CRC64_NONCE_MAX) { + if (bch_crc_bytes[new.csum_type] <= 10 && + new.uncompressed_size <= CRC64_SIZE_MAX && + new.nonce <= CRC64_NONCE_MAX) { crc->crc64 = (struct bch_extent_crc64) { .type = 1 << BCH_EXTENT_ENTRY_crc64, - ._compressed_size = compressed_size - 1, - ._uncompressed_size = uncompressed_size - 1, - .offset = 0, - .nonce = nonce, - .compression_type = compression_type, - .csum_type = csum_type, - .csum_lo = csum.lo, - .csum_hi = *((__le16 *) &csum.hi), + common_fields(new), + .nonce = new.nonce, + .csum_lo = new.csum.lo, + .csum_hi = *((__le16 *) &new.csum.hi), }; return; } - if (bch_crc_bytes[csum_type] <= 16 && - uncompressed_size <= CRC128_SIZE_MAX && - nonce <= CRC128_NONCE_MAX) { + if (bch_crc_bytes[new.csum_type] <= 16 && + new.uncompressed_size <= CRC128_SIZE_MAX && + new.nonce <= CRC128_NONCE_MAX) { crc->crc128 = (struct bch_extent_crc128) { .type = 1 << BCH_EXTENT_ENTRY_crc128, - ._compressed_size = compressed_size - 1, - ._uncompressed_size = uncompressed_size - 1, - .offset = 0, - .nonce = nonce, - .compression_type = compression_type, - .csum_type = csum_type, - .csum = csum, + common_fields(new), + .nonce = new.nonce, + .csum = new.csum, }; return; } - +#undef common_fields BUG(); } void bch2_extent_crc_append(struct bkey_i_extent *e, - unsigned compressed_size, - unsigned uncompressed_size, - unsigned compression_type, - unsigned nonce, - struct bch_csum csum, unsigned csum_type) + struct bch_extent_crc_unpacked new) { - union bch_extent_crc *crc; + struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *i; - BUG_ON(compressed_size > uncompressed_size); - BUG_ON(uncompressed_size != e->k.size); - BUG_ON(!compressed_size || !uncompressed_size); + BUG_ON(new.compressed_size > new.uncompressed_size); + BUG_ON(new.live_size != e->k.size); + BUG_ON(!new.compressed_size || !new.uncompressed_size); /* * Look up the last crc entry, so we can check if we need to add * another: */ - extent_for_each_crc(extent_i_to_s(e), crc) + extent_for_each_crc(extent_i_to_s(e), crc, i) ; - if (!crc && !csum_type && !compression_type) - return; - - if (crc && - crc_compressed_size(&e->k, crc) == compressed_size && - crc_uncompressed_size(&e->k, crc) == uncompressed_size && - crc_offset(crc) == 0 && - crc_nonce(crc) == nonce && - crc_csum_type(crc) == csum_type && - crc_compression_type(crc) == compression_type && - crc_csum(crc).lo == csum.lo && - crc_csum(crc).hi == csum.hi) + if (!memcmp(&crc, &new, sizeof(crc))) return; - bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), - compressed_size, - uncompressed_size, - compression_type, - nonce, csum, csum_type); + bch2_extent_crc_init((void *) extent_entry_last(extent_i_to_s(e)), new); __extent_entry_push(e); } @@ 
-2011,16 +1968,22 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) } void bch2_extent_mark_replicas_cached(struct bch_fs *c, - struct bkey_s_extent e, - unsigned nr_cached) + struct bkey_s_extent e) { struct bch_extent_ptr *ptr; + unsigned tier = 0, nr_cached = 0, nr_good = 0; bool have_higher_tier; - unsigned tier = 0; - if (!nr_cached) + extent_for_each_ptr(e, ptr) + if (!ptr->cached && + c->devs[ptr->dev]->mi.state != BCH_MEMBER_STATE_FAILED) + nr_good++; + + if (nr_good <= c->opts.data_replicas) return; + nr_cached = nr_good - c->opts.data_replicas; + do { have_higher_tier = false; diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index 634159f..1ec2db5 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -3,7 +3,7 @@ #include "bcachefs.h" #include "bkey.h" -#include "io_types.h" +#include "extents_types.h" struct bch_fs; struct journal_res; @@ -38,11 +38,17 @@ bch2_insert_fixup_extent(struct btree_insert *, struct btree_insert_entry *); bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); -void bch2_extent_mark_replicas_cached(struct bch_fs *, - struct bkey_s_extent, unsigned); +void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent); + +const struct bch_extent_ptr * +bch2_extent_has_device(struct bkey_s_c_extent, unsigned); unsigned bch2_extent_nr_ptrs(struct bkey_s_c_extent); unsigned bch2_extent_nr_dirty_ptrs(struct bkey_s_c); +unsigned bch2_extent_is_compressed(struct bkey_s_c); + +bool bch2_extent_matches_ptr(struct bch_fs *, struct bkey_s_c_extent, + struct bch_extent_ptr, u64); static inline bool bkey_extent_is_data(const struct bkey *k) { @@ -67,6 +73,12 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k) } } +static inline bool bch2_extent_is_fully_allocated(struct bkey_s_c k) +{ + return bkey_extent_is_allocation(k.k) && + !bch2_extent_is_compressed(k); +} + static inline bool bkey_extent_is_cached(const struct bkey *k) { return k->type == BCH_EXTENT_CACHED; @@ -170,6 +182,8 @@ union bch_extent_crc { (struct bch_extent_ptr *) (_entry)); \ }) +/* checksum entries: */ + enum bch_extent_crc_type { BCH_EXTENT_CRC_NONE, BCH_EXTENT_CRC32, @@ -208,6 +222,50 @@ __extent_crc_type(const union bch_extent_crc *crc) : __extent_crc_type((union bch_extent_crc *) _crc); \ }) +static inline struct bch_extent_crc_unpacked +bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) +{ +#define common_fields(_crc) \ + .csum_type = _crc.csum_type, \ + .compression_type = _crc.compression_type, \ + .compressed_size = _crc._compressed_size + 1, \ + .uncompressed_size = _crc._uncompressed_size + 1, \ + .offset = _crc.offset, \ + .live_size = k->size + + switch (extent_crc_type(crc)) { + case BCH_EXTENT_CRC_NONE: + return (struct bch_extent_crc_unpacked) { + .compressed_size = k->size, + .uncompressed_size = k->size, + .live_size = k->size, + }; + case BCH_EXTENT_CRC32: + return (struct bch_extent_crc_unpacked) { + common_fields(crc->crc32), + .csum.lo = crc->crc32.csum, + }; + case BCH_EXTENT_CRC64: + return (struct bch_extent_crc_unpacked) { + common_fields(crc->crc64), + .nonce = crc->crc64.nonce, + .csum.lo = crc->crc64.csum_lo, + .csum.hi = crc->crc64.csum_hi, + }; + case BCH_EXTENT_CRC128: + return (struct bch_extent_crc_unpacked) { + common_fields(crc->crc128), + .nonce = crc->crc128.nonce, + .csum = crc->crc128.csum, + }; + default: + BUG(); + } +#undef common_fields +} + +/* Extent entry iteration: */ + #define extent_entry_next(_entry) \ ((typeof(_entry)) ((void *) (_entry) + 
extent_entry_bytes(_entry))) @@ -226,7 +284,7 @@ __extent_crc_type(const union bch_extent_crc *crc) /* Iterate over crcs only: */ -#define extent_crc_next(_e, _p) \ +#define __extent_crc_next(_e, _p) \ ({ \ typeof(&(_e).v->start[0]) _entry = _p; \ \ @@ -237,25 +295,41 @@ __extent_crc_type(const union bch_extent_crc *crc) entry_to_crc(_entry < extent_entry_last(_e) ? _entry : NULL); \ }) -#define extent_for_each_crc(_e, _crc) \ - for ((_crc) = extent_crc_next(_e, (_e).v->start); \ +#define __extent_for_each_crc(_e, _crc) \ + for ((_crc) = __extent_crc_next(_e, (_e).v->start); \ (_crc); \ - (_crc) = extent_crc_next(_e, extent_entry_next(to_entry(_crc)))) + (_crc) = __extent_crc_next(_e, extent_entry_next(to_entry(_crc)))) + +#define extent_crc_next(_e, _crc, _iter) \ +({ \ + extent_for_each_entry_from(_e, _iter, _iter) \ + if (extent_entry_is_crc(_iter)) { \ + (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_iter));\ + break; \ + } \ + \ + (_iter) < extent_entry_last(_e); \ +}) + +#define extent_for_each_crc(_e, _crc, _iter) \ + for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \ + (_iter) = (_e).v->start; \ + extent_crc_next(_e, _crc, _iter); \ + (_iter) = extent_entry_next(_iter)) /* Iterate over pointers, with crcs: */ -#define extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter) \ +#define extent_ptr_crc_next(_e, _ptr, _crc) \ ({ \ __label__ out; \ typeof(&(_e).v->start[0]) _entry; \ \ extent_for_each_entry_from(_e, _entry, to_entry(_ptr)) \ if (extent_entry_is_crc(_entry)) { \ - (_crc) = entry_to_crc(_entry); \ + (_crc) = bch2_extent_crc_unpack((_e).k, entry_to_crc(_entry));\ } else { \ _ptr = entry_to_ptr(_entry); \ - if (_filter) \ - goto out; \ + goto out; \ } \ \ _ptr = NULL; \ @@ -263,35 +337,26 @@ out: \ _ptr; \ }) -#define extent_for_each_ptr_crc_filter(_e, _ptr, _crc, _filter) \ - for ((_crc) = NULL, \ +#define extent_for_each_ptr_crc(_e, _ptr, _crc) \ + for ((_crc) = bch2_extent_crc_unpack((_e).k, NULL), \ (_ptr) = &(_e).v->start->ptr; \ - ((_ptr) = extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter));\ + ((_ptr) = extent_ptr_crc_next(_e, _ptr, _crc)); \ (_ptr)++) -#define extent_for_each_ptr_crc(_e, _ptr, _crc) \ - extent_for_each_ptr_crc_filter(_e, _ptr, _crc, true) - /* Iterate over pointers only, and from a given position: */ -#define extent_ptr_next_filter(_e, _ptr, _filter) \ +#define extent_ptr_next(_e, _ptr) \ ({ \ - typeof(__entry_to_crc(&(_e).v->start[0])) _crc; \ + struct bch_extent_crc_unpacked _crc; \ \ - extent_ptr_crc_next_filter(_e, _crc, _ptr, _filter); \ + extent_ptr_crc_next(_e, _ptr, _crc); \ }) -#define extent_ptr_next(_e, _ptr) \ - extent_ptr_next_filter(_e, _ptr, true) - -#define extent_for_each_ptr_filter(_e, _ptr, _filter) \ +#define extent_for_each_ptr(_e, _ptr) \ for ((_ptr) = &(_e).v->start->ptr; \ - ((_ptr) = extent_ptr_next_filter(_e, _ptr, _filter)); \ + ((_ptr) = extent_ptr_next(_e, _ptr)); \ (_ptr)++) -#define extent_for_each_ptr(_e, _ptr) \ - extent_for_each_ptr_filter(_e, _ptr, true) - #define extent_ptr_prev(_e, _ptr) \ ({ \ typeof(&(_e).v->start->ptr) _p; \ @@ -315,8 +380,8 @@ out: \ (_ptr); \ (_ptr) = extent_ptr_prev(_e, _ptr)) -void bch2_extent_crc_append(struct bkey_i_extent *, unsigned, unsigned, - unsigned, unsigned, struct bch_csum, unsigned); +void bch2_extent_crc_append(struct bkey_i_extent *, + struct bch_extent_crc_unpacked); static inline void __extent_entry_push(struct bkey_i_extent *e) { @@ -336,226 +401,26 @@ static inline void extent_ptr_append(struct bkey_i_extent *e, __extent_entry_push(e); } -static inline struct 
bch_extent_crc128 crc_to_128(const struct bkey *k, - const union bch_extent_crc *crc) +static inline struct bch_devs_list bch2_extent_devs(struct bkey_s_c_extent e) { - EBUG_ON(!k->size); - - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: - return (struct bch_extent_crc128) { - ._compressed_size = k->size - 1, - ._uncompressed_size = k->size - 1, - }; - case BCH_EXTENT_CRC32: - return (struct bch_extent_crc128) { - .type = 1 << BCH_EXTENT_ENTRY_crc128, - ._compressed_size = crc->crc32._compressed_size, - ._uncompressed_size = crc->crc32._uncompressed_size, - .offset = crc->crc32.offset, - .csum_type = crc->crc32.csum_type, - .compression_type = crc->crc32.compression_type, - .csum.lo = crc->crc32.csum, - }; - case BCH_EXTENT_CRC64: - return (struct bch_extent_crc128) { - .type = 1 << BCH_EXTENT_ENTRY_crc128, - ._compressed_size = crc->crc64._compressed_size, - ._uncompressed_size = crc->crc64._uncompressed_size, - .offset = crc->crc64.offset, - .nonce = crc->crc64.nonce, - .csum_type = crc->crc64.csum_type, - .compression_type = crc->crc64.compression_type, - .csum.lo = crc->crc64.csum_lo, - .csum.hi = crc->crc64.csum_hi, - }; - case BCH_EXTENT_CRC128: - return crc->crc128; - default: - BUG(); - } -} - -#define crc_compressed_size(_k, _crc) \ -({ \ - unsigned _size = 0; \ - \ - switch (extent_crc_type(_crc)) { \ - case BCH_EXTENT_CRC_NONE: \ - _size = ((const struct bkey *) (_k))->size; \ - break; \ - case BCH_EXTENT_CRC32: \ - _size = ((struct bch_extent_crc32 *) _crc) \ - ->_compressed_size + 1; \ - break; \ - case BCH_EXTENT_CRC64: \ - _size = ((struct bch_extent_crc64 *) _crc) \ - ->_compressed_size + 1; \ - break; \ - case BCH_EXTENT_CRC128: \ - _size = ((struct bch_extent_crc128 *) _crc) \ - ->_compressed_size + 1; \ - break; \ - } \ - _size; \ -}) - -#define crc_uncompressed_size(_k, _crc) \ -({ \ - unsigned _size = 0; \ - \ - switch (extent_crc_type(_crc)) { \ - case BCH_EXTENT_CRC_NONE: \ - _size = ((const struct bkey *) (_k))->size; \ - break; \ - case BCH_EXTENT_CRC32: \ - _size = ((struct bch_extent_crc32 *) _crc) \ - ->_uncompressed_size + 1; \ - break; \ - case BCH_EXTENT_CRC64: \ - _size = ((struct bch_extent_crc64 *) _crc) \ - ->_uncompressed_size + 1; \ - break; \ - case BCH_EXTENT_CRC128: \ - _size = ((struct bch_extent_crc128 *) _crc) \ - ->_uncompressed_size + 1; \ - break; \ - } \ - _size; \ -}) - -static inline unsigned crc_offset(const union bch_extent_crc *crc) -{ - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: - return 0; - case BCH_EXTENT_CRC32: - return crc->crc32.offset; - case BCH_EXTENT_CRC64: - return crc->crc64.offset; - case BCH_EXTENT_CRC128: - return crc->crc128.offset; - default: - BUG(); - } -} - -static inline unsigned crc_nonce(const union bch_extent_crc *crc) -{ - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: - case BCH_EXTENT_CRC32: - return 0; - case BCH_EXTENT_CRC64: - return crc->crc64.nonce; - case BCH_EXTENT_CRC128: - return crc->crc128.nonce; - default: - BUG(); - } -} - -static inline unsigned crc_csum_type(const union bch_extent_crc *crc) -{ - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: - return 0; - case BCH_EXTENT_CRC32: - return crc->crc32.csum_type; - case BCH_EXTENT_CRC64: - return crc->crc64.csum_type; - case BCH_EXTENT_CRC128: - return crc->crc128.csum_type; - default: - BUG(); - } -} - -static inline unsigned crc_compression_type(const union bch_extent_crc *crc) -{ - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: - return 0; - case BCH_EXTENT_CRC32: - return 
crc->crc32.compression_type; - case BCH_EXTENT_CRC64: - return crc->crc64.compression_type; - case BCH_EXTENT_CRC128: - return crc->crc128.compression_type; - default: - BUG(); - } -} - -static inline struct bch_csum crc_csum(const union bch_extent_crc *crc) -{ - switch (extent_crc_type(crc)) { - case BCH_EXTENT_CRC_NONE: - return (struct bch_csum) { 0 }; - case BCH_EXTENT_CRC32: - return (struct bch_csum) { .lo = crc->crc32.csum }; - case BCH_EXTENT_CRC64: - return (struct bch_csum) { - .lo = crc->crc64.csum_lo, - .hi = crc->crc64.csum_hi, - }; - case BCH_EXTENT_CRC128: - return crc->crc128.csum; - default: - BUG(); - } -} - -static inline unsigned bkey_extent_is_compressed(struct bkey_s_c k) -{ - struct bkey_s_c_extent e; + struct bch_devs_list ret = (struct bch_devs_list) { 0 }; const struct bch_extent_ptr *ptr; - const union bch_extent_crc *crc; - unsigned ret = 0; - switch (k.k->type) { - case BCH_EXTENT: - case BCH_EXTENT_CACHED: - e = bkey_s_c_to_extent(k); - - extent_for_each_ptr_crc(e, ptr, crc) - if (!ptr->cached && - crc_compression_type(crc) != BCH_COMPRESSION_NONE && - crc_compressed_size(e.k, crc) < k.k->size) - ret = max_t(unsigned, ret, - crc_compressed_size(e.k, crc)); - } + extent_for_each_ptr(e, ptr) + ret.devs[ret.nr++] = ptr->dev; return ret; } -static inline unsigned extent_current_nonce(struct bkey_s_c_extent e) -{ - const union bch_extent_crc *crc; - - extent_for_each_crc(e, crc) - if (bch2_csum_type_is_encryption(crc_csum_type(crc))) - return crc_offset(crc) + crc_nonce(crc); - - return 0; -} - -void bch2_extent_narrow_crcs(struct bkey_s_extent); +bool bch2_can_narrow_extent_crcs(struct bkey_s_c_extent, + struct bch_extent_crc_unpacked); +bool bch2_extent_narrow_crcs(struct bkey_i_extent *, struct bch_extent_crc_unpacked); void bch2_extent_drop_redundant_crcs(struct bkey_s_extent); void __bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *); void bch2_extent_drop_ptr(struct bkey_s_extent, struct bch_extent_ptr *); void bch2_extent_drop_ptr_idx(struct bkey_s_extent, unsigned); -const struct bch_extent_ptr * -bch2_extent_has_device(struct bkey_s_c_extent, unsigned); -struct bch_extent_ptr * -bch2_extent_find_ptr(struct bch_fs *, struct bkey_s_extent, - struct bch_extent_ptr); -struct bch_extent_ptr * -bch2_extent_find_matching_ptr(struct bch_fs *, struct bkey_s_extent, - struct bkey_s_c_extent); - bool bch2_cut_front(struct bpos, struct bkey_i *); bool bch2_cut_back(struct bpos, struct bkey *); void bch2_key_resize(struct bkey *, unsigned); diff --git a/libbcachefs/extents_types.h b/libbcachefs/extents_types.h new file mode 100644 index 0000000..15805cd --- /dev/null +++ b/libbcachefs/extents_types.h @@ -0,0 +1,27 @@ +#ifndef _BCACHEFS_EXTENTS_TYPES_H +#define _BCACHEFS_EXTENTS_TYPES_H + +#include "bcachefs_format.h" + +struct bch_extent_crc_unpacked { + u8 csum_type; + u8 compression_type; + + u16 compressed_size; + u16 uncompressed_size; + + u16 offset; + u16 live_size; + + u16 nonce; + + struct bch_csum csum; +}; + +struct extent_pick_ptr { + struct bch_extent_ptr ptr; + struct bch_extent_crc_unpacked crc; + struct bch_dev *ca; +}; + +#endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/libbcachefs/eytzinger.h b/libbcachefs/eytzinger.h index 04dcfc5..66fa227 100644 --- a/libbcachefs/eytzinger.h +++ b/libbcachefs/eytzinger.h @@ -80,7 +80,7 @@ static inline unsigned eytzinger1_prev(unsigned i, unsigned size) EBUG_ON(i >= size); if (eytzinger1_left_child(i) < size) { - i = eytzinger1_left_child(i); + i = eytzinger1_left_child(i) + 1; i <<= __fls(size) - 
__fls(i); i -= 1; @@ -163,38 +163,6 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) (_i) != 0; \ (_i) = eytzinger1_next((_i), (_size))) -#if 0 -void eytzinger0_test(void) -{ - unsigned i, j, size; - - for (size = 2; - size < 65536000; - size++) { - if (!(size % 4096)) - printk(KERN_INFO "tree size %u\n", size); - - assert(eytzinger1_prev(0, size) == eytzinger1_last(size)); - assert(eytzinger1_next(0, size) == eytzinger1_first(size)); - - assert(eytzinger1_prev(eytzinger1_first(size), size) == 0); - assert(eytzinger1_next(eytzinger1_last(size), size) == 0); - - eytzinger1_for_each(j, size) { - assert(from_inorder(i, size) == j); - assert(to_inorder(j, size) == i); - - if (j != eytzinger1_last(size)) { - unsigned next = eytzinger1_next(j, size); - - assert(eytzinger1_prev(next, size) == j); - } - } - } - -} -#endif - /* Zero based indexing version: */ static inline unsigned eytzinger0_child(unsigned i, unsigned child) @@ -214,27 +182,29 @@ static inline unsigned eytzinger0_right_child(unsigned i) return eytzinger0_child(i, 1); } -#if 0 static inline unsigned eytzinger0_first(unsigned size) { + return eytzinger1_first(size + 1) - 1; } static inline unsigned eytzinger0_last(unsigned size) { + return eytzinger1_last(size + 1) - 1; } static inline unsigned eytzinger0_next(unsigned i, unsigned size) { + return eytzinger1_next(i + 1, size + 1) - 1; } static inline unsigned eytzinger0_prev(unsigned i, unsigned size) { + return eytzinger1_prev(i + 1, size + 1) - 1; } -#endif static inline unsigned eytzinger0_extra(unsigned size) { - return (size + 1 - rounddown_pow_of_two(size)) << 1; + return eytzinger1_extra(size + 1); } static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, @@ -259,10 +229,41 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size)); } +#define eytzinger0_for_each(_i, _size) \ + for ((_i) = eytzinger0_first((_size)); \ + (_i) != -1; \ + (_i) = eytzinger0_next((_i), (_size))) + typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); +/* return greatest node <= @search, or -1 if not found */ +static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, + eytzinger_cmp_fn cmp, const void *search) +{ + unsigned i, n = 0; + + if (!nr) + return -1; + + do { + i = n; + n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); + } while (n < nr); + + if (n & 1) { + /* @i was greater than @search, return previous node: */ + + if (i == eytzinger0_first(nr)) + return -1; + + return eytzinger0_prev(i, nr); + } else { + return i; + } +} + static inline size_t eytzinger0_find(void *base, size_t nr, size_t size, - eytzinger_cmp_fn cmp, void *search) + eytzinger_cmp_fn cmp, const void *search) { size_t i = 0; int res; @@ -271,17 +272,6 @@ static inline size_t eytzinger0_find(void *base, size_t nr, size_t size, (res = cmp(search, base + i * size, size))) i = eytzinger0_child(i, res > 0); - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { - bool found1 = i < nr, found2 = false; - size_t j; - - for (j = 0; j < nr; j++) - if (!cmp(base + j * size, search, size)) - found2 = true; - - BUG_ON(found1 != found2); - } - return i; } diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index 8b41be8..298e359 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -26,9 +26,67 @@ #include #include -struct bio_set *bch2_writepage_bioset; -struct bio_set *bch2_dio_read_bioset; -struct bio_set *bch2_dio_write_bioset; +struct i_sectors_hook { + 
struct extent_insert_hook hook; + s64 sectors; + struct bch_inode_info *inode; +}; + +struct bchfs_write_op { + struct bch_inode_info *inode; + s64 sectors_added; + bool is_dio; + bool unalloc; + u64 new_i_size; + + /* must be last: */ + struct bch_write_op op; +}; + +static inline void bch2_fswrite_op_init(struct bchfs_write_op *op, + struct bch_inode_info *inode, + bool is_dio) +{ + op->inode = inode; + op->sectors_added = 0; + op->is_dio = is_dio; + op->unalloc = false; + op->new_i_size = U64_MAX; +} + +struct bch_writepage_io { + struct closure cl; + + /* must be last: */ + struct bchfs_write_op op; +}; + +struct dio_write { + struct closure cl; + struct kiocb *req; + struct bch_fs *c; + long written; + long error; + loff_t offset; + + struct disk_reservation res; + + struct iovec *iovec; + struct iovec inline_vecs[UIO_FASTIOV]; + struct iov_iter iter; + + struct task_struct *task; + + /* must be last: */ + struct bchfs_write_op iop; +}; + +struct dio_read { + struct closure cl; + struct kiocb *req; + long ret; + struct bch_read_bio rbio; +}; /* pagecache_block must be held */ static int write_invalidate_inode_pages_range(struct address_space *mapping, @@ -101,7 +159,7 @@ static inline void i_size_dirty_get(struct bch_inode_info *inode) /* i_sectors accounting: */ -static enum extent_insert_hook_ret +static enum btree_insert_ret i_sectors_hook_fn(struct extent_insert_hook *hook, struct bpos committed_pos, struct bpos next_pos, @@ -119,7 +177,7 @@ i_sectors_hook_fn(struct extent_insert_hook *hook, h->sectors += sectors * sign; - return BTREE_HOOK_DO_INSERT; + return BTREE_INSERT_OK; } static int inode_set_i_sectors_dirty(struct bch_inode_info *inode, @@ -208,7 +266,7 @@ struct bchfs_extent_trans_hook { bool need_inode_update; }; -static enum extent_insert_hook_ret +static enum btree_insert_ret bchfs_extent_update_hook(struct extent_insert_hook *hook, struct bpos committed_pos, struct bpos next_pos, @@ -224,6 +282,10 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook, u64 offset = min(next_pos.offset << 9, h->op->new_i_size); bool do_pack = false; + if (h->op->unalloc && + !bch2_extent_is_fully_allocated(k)) + return BTREE_INSERT_ENOSPC; + BUG_ON((next_pos.offset << 9) > round_up(offset, PAGE_SIZE)); /* XXX: inode->i_size locking */ @@ -232,7 +294,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook, if (!h->need_inode_update) { h->need_inode_update = true; - return BTREE_HOOK_RESTART_TRANS; + return BTREE_INSERT_NEED_TRAVERSE; } h->inode_u.bi_size = offset; @@ -247,7 +309,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook, if (sectors) { if (!h->need_inode_update) { h->need_inode_update = true; - return BTREE_HOOK_RESTART_TRANS; + return BTREE_INSERT_NEED_TRAVERSE; } h->inode_u.bi_sectors += sectors; @@ -267,7 +329,7 @@ bchfs_extent_update_hook(struct extent_insert_hook *hook, if (do_pack) bch2_inode_pack(&h->inode_p, &h->inode_u); - return BTREE_HOOK_DO_INSERT; + return BTREE_INSERT_OK; } static int bchfs_write_index_update(struct bch_write_op *wop) @@ -352,12 +414,16 @@ static int bchfs_write_index_update(struct bch_write_op *wop) BTREE_INSERT_NOFAIL|BTREE_INSERT_ATOMIC, BTREE_INSERT_ENTRY(&extent_iter, k)); } + + BUG_ON(bkey_cmp(extent_iter.pos, bkey_start_pos(&k->k))); + BUG_ON(!ret != !k->k.size); err: if (ret == -EINTR) continue; if (ret) break; + BUG_ON(bkey_cmp(extent_iter.pos, k->k.p) < 0); bch2_keylist_pop_front(keys); } while (!bch2_keylist_empty(keys)); @@ -748,8 +814,7 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, if 
(bkey_extent_is_allocation(k.k)) bch2_add_page_sectors(bio, k); - if (!bkey_extent_is_allocation(k.k) || - bkey_extent_is_compressed(k)) + if (!bch2_extent_is_fully_allocated(k)) bch2_mark_pages_unalloc(bio); if (pick.ca) { @@ -759,7 +824,8 @@ static void bchfs_read(struct bch_fs *c, struct btree_iter *iter, trace_read_split(&rbio->bio); } - bch2_read_extent(c, rbio, k, &pick, flags); + bch2_read_extent(c, rbio, bkey_s_c_to_extent(k), + &pick, flags); } else { zero_fill_bio(bio); @@ -963,22 +1029,20 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, alloc_io: w->io = container_of(bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, - bch2_writepage_bioset), + &c->writepage_bioset), struct bch_writepage_io, op.op.wbio.bio); closure_init(&w->io->cl, NULL); - w->io->op.inode = inode; - w->io->op.sectors_added = 0; - w->io->op.is_dio = false; + bch2_fswrite_op_init(&w->io->op, inode, false); bch2_write_op_init(&w->io->op.op, c, (struct disk_reservation) { .nr_replicas = c->opts.data_replicas, }, c->fastest_devs, - inode->ei_last_dirtied, + writepoint_hashed(inode->ei_last_dirtied), POS(inum, 0), &inode->ei_journal_seq, - BCH_WRITE_THROTTLE); + 0); w->io->op.op.index_update_fn = bchfs_write_index_update; } @@ -1409,7 +1473,7 @@ static int bch2_direct_IO_read(struct bch_fs *c, struct kiocb *req, bio = bio_alloc_bioset(GFP_KERNEL, iov_iter_npages(iter, BIO_MAX_PAGES), - bch2_dio_read_bioset); + &c->dio_read_bioset); bio->bi_end_io = bch2_direct_IO_read_endio; @@ -1541,20 +1605,19 @@ static void bch2_do_direct_IO_write(struct dio_write *dio) return; } - dio->iop.inode = inode; dio->iop.sectors_added = 0; - dio->iop.is_dio = true; - dio->iop.new_i_size = U64_MAX; bch2_write_op_init(&dio->iop.op, dio->c, dio->res, dio->c->fastest_devs, - (unsigned long) dio->task, + writepoint_hashed((unsigned long) dio->task), POS(inode->v.i_ino, (dio->offset + dio->written) >> 9), &inode->ei_journal_seq, - flags|BCH_WRITE_THROTTLE); + flags); dio->iop.op.index_update_fn = bchfs_write_index_update; - dio->res.sectors -= bio_sectors(bio); - dio->iop.op.res.sectors = bio_sectors(bio); + if (!dio->iop.unalloc) { + dio->res.sectors -= bio_sectors(bio); + dio->iop.op.res.sectors = bio_sectors(bio); + } task_io_account_write(bio->bi_iter.bi_size); @@ -1589,6 +1652,31 @@ static void bch2_dio_write_loop_async(struct closure *cl) } } +static int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, + u64 size) +{ + struct btree_iter iter; + struct bpos end = pos; + struct bkey_s_c k; + int ret = 0; + + end.offset += size; + + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, pos, + BTREE_ITER_WITH_HOLES, k) { + if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) + break; + + if (!bch2_extent_is_fully_allocated(k)) { + ret = -ENOSPC; + break; + } + } + bch2_btree_iter_unlock(&iter); + + return ret; +} + static int bch2_direct_IO_write(struct bch_fs *c, struct kiocb *req, struct file *file, struct bch_inode_info *inode, @@ -1610,17 +1698,18 @@ static int bch2_direct_IO_write(struct bch_fs *c, bio = bio_alloc_bioset(GFP_KERNEL, iov_iter_npages(iter, BIO_MAX_PAGES), - bch2_dio_write_bioset); + &c->dio_write_bioset); dio = container_of(bio, struct dio_write, iop.op.wbio.bio); - dio->req = req; - dio->c = c; - dio->written = 0; - dio->error = 0; - dio->offset = offset; - dio->iovec = NULL; - dio->iter = *iter; - dio->task = current; closure_init(&dio->cl, NULL); + dio->req = req; + dio->c = c; + dio->written = 0; + dio->error = 0; + dio->offset = offset; + dio->iovec = NULL; + dio->iter = *iter; + dio->task = current; + 
bch2_fswrite_op_init(&dio->iop, inode, true); if (offset + iter->count > inode->v.i_size) sync = true; @@ -1635,9 +1724,15 @@ static int bch2_direct_IO_write(struct bch_fs *c, */ ret = bch2_disk_reservation_get(c, &dio->res, iter->count >> 9, 0); if (unlikely(ret)) { - closure_debug_destroy(&dio->cl); - bio_put(bio); - return ret; + if (bch2_check_range_allocated(c, POS(inode->v.i_ino, + offset >> 9), + iter->count >> 9)) { + closure_debug_destroy(&dio->cl); + bio_put(bio); + return ret; + } + + dio->iop.unalloc = true; } inode_dio_begin(&inode->v); @@ -2318,7 +2413,7 @@ static long bch2_fallocate(struct bch_inode_info *inode, int mode, reservation.v.nr_replicas = bch2_extent_nr_dirty_ptrs(k); if (reservation.v.nr_replicas < replicas || - bkey_extent_is_compressed(k)) { + bch2_extent_is_compressed(k)) { ret = bch2_disk_reservation_get(c, &disk_res, sectors, 0); if (ret) @@ -2564,4 +2659,24 @@ loff_t bch2_llseek(struct file *file, loff_t offset, int whence) return -EINVAL; } +void bch2_fs_fsio_exit(struct bch_fs *c) +{ + bioset_exit(&c->dio_write_bioset); + bioset_exit(&c->dio_read_bioset); + bioset_exit(&c->writepage_bioset); +} + +int bch2_fs_fsio_init(struct bch_fs *c) +{ + if (bioset_init(&c->writepage_bioset, + 4, offsetof(struct bch_writepage_io, op.op.wbio.bio)) || + bioset_init(&c->dio_read_bioset, + 4, offsetof(struct dio_read, rbio.bio)) || + bioset_init(&c->dio_write_bioset, + 4, offsetof(struct dio_write, iop.op.wbio.bio))) + return -ENOMEM; + + return 0; +} + #endif /* NO_BCACHEFS_FS */ diff --git a/libbcachefs/fs-io.h b/libbcachefs/fs-io.h index 505cea7..30d1ea9 100644 --- a/libbcachefs/fs-io.h +++ b/libbcachefs/fs-io.h @@ -1,7 +1,11 @@ #ifndef _BCACHEFS_FS_IO_H #define _BCACHEFS_FS_IO_H +#ifndef NO_BCACHEFS_FS + #include "buckets.h" +#include "io_types.h" + #include int bch2_set_page_dirty(struct page *); @@ -35,60 +39,11 @@ int bch2_releasepage(struct page *, gfp_t); int bch2_migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); -struct i_sectors_hook { - struct extent_insert_hook hook; - s64 sectors; - struct bch_inode_info *inode; -}; - -struct bchfs_write_op { - struct bch_inode_info *inode; - s64 sectors_added; - bool is_dio; - u64 new_i_size; - - /* must be last: */ - struct bch_write_op op; -}; - -struct bch_writepage_io { - struct closure cl; - - /* must be last: */ - struct bchfs_write_op op; -}; - -extern struct bio_set *bch2_writepage_bioset; - -struct dio_write { - struct closure cl; - struct kiocb *req; - struct bch_fs *c; - long written; - long error; - loff_t offset; - - struct disk_reservation res; - - struct iovec *iovec; - struct iovec inline_vecs[UIO_FASTIOV]; - struct iov_iter iter; - - struct task_struct *task; - - /* must be last: */ - struct bchfs_write_op iop; -}; - -extern struct bio_set *bch2_dio_write_bioset; - -struct dio_read { - struct closure cl; - struct kiocb *req; - long ret; - struct bch_read_bio rbio; -}; - -extern struct bio_set *bch2_dio_read_bioset; +void bch2_fs_fsio_exit(struct bch_fs *); +int bch2_fs_fsio_init(struct bch_fs *); +#else +static inline void bch2_fs_fsio_exit(struct bch_fs *c) {} +static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; } +#endif #endif /* _BCACHEFS_FS_IO_H */ diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 081ae14..43688cd 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -654,17 +654,17 @@ static int bch2_fill_extent(struct fiemap_extent_info *info, if (bkey_extent_is_data(&k->k)) { struct bkey_s_c_extent e = bkey_i_to_s_c_extent(k); const struct 
bch_extent_ptr *ptr; - const union bch_extent_crc *crc; + struct bch_extent_crc_unpacked crc; int ret; extent_for_each_ptr_crc(e, ptr, crc) { int flags2 = 0; u64 offset = ptr->offset; - if (crc_compression_type(crc)) + if (crc.compression_type) flags2 |= FIEMAP_EXTENT_ENCODED; else - offset += crc_offset(crc); + offset += crc.offset; if ((offset & (PAGE_SECTORS - 1)) || (e.k->size & (PAGE_SECTORS - 1))) @@ -1336,12 +1336,6 @@ MODULE_ALIAS_FS("bcachefs"); void bch2_vfs_exit(void) { unregister_filesystem(&bcache_fs_type); - if (bch2_dio_write_bioset) - bioset_free(bch2_dio_write_bioset); - if (bch2_dio_read_bioset) - bioset_free(bch2_dio_read_bioset); - if (bch2_writepage_bioset) - bioset_free(bch2_writepage_bioset); if (bch2_inode_cache) kmem_cache_destroy(bch2_inode_cache); } @@ -1354,20 +1348,6 @@ int __init bch2_vfs_init(void) if (!bch2_inode_cache) goto err; - bch2_writepage_bioset = - bioset_create(4, offsetof(struct bch_writepage_io, op.op.wbio.bio)); - if (!bch2_writepage_bioset) - goto err; - - bch2_dio_read_bioset = bioset_create(4, offsetof(struct dio_read, rbio.bio)); - if (!bch2_dio_read_bioset) - goto err; - - bch2_dio_write_bioset = - bioset_create(4, offsetof(struct dio_write, iop.op.wbio.bio)); - if (!bch2_dio_write_bioset) - goto err; - ret = register_filesystem(&bcache_fs_type); if (ret) goto err; diff --git a/libbcachefs/io.c b/libbcachefs/io.c index e5fc72d..0c41e41 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -29,6 +29,29 @@ /* Allocate, free from mempool: */ +void bch2_latency_acct(struct bch_dev *ca, unsigned submit_time_us, int rw) +{ + u64 now = local_clock(); + unsigned io_latency = (now >> 10) - submit_time_us; + atomic_t *latency = &ca->latency[rw]; + unsigned old, new, v = atomic_read(latency); + + do { + old = v; + + /* + * If the io latency was reasonably close to the current + * latency, skip doing the update and atomic operation - most of + * the time: + */ + if (abs((int) (old - io_latency)) < (old >> 1) && + now & ~(~0 << 5)) + break; + + new = ewma_add((u64) old, io_latency, 6); + } while ((v = atomic_cmpxchg(latency, old, new)) != old); +} + void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) { struct bio_vec *bv; @@ -63,10 +86,12 @@ pool_alloc: } void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, - size_t bytes) + size_t bytes) { bool using_mempool = false; + BUG_ON(DIV_ROUND_UP(bytes, PAGE_SIZE) > bio->bi_max_vecs); + bio->bi_iter.bi_size = bytes; while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) @@ -76,7 +101,35 @@ void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, mutex_unlock(&c->bio_bounce_pages_lock); } -/* Bios with headers */ +void bch2_bio_alloc_more_pages_pool(struct bch_fs *c, struct bio *bio, + size_t bytes) +{ + while (bio->bi_vcnt < DIV_ROUND_UP(bytes, PAGE_SIZE)) { + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; + + BUG_ON(bio->bi_vcnt >= bio->bi_max_vecs); + + bv->bv_page = alloc_page(GFP_NOIO); + if (!bv->bv_page) { + /* + * We already allocated from mempool, we can't allocate from it again + * without freeing the pages we already allocated or else we could + * deadlock: + */ + bch2_bio_free_pages_pool(c, bio); + bch2_bio_alloc_pages_pool(c, bio, bytes); + return; + } + + bv->bv_len = PAGE_SIZE; + bv->bv_offset = 0; + bio->bi_vcnt++; + } + + bio->bi_iter.bi_size = bytes; +} + +/* Writes */ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, enum bch_data_type type, @@ -137,17 +190,6 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, 
struct bch_fs *c, } } -/* IO errors */ - -/* Writes */ - -static struct workqueue_struct *index_update_wq(struct bch_write_op *op) -{ - return op->alloc_reserve == RESERVE_MOVINGGC - ? op->c->copygc_wq - : op->c->wq; -} - static void __bch2_write(struct closure *); static void bch2_write_done(struct closure *cl) @@ -176,7 +218,7 @@ static u64 keylist_sectors(struct keylist *keys) return ret; } -static int bch2_write_index_default(struct bch_write_op *op) +int bch2_write_index_default(struct bch_write_op *op) { struct keylist *keys = &op->insert_keys; struct btree_iter iter; @@ -202,7 +244,6 @@ static void bch2_write_index(struct closure *cl) struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; struct keylist *keys = &op->insert_keys; - unsigned i; op->flags |= BCH_WRITE_LOOPED; @@ -220,13 +261,7 @@ static void bch2_write_index(struct closure *cl) } } - for (i = 0; i < ARRAY_SIZE(op->open_buckets); i++) - if (op->open_buckets[i]) { - bch2_open_bucket_put(c, - c->open_buckets + - op->open_buckets[i]); - op->open_buckets[i] = 0; - } + bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets); if (!(op->flags & BCH_WRITE_DONE)) continue_at(cl, __bch2_write, op->io_wq); @@ -287,6 +322,8 @@ static void bch2_write_endio(struct bio *bio) struct bch_fs *c = wbio->c; struct bch_dev *ca = wbio->ca; + bch2_latency_acct(ca, wbio->submit_time_us, WRITE); + if (bch2_dev_io_err_on(bio->bi_error, ca, "data write")) { set_bit(ca->dev_idx, op->failed.d); set_closure_fn(cl, bch2_write_io_error, index_update_wq(op)); @@ -307,179 +344,364 @@ static void bch2_write_endio(struct bio *bio) closure_put(cl); } -static struct nonce extent_nonce(struct bversion version, - unsigned nonce, - unsigned uncompressed_size, - unsigned compression_type) -{ - return (struct nonce) {{ - [0] = cpu_to_le32((nonce << 12) | - (uncompressed_size << 22)), - [1] = cpu_to_le32(version.lo), - [2] = cpu_to_le32(version.lo >> 32), - [3] = cpu_to_le32(version.hi| - (compression_type << 24))^BCH_NONCE_EXTENT, - }}; -} - static void init_append_extent(struct bch_write_op *op, - unsigned compressed_size, - unsigned uncompressed_size, - unsigned compression_type, - unsigned nonce, - struct bch_csum csum, unsigned csum_type, - struct open_bucket *ob) + struct write_point *wp, + struct bversion version, + struct bch_extent_crc_unpacked crc) { struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top); - op->pos.offset += uncompressed_size; + op->pos.offset += crc.uncompressed_size; e->k.p = op->pos; - e->k.size = uncompressed_size; - e->k.version = op->version; + e->k.size = crc.uncompressed_size; + e->k.version = version; bkey_extent_set_cached(&e->k, op->flags & BCH_WRITE_CACHED); - bch2_extent_crc_append(e, compressed_size, - uncompressed_size, - compression_type, - nonce, csum, csum_type); - - bch2_alloc_sectors_append_ptrs(op->c, e, op->nr_replicas, - ob, compressed_size); + bch2_extent_crc_append(e, crc); + bch2_alloc_sectors_append_ptrs(op->c, wp, e, crc.compressed_size); bkey_extent_set_cached(&e->k, (op->flags & BCH_WRITE_CACHED)); bch2_keylist_push(&op->insert_keys); } -static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) +static struct bio *bch2_write_bio_alloc(struct bch_fs *c, + struct write_point *wp, + struct bio *src, + bool *page_alloc_failed) { - struct bch_fs *c = op->c; - struct bio *orig = &op->wbio.bio; - struct bio *bio; struct bch_write_bio *wbio; - unsigned key_to_write_offset = op->insert_keys.top_p - - op->insert_keys.keys_p; - struct 
bkey_i *key_to_write; - unsigned csum_type = op->csum_type; - unsigned compression_type = op->compression_type; - int ret, more; + struct bio *bio; + unsigned output_available = + min(wp->sectors_free << 9, src->bi_iter.bi_size); + unsigned pages = DIV_ROUND_UP(output_available, PAGE_SIZE); + + bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write); + wbio = wbio_init(bio); + wbio->bounce = true; + wbio->put_bio = true; + /* copy WRITE_SYNC flag */ + wbio->bio.bi_opf = src->bi_opf; + + /* + * We can't use mempool for more than c->sb.encoded_extent_max + * worth of pages, but we'd like to allocate more if we can: + */ + while (bio->bi_iter.bi_size < output_available) { + unsigned len = min_t(unsigned, PAGE_SIZE, + output_available - bio->bi_iter.bi_size); + struct page *p; + + p = alloc_page(GFP_NOIO); + if (!p) { + unsigned pool_max = + min_t(unsigned, output_available, + c->sb.encoded_extent_max << 9); + + if (bio_sectors(bio) < pool_max) + bch2_bio_alloc_pages_pool(c, bio, pool_max); + break; + } + + bio->bi_io_vec[bio->bi_vcnt++] = (struct bio_vec) { + .bv_page = p, + .bv_len = len, + .bv_offset = 0, + }; + bio->bi_iter.bi_size += len; + } - /* don't refetch csum type/compression type */ - barrier(); + *page_alloc_failed = bio->bi_vcnt < pages; + return bio; +} + +static int bch2_write_rechecksum(struct bch_fs *c, + struct bch_write_op *op, + unsigned new_csum_type) +{ + struct bio *bio = &op->wbio.bio; + struct bch_extent_crc_unpacked new_crc; + int ret; - BUG_ON(!bio_sectors(orig)); + /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ + + if (bch2_csum_type_is_encryption(op->crc.csum_type) != + bch2_csum_type_is_encryption(new_csum_type)) + new_csum_type = op->crc.csum_type; + + ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, + NULL, &new_crc, + op->crc.offset, op->crc.live_size, + new_csum_type); + if (ret) + return ret; + + bio_advance(bio, op->crc.offset << 9); + bio->bi_iter.bi_size = op->crc.live_size << 9; + op->crc = new_crc; + return 0; +} + +static int bch2_write_decrypt(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct nonce nonce = extent_nonce(op->version, op->crc); + struct bch_csum csum; + + if (!bch2_csum_type_is_encryption(op->crc.csum_type)) + return 0; + + /* + * If we need to decrypt data in the write path, we'll no longer be able + * to verify the existing checksum (poly1305 mac, in this case) after + * it's decrypted - this is the last point we'll be able to reverify the + * checksum: + */ + csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); + if (bch2_crc_cmp(op->crc.csum, csum)) + return -EIO; + + bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); + op->crc.csum_type = 0; + op->crc.csum = (struct bch_csum) { 0, 0 }; + return 0; +} + +static enum prep_encoded_ret { + PREP_ENCODED_OK, + PREP_ENCODED_ERR, + PREP_ENCODED_CHECKSUM_ERR, + PREP_ENCODED_DO_WRITE, +} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) +{ + struct bch_fs *c = op->c; + struct bio *bio = &op->wbio.bio; - /* Need to decompress data? 
*/ - if ((op->flags & BCH_WRITE_DATA_COMPRESSED) && - (crc_uncompressed_size(NULL, &op->crc) != op->size || - crc_compressed_size(NULL, &op->crc) > wp->sectors_free)) { - int ret; + if (!(op->flags & BCH_WRITE_DATA_ENCODED)) + return PREP_ENCODED_OK; - ret = bch2_bio_uncompress_inplace(c, orig, op->size, op->crc); - if (ret) - return ret; + BUG_ON(bio_sectors(bio) != op->crc.compressed_size); - op->flags &= ~BCH_WRITE_DATA_COMPRESSED; + /* Can we just write the entire extent as is? */ + if (op->crc.uncompressed_size == op->crc.live_size && + op->crc.compressed_size <= wp->sectors_free && + op->crc.compression_type == op->compression_type) { + if (!op->crc.compression_type && + op->csum_type != op->crc.csum_type && + bch2_write_rechecksum(c, op, op->csum_type)) + return PREP_ENCODED_CHECKSUM_ERR; + + return PREP_ENCODED_DO_WRITE; } - if (op->flags & BCH_WRITE_DATA_COMPRESSED) { - init_append_extent(op, - crc_compressed_size(NULL, &op->crc), - crc_uncompressed_size(NULL, &op->crc), - op->crc.compression_type, - op->crc.nonce, - op->crc.csum, - op->crc.csum_type, - wp->ob); - - bio = orig; - wbio = wbio_init(bio); - more = 0; - } else if (csum_type != BCH_CSUM_NONE || - compression_type != BCH_COMPRESSION_NONE) { - /* all units here in bytes */ - unsigned total_output = 0, output_available = - min(wp->sectors_free << 9, orig->bi_iter.bi_size); - unsigned crc_nonce = bch2_csum_type_is_encryption(csum_type) - ? op->nonce : 0; + /* + * If the data is compressed and we couldn't write the entire extent as + * is, we have to decompress it: + */ + if (op->crc.compression_type) { struct bch_csum csum; - struct nonce nonce; - bio = bio_alloc_bioset(GFP_NOIO, - DIV_ROUND_UP(output_available, PAGE_SIZE), - &c->bio_write); - wbio = wbio_init(bio); - wbio->bounce = true; - wbio->put_bio = true; - /* copy WRITE_SYNC flag */ - wbio->bio.bi_opf = orig->bi_opf; + if (bch2_write_decrypt(op)) + return PREP_ENCODED_CHECKSUM_ERR; - /* - * XXX: can't use mempool for more than - * BCH_COMPRESSED_EXTENT_MAX worth of pages - */ - bch2_bio_alloc_pages_pool(c, bio, output_available); + /* Last point we can still verify checksum: */ + csum = bch2_checksum_bio(c, op->crc.csum_type, + extent_nonce(op->version, op->crc), + bio); + if (bch2_crc_cmp(op->crc.csum, csum)) + return PREP_ENCODED_CHECKSUM_ERR; - do { - unsigned fragment_compression_type = compression_type; - size_t dst_len, src_len; + if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) + return PREP_ENCODED_ERR; + } - bch2_bio_compress(c, bio, &dst_len, - orig, &src_len, - &fragment_compression_type); + /* + * No longer have compressed data after this point - data might be + * encrypted: + */ - nonce = extent_nonce(op->version, - crc_nonce, - src_len >> 9, - fragment_compression_type); + /* + * If the data is checksummed and we're only writing a subset, + * rechecksum and adjust bio to point to currently live data: + */ + if ((op->crc.live_size != op->crc.uncompressed_size || + op->crc.csum_type != op->csum_type) && + bch2_write_rechecksum(c, op, op->csum_type)) + return PREP_ENCODED_CHECKSUM_ERR; - swap(bio->bi_iter.bi_size, dst_len); - bch2_encrypt_bio(c, csum_type, nonce, bio); + /* + * If we want to compress the data, it has to be decrypted: + */ + if ((op->compression_type || + bch2_csum_type_is_encryption(op->crc.csum_type) != + bch2_csum_type_is_encryption(op->csum_type)) && + bch2_write_decrypt(op)) + return PREP_ENCODED_CHECKSUM_ERR; - csum = bch2_checksum_bio(c, csum_type, nonce, bio); - swap(bio->bi_iter.bi_size, dst_len); + return PREP_ENCODED_OK; +} - 
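/*
 * Illustrative sketch, not part of the patch: bch2_write_prep_encoded_data()
 * above decides what to do with data that arrives already checksummed and/or
 * compressed (BCH_WRITE_DATA_ENCODED).  Roughly: write the encoded extent out
 * unchanged when it is fully live, still fits and matches the wanted
 * compression type; otherwise verify the existing checksum and decompress, or
 * re-checksum the live part.  The toy decision function below mirrors only
 * that shape; struct enc, its field names and the enum values are invented
 * for the example, not bcachefs types.
 */
#include <stdbool.h>
#include <stdio.h>

enum prep { PREP_NORMAL, PREP_AS_IS, PREP_DECOMPRESS, PREP_RECHECKSUM };

struct enc {
	bool		encoded;	/* data already carries crc/compression */
	unsigned	live_size, uncompressed_size, compressed_size;
	unsigned	compression_type, want_compression_type;
	unsigned	csum_type, want_csum_type;
};

static enum prep prep_encoded(const struct enc *e, unsigned sectors_free)
{
	if (!e->encoded)
		return PREP_NORMAL;

	/* whole encoded extent still live, still fits, same compression? */
	if (e->live_size == e->uncompressed_size &&
	    e->compressed_size <= sectors_free &&
	    e->compression_type == e->want_compression_type)
		return PREP_AS_IS;

	/* can't reuse compressed data as is: verify, then decompress */
	if (e->compression_type)
		return PREP_DECOMPRESS;

	/* uncompressed, but partially live or wrong checksum type */
	if (e->live_size != e->uncompressed_size ||
	    e->csum_type != e->want_csum_type)
		return PREP_RECHECKSUM;

	return PREP_NORMAL;
}

int main(void)
{
	static const char *names[] =
		{ "normal", "as-is", "decompress", "rechecksum" };
	struct enc e = {
		.encoded		= true,
		.live_size		= 8,
		.uncompressed_size	= 16,
		.compressed_size	= 8,
		.compression_type	= 1,
		.want_compression_type	= 1,
		.csum_type		= 1,
		.want_csum_type		= 1,
	};

	printf("decision: %s\n", names[prep_encoded(&e, 32)]); /* decompress */
	return 0;
}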
init_append_extent(op, - dst_len >> 9, src_len >> 9, - fragment_compression_type, - crc_nonce, csum, csum_type, wp->ob); +static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) +{ + struct bch_fs *c = op->c; + struct bio *src = &op->wbio.bio, *dst = src; + struct bvec_iter saved_iter; + struct bkey_i *key_to_write; + unsigned key_to_write_offset = op->insert_keys.top_p - + op->insert_keys.keys_p; + unsigned total_output = 0; + bool bounce = false, page_alloc_failed = false; + int ret, more = 0; - total_output += dst_len; - bio_advance(bio, dst_len); - bio_advance(orig, src_len); - } while (bio->bi_iter.bi_size && - orig->bi_iter.bi_size && - !bch2_keylist_realloc(&op->insert_keys, - op->inline_keys, - ARRAY_SIZE(op->inline_keys), - BKEY_EXTENT_U64s_MAX)); + BUG_ON(!bio_sectors(src)); - BUG_ON(total_output > output_available); + switch (bch2_write_prep_encoded_data(op, wp)) { + case PREP_ENCODED_OK: + break; + case PREP_ENCODED_ERR: + ret = -EIO; + goto err; + case PREP_ENCODED_CHECKSUM_ERR: + goto csum_err; + case PREP_ENCODED_DO_WRITE: + init_append_extent(op, wp, op->version, op->crc); + goto do_write; + } - memset(&bio->bi_iter, 0, sizeof(bio->bi_iter)); - bio->bi_iter.bi_size = total_output; + if (op->compression_type || + (op->csum_type && + !(op->flags & BCH_WRITE_PAGES_STABLE)) || + (bch2_csum_type_is_encryption(op->csum_type) && + !(op->flags & BCH_WRITE_PAGES_OWNED))) { + dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed); + bounce = true; + } - /* - * Free unneeded pages after compressing: - */ - while (bio->bi_vcnt * PAGE_SIZE > - round_up(bio->bi_iter.bi_size, PAGE_SIZE)) - mempool_free(bio->bi_io_vec[--bio->bi_vcnt].bv_page, - &c->bio_bounce_pages); + saved_iter = dst->bi_iter; - more = orig->bi_iter.bi_size != 0; - } else { - bio = bio_next_split(orig, wp->sectors_free, GFP_NOIO, - &c->bio_write); - wbio = wbio_init(bio); - wbio->put_bio = bio != orig; + do { + struct bch_extent_crc_unpacked crc = + (struct bch_extent_crc_unpacked) { 0 }; + struct bversion version = op->version; + size_t dst_len, src_len; + + if (page_alloc_failed && + bio_sectors(dst) < wp->sectors_free && + bio_sectors(dst) < c->sb.encoded_extent_max) + break; - init_append_extent(op, bio_sectors(bio), bio_sectors(bio), - compression_type, 0, - (struct bch_csum) { 0 }, csum_type, wp->ob); + BUG_ON(op->compression_type && + (op->flags & BCH_WRITE_DATA_ENCODED) && + bch2_csum_type_is_encryption(op->crc.csum_type)); + BUG_ON(op->compression_type && !bounce); + + crc.compression_type = op->compression_type + ? 
bch2_bio_compress(c, dst, &dst_len, src, &src_len, + op->compression_type) + : 0; + if (!crc.compression_type) { + dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); + dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); + + if (op->csum_type) + dst_len = min_t(unsigned, dst_len, + c->sb.encoded_extent_max << 9); + + if (bounce) { + swap(dst->bi_iter.bi_size, dst_len); + bio_copy_data(dst, src); + swap(dst->bi_iter.bi_size, dst_len); + } - more = bio != orig; + src_len = dst_len; + } + + BUG_ON(!src_len || !dst_len); + + if (bch2_csum_type_is_encryption(op->csum_type)) { + if (bversion_zero(version)) { + version.lo = atomic64_inc_return(&c->key_version) + 1; + } else { + crc.nonce = op->nonce; + op->nonce += src_len >> 9; + } + } + + if ((op->flags & BCH_WRITE_DATA_ENCODED) && + !crc.compression_type && + bch2_csum_type_is_encryption(op->crc.csum_type) == + bch2_csum_type_is_encryption(op->csum_type)) { + /* + * Note: when we're using rechecksum(), we need to be + * checksumming @src because it has all the data our + * existing checksum covers - if we bounced (because we + * were trying to compress), @dst will only have the + * part of the data the new checksum will cover. + * + * But normally we want to be checksumming post bounce, + * because part of the reason for bouncing is so the + * data can't be modified (by userspace) while it's in + * flight. + */ + if (bch2_rechecksum_bio(c, src, version, op->crc, + &crc, &op->crc, + src_len >> 9, + bio_sectors(src) - (src_len >> 9), + op->csum_type)) + goto csum_err; + } else { + if ((op->flags & BCH_WRITE_DATA_ENCODED) && + bch2_rechecksum_bio(c, src, version, op->crc, + NULL, &op->crc, + src_len >> 9, + bio_sectors(src) - (src_len >> 9), + op->crc.csum_type)) + goto csum_err; + + crc.compressed_size = dst_len >> 9; + crc.uncompressed_size = src_len >> 9; + crc.live_size = src_len >> 9; + + swap(dst->bi_iter.bi_size, dst_len); + bch2_encrypt_bio(c, op->csum_type, + extent_nonce(version, crc), dst); + crc.csum = bch2_checksum_bio(c, op->csum_type, + extent_nonce(version, crc), dst); + crc.csum_type = op->csum_type; + swap(dst->bi_iter.bi_size, dst_len); + } + + init_append_extent(op, wp, version, crc); + + if (dst != src) + bio_advance(dst, dst_len); + bio_advance(src, src_len); + total_output += dst_len; + } while (dst->bi_iter.bi_size && + src->bi_iter.bi_size && + wp->sectors_free && + !bch2_keylist_realloc(&op->insert_keys, + op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_EXTENT_U64s_MAX)); + + more = src->bi_iter.bi_size != 0; + + dst->bi_iter = saved_iter; + + if (!bounce && more) { + dst = bio_split(src, total_output >> 9, + GFP_NOIO, &c->bio_write); + wbio_init(dst)->put_bio = true; } + dst->bi_iter.bi_size = total_output; + + /* Free unneeded pages after compressing: */ + if (bounce) + while (dst->bi_vcnt > DIV_ROUND_UP(dst->bi_iter.bi_size, PAGE_SIZE)) + mempool_free(dst->bi_io_vec[--dst->bi_vcnt].bv_page, + &c->bio_bounce_pages); +do_write: /* might have done a realloc... 
*/ key_to_write = (void *) (op->insert_keys.keys_p + key_to_write_offset); @@ -487,30 +709,40 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp) ret = bch2_check_mark_super(c, bkey_i_to_s_c_extent(key_to_write), BCH_DATA_USER); if (ret) - return ret; + goto err; - bio->bi_end_io = bch2_write_endio; - bio->bi_private = &op->cl; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + dst->bi_end_io = bch2_write_endio; + dst->bi_private = &op->cl; + bio_set_op_attrs(dst, REQ_OP_WRITE, 0); - closure_get(bio->bi_private); + closure_get(dst->bi_private); - bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER, + bch2_submit_wbio_replicas(to_wbio(dst), c, BCH_DATA_USER, key_to_write); return more; +csum_err: + bch_err(c, "error verifying existing checksum while " + "rewriting existing data (memory corruption?)"); + ret = -EIO; +err: + if (bounce) { + bch2_bio_free_pages_pool(c, dst); + bio_put(dst); + } + + return ret; } static void __bch2_write(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; - unsigned open_bucket_nr = 0; struct write_point *wp; - struct open_bucket *ob; int ret; do { - if (open_bucket_nr == ARRAY_SIZE(op->open_buckets)) + if (op->open_buckets_nr + op->nr_replicas > + ARRAY_SIZE(op->open_buckets)) continue_at(cl, bch2_write_index, index_update_wq(op)); /* for the device pointers and 1 for the chksum */ @@ -520,11 +752,12 @@ static void __bch2_write(struct closure *cl) BKEY_EXTENT_U64s_MAX)) continue_at(cl, bch2_write_index, index_update_wq(op)); - wp = bch2_alloc_sectors_start(c, BCH_DATA_USER, + wp = bch2_alloc_sectors_start(c, op->devs, op->write_point, + &op->devs_have, op->nr_replicas, - c->opts.data_replicas_required, + op->nr_replicas_required, op->alloc_reserve, op->flags, (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? 
NULL : cl); @@ -565,14 +798,13 @@ static void __bch2_write(struct closure *cl) continue; } - ob = wp->ob; - - BUG_ON(ob - c->open_buckets == 0 || - ob - c->open_buckets > U8_MAX); - op->open_buckets[open_bucket_nr++] = ob - c->open_buckets; - ret = bch2_write_extent(op, wp); + BUG_ON(op->open_buckets_nr + wp->nr_ptrs_can_use > + ARRAY_SIZE(op->open_buckets)); + bch2_open_bucket_get(c, wp, + &op->open_buckets_nr, + op->open_buckets); bch2_alloc_sectors_done(c, wp); if (ret < 0) @@ -603,30 +835,6 @@ err: : bch2_write_done, index_update_wq(op)); } -void bch2_wake_delayed_writes(unsigned long data) -{ - struct bch_fs *c = (void *) data; - struct bch_write_op *op; - unsigned long flags; - - spin_lock_irqsave(&c->foreground_write_pd_lock, flags); - - while ((op = c->write_wait_head)) { - if (time_after(op->expires, jiffies)) { - mod_timer(&c->foreground_write_wakeup, op->expires); - break; - } - - c->write_wait_head = op->next; - if (!c->write_wait_head) - c->write_wait_tail = NULL; - - closure_put(&op->cl); - } - - spin_unlock_irqrestore(&c->foreground_write_pd_lock, flags); -} - /** * bch_write - handle a write to a cache device or flash only volume * @@ -646,9 +854,17 @@ void bch2_wake_delayed_writes(unsigned long data) void bch2_write(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); - struct bio *bio = &op->wbio.bio; struct bch_fs *c = op->c; - u64 inode = op->pos.inode; + + BUG_ON(!op->nr_replicas); + BUG_ON(!op->write_point.v); + BUG_ON(!bkey_cmp(op->pos, POS_MAX)); + BUG_ON(bio_sectors(&op->wbio.bio) > U16_MAX); + + memset(&op->failed, 0, sizeof(op->failed)); + + bch2_keylist_init(&op->insert_keys, op->inline_keys); + wbio_init(&op->wbio.bio)->put_bio = false; if (c->opts.nochanges || !percpu_ref_tryget(&c->writes)) { @@ -658,102 +874,11 @@ void bch2_write(struct closure *cl) closure_return(cl); } - if (bversion_zero(op->version) && - bch2_csum_type_is_encryption(op->csum_type)) - op->version.lo = - atomic64_inc_return(&c->key_version) + 1; - - bch2_increment_clock(c, bio_sectors(bio), WRITE); - - /* Don't call bch2_next_delay() if rate is >= 1 GB/sec */ - - if ((op->flags & BCH_WRITE_THROTTLE) && - c->foreground_write_ratelimit_enabled && - c->foreground_write_pd.rate.rate < (1 << 30)) { - unsigned long flags; - u64 delay; - - spin_lock_irqsave(&c->foreground_write_pd_lock, flags); - bch2_ratelimit_increment(&c->foreground_write_pd.rate, - bio->bi_iter.bi_size); - - delay = bch2_ratelimit_delay(&c->foreground_write_pd.rate); - - if (delay >= HZ / 100) { - trace_write_throttle(c, inode, bio, delay); - - closure_get(&op->cl); /* list takes a ref */ - - op->expires = jiffies + delay; - op->next = NULL; - - if (c->write_wait_tail) - c->write_wait_tail->next = op; - else - c->write_wait_head = op; - c->write_wait_tail = op; - - if (!timer_pending(&c->foreground_write_wakeup)) - mod_timer(&c->foreground_write_wakeup, - op->expires); - - spin_unlock_irqrestore(&c->foreground_write_pd_lock, - flags); - continue_at(cl, __bch2_write, index_update_wq(op)); - } - - spin_unlock_irqrestore(&c->foreground_write_pd_lock, flags); - } + bch2_increment_clock(c, bio_sectors(&op->wbio.bio), WRITE); continue_at_nobarrier(cl, __bch2_write, NULL); } -void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, - struct disk_reservation res, - struct bch_devs_mask *devs, - unsigned long write_point, - struct bpos pos, - u64 *journal_seq, unsigned flags) -{ - EBUG_ON(res.sectors && !res.nr_replicas); - - op->c = c; - op->io_wq = index_update_wq(op); - op->written = 
0; - op->error = 0; - op->flags = flags; - op->csum_type = bch2_data_checksum_type(c); - op->compression_type = - bch2_compression_opt_to_type(c->opts.compression); - op->nr_replicas = res.nr_replicas; - op->alloc_reserve = RESERVE_NONE; - op->nonce = 0; - op->pos = pos; - op->version = ZERO_VERSION; - op->res = res; - op->devs = devs; - op->write_point = write_point; - - if (journal_seq) { - op->journal_seq_p = journal_seq; - op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR; - } else { - op->journal_seq = 0; - } - - op->index_update_fn = bch2_write_index_default; - - memset(op->open_buckets, 0, sizeof(op->open_buckets)); - memset(&op->failed, 0, sizeof(op->failed)); - - bch2_keylist_init(&op->insert_keys, - op->inline_keys, - ARRAY_SIZE(op->inline_keys)); - - if (version_stress_test(c)) - get_random_bytes(&op->version, sizeof(op->version)); -} - /* Cache promotion on read */ struct promote_op { @@ -787,11 +912,20 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) trace_promote(&rbio->bio); /* we now own pages: */ + BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); swap(bio->bi_vcnt, rbio->bio.bi_vcnt); - memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, - sizeof(struct bio_vec) * bio->bi_vcnt); rbio->promote = NULL; + __bch2_write_op_init(&op->write.op, c); + + op->write.move_dev = -1; + op->write.op.devs = c->fastest_devs; + op->write.op.write_point = writepoint_hashed((unsigned long) current); + op->write.op.flags |= BCH_WRITE_ALLOC_NOWAIT; + op->write.op.flags |= BCH_WRITE_CACHED; + + bch2_migrate_write_init(&op->write, rbio); + closure_init(cl, NULL); closure_call(&op->write.op.cl, bch2_write, c->wq, cl); closure_return_with_destructor(cl, promote_done); @@ -801,57 +935,27 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) * XXX: multiple promotes can race with each other, wastefully. Keep a list of * outstanding promotes? 
*/ -static struct promote_op *promote_alloc(struct bch_fs *c, - struct bvec_iter iter, - struct bkey_s_c k, - struct extent_pick_ptr *pick, - bool read_full) +static struct promote_op *promote_alloc(struct bch_read_bio *rbio) { struct promote_op *op; struct bio *bio; - /* - * biovec needs to be big enough to hold decompressed data, if - * bch2_write_extent() has to decompress/recompress it: - */ - unsigned sectors = max_t(unsigned, k.k->size, - crc_uncompressed_size(NULL, &pick->crc)); - unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); + /* data might have to be decompressed in the write path: */ + unsigned pages = DIV_ROUND_UP(rbio->pick.crc.uncompressed_size, + PAGE_SECTORS); - op = kmalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO); + BUG_ON(!rbio->bounce); + BUG_ON(pages < rbio->bio.bi_vcnt); + + op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, + GFP_NOIO); if (!op) return NULL; bio = &op->write.op.wbio.bio; bio_init(bio, bio->bi_inline_vecs, pages); - bio->bi_iter = iter; - - if (pick->crc.compression_type) { - op->write.op.flags |= BCH_WRITE_DATA_COMPRESSED; - op->write.op.crc = pick->crc; - op->write.op.size = k.k->size; - } else if (read_full) { - /* - * Adjust bio to correspond to _live_ portion of @k - - * which might be less than what we're actually reading: - */ - bio->bi_iter.bi_size = sectors << 9; - bio_advance(bio, pick->crc.offset << 9); - BUG_ON(bio_sectors(bio) < k.k->size); - bio->bi_iter.bi_size = k.k->size << 9; - } else { - /* - * Set insert pos to correspond to what we're actually - * reading: - */ - op->write.op.pos.offset = iter.bi_sector; - } - bch2_migrate_write_init(c, &op->write, - c->fastest_devs, - k, NULL, - BCH_WRITE_ALLOC_NOWAIT| - BCH_WRITE_CACHED); - op->write.promote = true; + memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, + sizeof(struct bio_vec) * rbio->bio.bi_vcnt); return op; } @@ -863,9 +967,6 @@ static bool should_promote(struct bch_fs *c, if (!(flags & BCH_READ_MAY_PROMOTE)) return false; - if (flags & BCH_READ_IN_RETRY) - return false; - if (percpu_ref_is_dying(&c->writes)) return false; @@ -875,10 +976,20 @@ static bool should_promote(struct bch_fs *c, /* Read */ +static void bch2_read_nodecode_retry(struct bch_fs *, struct bch_read_bio *, + struct bvec_iter, u64, + struct bch_devs_mask *, unsigned); + #define READ_RETRY_AVOID 1 #define READ_RETRY 2 #define READ_ERR 3 +enum rbio_context { + RBIO_CONTEXT_NULL, + RBIO_CONTEXT_HIGHPRI, + RBIO_CONTEXT_UNBOUND, +}; + static inline struct bch_read_bio * bch2_rbio_parent(struct bch_read_bio *rbio) { @@ -887,14 +998,14 @@ bch2_rbio_parent(struct bch_read_bio *rbio) __always_inline static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, + enum rbio_context context, struct workqueue_struct *wq) { - - if (!wq || rbio->process_context) { + if (context <= rbio->context) { fn(&rbio->work); } else { rbio->work.func = fn; - rbio->process_context = true; + rbio->context = context; queue_work(wq, &rbio->work); } } @@ -932,7 +1043,7 @@ static void bch2_rbio_retry(struct work_struct *work) struct bch_fs *c = rbio->c; struct bvec_iter iter = rbio->bvec_iter; unsigned flags = rbio->flags; - u64 inode = rbio->inode; + u64 inode = rbio->pos.inode; struct bch_devs_mask avoid; trace_read_retry(&rbio->bio); @@ -942,15 +1053,24 @@ static void bch2_rbio_retry(struct work_struct *work) if (rbio->retry == READ_RETRY_AVOID) __set_bit(rbio->pick.ca->dev_idx, avoid.d); + if (rbio->promote) + kfree(rbio->promote); + rbio->promote = NULL; + if (rbio->split) rbio = bch2_rbio_free(rbio); else 
rbio->bio.bi_error = 0; - flags |= BCH_READ_MUST_CLONE; + if (!(flags & BCH_READ_NODECODE)) + flags |= BCH_READ_MUST_CLONE; flags |= BCH_READ_IN_RETRY; + flags &= ~BCH_READ_MAY_PROMOTE; - __bch2_read(c, rbio, iter, inode, &avoid, flags); + if (flags & BCH_READ_NODECODE) + bch2_read_nodecode_retry(c, rbio, iter, inode, &avoid, flags); + else + __bch2_read(c, rbio, iter, inode, &avoid, flags); } static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error) @@ -964,108 +1084,175 @@ static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, int error) bch2_rbio_parent(rbio)->bio.bi_error = error; bch2_rbio_done(rbio); } else { - bch2_rbio_punt(rbio, bch2_rbio_retry, rbio->c->wq); + bch2_rbio_punt(rbio, bch2_rbio_retry, + RBIO_CONTEXT_UNBOUND, system_unbound_wq); + } +} + +static void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) +{ + struct bch_fs *c = rbio->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_extent *e; + BKEY_PADDED(k) new; + struct bch_extent_crc_unpacked new_crc; + unsigned offset; + int ret; + + if (rbio->pick.crc.compression_type) + return; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, rbio->pos, + BTREE_ITER_INTENT); +retry: + k = bch2_btree_iter_peek(&iter); + if (IS_ERR_OR_NULL(k.k)) + goto out; + + if (!bkey_extent_is_data(k.k)) + goto out; + + bkey_reassemble(&new.k, k); + e = bkey_i_to_extent(&new.k); + + if (!bch2_extent_matches_ptr(c, extent_i_to_s_c(e), + rbio->pick.ptr, + rbio->pos.offset - + rbio->pick.crc.offset) || + bversion_cmp(e->k.version, rbio->version)) + goto out; + + /* Extent was merged? */ + if (bkey_start_offset(&e->k) < rbio->pos.offset || + e->k.p.offset > rbio->pos.offset + rbio->pick.crc.uncompressed_size) + goto out; + + /* The extent might have been partially overwritten since we read it: */ + offset = rbio->pick.crc.offset + (bkey_start_offset(&e->k) - rbio->pos.offset); + + if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, + rbio->pick.crc, NULL, &new_crc, + offset, e->k.size, + rbio->pick.crc.csum_type)) { + bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); + goto out; } + + if (!bch2_extent_narrow_crcs(e, new_crc)) + goto out; + + ret = bch2_btree_insert_at(c, NULL, NULL, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOWAIT, + BTREE_INSERT_ENTRY(&iter, &e->k_i)); + if (ret == -EINTR) + goto retry; +out: + bch2_btree_iter_unlock(&iter); +} + +static bool should_narrow_crcs(struct bkey_s_c_extent e, + struct extent_pick_ptr *pick, + unsigned flags) +{ + return !(flags & BCH_READ_IN_RETRY) && + bch2_can_narrow_extent_crcs(e, pick->crc); } -static int bch2_rbio_checksum_uncompress(struct bio *dst, - struct bch_read_bio *rbio) +/* Inner part that may run in process context */ +static void __bch2_read_endio(struct work_struct *work) { + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); struct bch_fs *c = rbio->c; - struct bio *src = &rbio->bio; + struct bio *src = &rbio->bio, *dst = &bch2_rbio_parent(rbio)->bio; struct bvec_iter dst_iter = rbio->bvec_iter; - struct nonce nonce = extent_nonce(rbio->version, - rbio->pick.crc.nonce, - crc_uncompressed_size(NULL, &rbio->pick.crc), - rbio->pick.crc.compression_type); + struct bch_extent_crc_unpacked crc = rbio->pick.crc; + struct nonce nonce = extent_nonce(rbio->version, crc); struct bch_csum csum; - int ret = 0; - /* - * reset iterator for checksumming and copying bounced data: here we've - * set rbio->compressed_size to the amount of data we actually read, 
- * which was not necessarily the full extent if we were only bouncing - * in order to promote - */ + /* Reset iterator for checksumming and copying bounced data: */ if (rbio->bounce) { - src->bi_iter.bi_size = crc_compressed_size(NULL, &rbio->pick.crc) << 9; - src->bi_iter.bi_idx = 0; - src->bi_iter.bi_bvec_done = 0; + src->bi_iter.bi_size = crc.compressed_size << 9; + src->bi_iter.bi_idx = 0; + src->bi_iter.bi_bvec_done = 0; } else { - src->bi_iter = rbio->bvec_iter; + src->bi_iter = rbio->bvec_iter; } - csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type, nonce, src); - if (bch2_dev_io_err_on(bch2_crc_cmp(rbio->pick.crc.csum, csum), - rbio->pick.ca, - "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)", - rbio->inode, (u64) rbio->bvec_iter.bi_sector << 9, - rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, - csum.hi, csum.lo, - rbio->pick.crc.csum_type)) - ret = -EIO; + csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); + if (bch2_crc_cmp(csum, rbio->pick.crc.csum)) + goto csum_err; - /* - * If there was a checksum error, still copy the data back - unless it - * was compressed, we don't want to decompress bad data: - */ - if (rbio->pick.crc.compression_type != BCH_COMPRESSION_NONE) { - if (!ret) { - bch2_encrypt_bio(c, rbio->pick.crc.csum_type, nonce, src); - ret = bch2_bio_uncompress(c, src, dst, - dst_iter, rbio->pick.crc); - if (ret) - __bcache_io_error(c, "decompression error"); - } - } else if (rbio->bounce) { - bio_advance(src, rbio->pick.crc.offset << 9); - - /* don't need to decrypt the entire bio: */ - BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); - src->bi_iter.bi_size = dst_iter.bi_size; + if (unlikely(rbio->narrow_crcs)) + bch2_rbio_narrow_crcs(rbio); - nonce = nonce_add(nonce, rbio->pick.crc.offset << 9); + if (rbio->flags & BCH_READ_NODECODE) + goto nodecode; - bch2_encrypt_bio(c, rbio->pick.crc.csum_type, - nonce, src); + /* Adjust crc to point to subset of data we want: */ + crc.offset += rbio->bvec_iter.bi_sector - rbio->pos.offset; + crc.live_size = bvec_iter_sectors(rbio->bvec_iter); - bio_copy_data_iter(dst, &dst_iter, - src, &src->bi_iter); + if (crc.compression_type != BCH_COMPRESSION_NONE) { + bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) + goto decompression_err; } else { - bch2_encrypt_bio(c, rbio->pick.crc.csum_type, nonce, src); - } + /* don't need to decrypt the entire bio: */ + nonce = nonce_add(nonce, crc.offset << 9); + bio_advance(src, crc.offset << 9); - return ret; -} + BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); + src->bi_iter.bi_size = dst_iter.bi_size; -/* Inner part that may run in process context */ -static void __bch2_read_endio(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - int ret; + bch2_encrypt_bio(c, crc.csum_type, nonce, src); - ret = bch2_rbio_checksum_uncompress(&bch2_rbio_parent(rbio)->bio, rbio); - if (ret) { - /* - * Checksum error: if the bio wasn't bounced, we may have been - * reading into buffers owned by userspace (that userspace can - * scribble over) - retry the read, bouncing it this time: - */ - if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { - rbio->flags |= BCH_READ_MUST_BOUNCE; - bch2_rbio_error(rbio, READ_RETRY, ret); - } else { - bch2_rbio_error(rbio, READ_RETRY_AVOID, ret); + if (rbio->bounce) { + struct bvec_iter src_iter = src->bi_iter; + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); } - return; } - if (rbio->promote) + if 
(rbio->promote) { + /* + * Re encrypt data we decrypted, so it's consistent with + * rbio->crc: + */ + bch2_encrypt_bio(c, crc.csum_type, nonce, src); promote_start(rbio->promote, rbio); - + } +nodecode: if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) bch2_rbio_done(rbio); + return; +csum_err: + /* + * Checksum error: if the bio wasn't bounced, we may have been + * reading into buffers owned by userspace (that userspace can + * scribble over) - retry the read, bouncing it this time: + */ + if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { + rbio->flags |= BCH_READ_MUST_BOUNCE; + bch2_rbio_error(rbio, READ_RETRY, -EIO); + return; + } + + bch2_dev_io_error(rbio->pick.ca, + "data checksum error, inode %llu offset %llu: expected %0llx%0llx got %0llx%0llx (type %u)", + rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, + rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, + csum.hi, csum.lo, crc.csum_type); + bch2_rbio_error(rbio, READ_RETRY_AVOID, -EIO); + return; +decompression_err: + __bcache_io_error(c, "decompression error, inode %llu offset %llu", + rbio->pos.inode, + (u64) rbio->bvec_iter.bi_sector); + bch2_rbio_error(rbio, READ_ERR, -EIO); + return; } static void bch2_read_endio(struct bio *bio) @@ -1074,6 +1261,9 @@ static void bch2_read_endio(struct bio *bio) container_of(bio, struct bch_read_bio, bio); struct bch_fs *c = rbio->c; struct workqueue_struct *wq = NULL; + enum rbio_context context = RBIO_CONTEXT_NULL; + + bch2_latency_acct(rbio->pick.ca, rbio->submit_time_us, READ); percpu_ref_put(&rbio->pick.ca->io_ref); @@ -1097,38 +1287,45 @@ static void bch2_read_endio(struct bio *bio) return; } - if (rbio->pick.crc.compression_type || + if (rbio->narrow_crcs || + rbio->pick.crc.compression_type || bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) - wq = system_unbound_wq; + context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; else if (rbio->pick.crc.csum_type) - wq = system_highpri_wq; + context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; - bch2_rbio_punt(rbio, __bch2_read_endio, wq); + bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); } int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, - struct bvec_iter iter, struct bkey_s_c k, + struct bvec_iter iter, struct bkey_s_c_extent e, struct extent_pick_ptr *pick, unsigned flags) { struct bch_read_bio *rbio; - struct promote_op *promote_op = NULL; - unsigned skip = iter.bi_sector - bkey_start_offset(k.k); - bool bounce = false, split, read_full = false; + bool split = false, bounce = false, read_full = false; + bool promote = false, narrow_crcs = false; + struct bpos pos = bkey_start_pos(e.k); int ret = 0; - bch2_increment_clock(c, bio_sectors(&orig->bio), READ); PTR_BUCKET(pick->ca, &pick->ptr)->prio[READ] = c->prio_clock[READ].hand; - EBUG_ON(bkey_start_offset(k.k) > iter.bi_sector || - k.k->p.offset < bvec_iter_end_sector(iter)); + narrow_crcs = should_narrow_crcs(e, pick, flags); + + if (flags & BCH_READ_NODECODE) { + BUG_ON(iter.bi_size < pick->crc.compressed_size << 9); + iter.bi_size = pick->crc.compressed_size << 9; + goto noclone; + } + + if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) + flags |= BCH_READ_MUST_BOUNCE; + + EBUG_ON(bkey_start_offset(e.k) > iter.bi_sector || + e.k->p.offset < bvec_iter_end_sector(iter)); - /* - * note: if compression_type and crc_type both == none, then - * compressed/uncompressed size is zero - */ if (pick->crc.compression_type != BCH_COMPRESSION_NONE || (pick->crc.csum_type != BCH_CSUM_NONE && - (bvec_iter_sectors(iter) != crc_uncompressed_size(NULL, 
&pick->crc) || + (bvec_iter_sectors(iter) != pick->crc.uncompressed_size || (bch2_csum_type_is_encryption(pick->crc.csum_type) && (flags & BCH_READ_USER_MAPPED)) || (flags & BCH_READ_MUST_BOUNCE)))) { @@ -1136,17 +1333,30 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, bounce = true; } - if (should_promote(c, pick, flags)) - promote_op = promote_alloc(c, iter, k, pick, read_full); - + promote = should_promote(c, pick, flags); /* could also set read_full */ - if (promote_op) + if (promote) bounce = true; + if (!read_full) { + EBUG_ON(pick->crc.compression_type); + EBUG_ON(pick->crc.csum_type && + (bvec_iter_sectors(iter) != pick->crc.uncompressed_size || + bvec_iter_sectors(iter) != pick->crc.live_size || + pick->crc.offset || + iter.bi_sector != pos.offset)); + + pick->ptr.offset += pick->crc.offset + + (iter.bi_sector - pos.offset); + pick->crc.compressed_size = bvec_iter_sectors(iter); + pick->crc.uncompressed_size = bvec_iter_sectors(iter); + pick->crc.offset = 0; + pick->crc.live_size = bvec_iter_sectors(iter); + pos.offset = iter.bi_sector; + } + if (bounce) { - unsigned sectors = read_full - ? (crc_compressed_size(NULL, &pick->crc) ?: k.k->size) - : bvec_iter_sectors(iter); + unsigned sectors = pick->crc.compressed_size; rbio = rbio_init(bio_alloc_bioset(GFP_NOIO, DIV_ROUND_UP(sectors, PAGE_SECTORS), @@ -1163,41 +1373,38 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, * from the whole bio, in which case we don't want to retry and * lose the error) */ - rbio = rbio_init(bio_clone_fast(&orig->bio, - GFP_NOIO, &c->bio_read_split)); + rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO, + &c->bio_read_split)); rbio->bio.bi_iter = iter; split = true; } else { +noclone: rbio = orig; rbio->bio.bi_iter = iter; split = false; BUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); } - rbio->c = c; + BUG_ON(bio_sectors(&rbio->bio) != pick->crc.compressed_size); + rbio->c = c; if (split) rbio->parent = orig; else rbio->end_io = orig->bio.bi_end_io; - rbio->bvec_iter = iter; + rbio->submit_time_us = local_clock_us(); rbio->flags = flags; rbio->bounce = bounce; rbio->split = split; - rbio->process_context = false; + rbio->narrow_crcs = narrow_crcs; rbio->retry = 0; + rbio->context = 0; + rbio->devs_have = bch2_extent_devs(e); rbio->pick = *pick; - /* - * crc.compressed_size will be 0 if there wasn't any checksum - * information, also we need to stash the original size of the bio if we - * bounced (which isn't necessarily the original key size, if we bounced - * only for promoting) - */ - rbio->pick.crc._compressed_size = bio_sectors(&rbio->bio) - 1; - rbio->version = k.k->version; - rbio->promote = promote_op; - rbio->inode = k.k->p.inode; + rbio->pos = pos; + rbio->version = e.k->version; + rbio->promote = promote ? 
promote_alloc(rbio) : NULL; INIT_WORK(&rbio->work, NULL); rbio->bio.bi_bdev = pick->ca->disk_sb.bdev; @@ -1205,16 +1412,10 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, rbio->bio.bi_iter.bi_sector = pick->ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; - if (read_full) - rbio->pick.crc.offset += skip; - else - rbio->bio.bi_iter.bi_sector += skip; - - rbio->submit_time_us = local_clock_us(); - if (bounce) trace_read_bounce(&rbio->bio); + bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); this_cpu_add(pick->ca->io_done->sectors[READ][BCH_DATA_USER], bio_sectors(&rbio->bio)); @@ -1223,7 +1424,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, } else { submit_bio_wait(&rbio->bio); - rbio->process_context = true; + rbio->context = RBIO_CONTEXT_UNBOUND; bch2_read_endio(&rbio->bio); ret = rbio->retry; @@ -1234,6 +1435,79 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, return ret; } +static void bch2_read_nodecode_retry(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, u64 inode, + struct bch_devs_mask *avoid, unsigned flags) +{ + struct extent_pick_ptr pick; + struct btree_iter iter; + BKEY_PADDED(k) tmp; + struct bkey_s_c k; + int ret; + + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, + POS(inode, bvec_iter.bi_sector), + BTREE_ITER_WITH_HOLES); +retry: + k = bch2_btree_iter_peek_with_holes(&iter); + if (btree_iter_err(k)) { + bch2_btree_iter_unlock(&iter); + goto err; + } + + bkey_reassemble(&tmp.k, k); + k = bkey_i_to_s_c(&tmp.k); + bch2_btree_iter_unlock(&iter); + + if (!bkey_extent_is_data(k.k) || + !bch2_extent_matches_ptr(c, bkey_i_to_s_c_extent(&tmp.k), + rbio->pick.ptr, + rbio->pos.offset - + rbio->pick.crc.offset) || + bkey_start_offset(k.k) != bvec_iter.bi_sector) + goto err; + + bch2_extent_pick_ptr(c, k, avoid, &pick); + if (IS_ERR(pick.ca)) { + bcache_io_error(c, &rbio->bio, "no device to read from"); + bio_endio(&rbio->bio); + return; + } + + if (!pick.ca) + goto err; + + if (pick.crc.compressed_size > bvec_iter_sectors(bvec_iter)) { + percpu_ref_put(&pick.ca->io_ref); + goto err; + + } + + ret = __bch2_read_extent(c, rbio, bvec_iter, bkey_s_c_to_extent(k), + &pick, flags); + switch (ret) { + case READ_RETRY_AVOID: + __set_bit(pick.ca->dev_idx, avoid->d); + case READ_RETRY: + goto retry; + case READ_ERR: + bio_endio(&rbio->bio); + return; + }; + + return; +err: + /* + * extent we wanted to read no longer exists, or + * was merged or partially overwritten (and thus + * possibly bigger than the memory that was + * originally allocated) + */ + rbio->bio.bi_error = -EINTR; + bio_endio(&rbio->bio); + return; +} + void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, u64 inode, struct bch_devs_mask *avoid, unsigned flags) @@ -1241,6 +1515,8 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, struct btree_iter iter; struct bkey_s_c k; int ret; + + EBUG_ON(flags & BCH_READ_NODECODE); retry: for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS(inode, bvec_iter.bi_sector), @@ -1277,7 +1553,8 @@ retry: } ret = __bch2_read_extent(c, rbio, fragment, - k, &pick, flags); + bkey_s_c_to_extent(k), + &pick, flags); switch (ret) { case READ_RETRY_AVOID: __set_bit(pick.ca->dev_idx, avoid->d); diff --git a/libbcachefs/io.h b/libbcachefs/io.h index 658c15a..bd0d7c4 100644 --- a/libbcachefs/io.h +++ b/libbcachefs/io.h @@ -2,6 +2,8 @@ #define _BCACHEFS_IO_H #include +#include "alloc.h" +#include "checksum.h" #include "io_types.h" #define to_wbio(_bio) \ @@ 
-12,6 +14,9 @@ void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); +void bch2_bio_alloc_more_pages_pool(struct bch_fs *, struct bio *, size_t); + +void bch2_latency_acct(struct bch_dev *, unsigned, int); void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, enum bch_data_type, const struct bkey_i *); @@ -20,14 +25,15 @@ enum bch_write_flags { BCH_WRITE_ALLOC_NOWAIT = (1 << 0), BCH_WRITE_CACHED = (1 << 1), BCH_WRITE_FLUSH = (1 << 2), - BCH_WRITE_DATA_COMPRESSED = (1 << 3), - BCH_WRITE_THROTTLE = (1 << 4), - BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 5), + BCH_WRITE_DATA_ENCODED = (1 << 3), + BCH_WRITE_PAGES_STABLE = (1 << 4), + BCH_WRITE_PAGES_OWNED = (1 << 5), + BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), /* Internal: */ - BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 6), - BCH_WRITE_DONE = (1 << 7), - BCH_WRITE_LOOPED = (1 << 8), + BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 7), + BCH_WRITE_DONE = (1 << 8), + BCH_WRITE_LOOPED = (1 << 9), }; static inline u64 *op_journal_seq(struct bch_write_op *op) @@ -36,11 +42,60 @@ static inline u64 *op_journal_seq(struct bch_write_op *op) ? op->journal_seq_p : &op->journal_seq; } -void bch2_write_op_init(struct bch_write_op *, struct bch_fs *, - struct disk_reservation, - struct bch_devs_mask *, - unsigned long, - struct bpos, u64 *, unsigned); +static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) +{ + return op->alloc_reserve == RESERVE_MOVINGGC + ? op->c->copygc_wq + : op->c->wq; +} + +int bch2_write_index_default(struct bch_write_op *); + +static inline void __bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c) +{ + op->c = c; + op->io_wq = index_update_wq(op); + op->flags = 0; + op->written = 0; + op->error = 0; + op->csum_type = bch2_data_checksum_type(c); + op->compression_type = + bch2_compression_opt_to_type(c->opts.compression); + op->nr_replicas = 0; + op->nr_replicas_required = c->opts.data_replicas_required; + op->alloc_reserve = RESERVE_NONE; + op->open_buckets_nr = 0; + op->devs_have.nr = 0; + op->pos = POS_MAX; + op->version = ZERO_VERSION; + op->devs = NULL; + op->write_point = (struct write_point_specifier) { 0 }; + op->res = (struct disk_reservation) { 0 }; + op->journal_seq = 0; + op->index_update_fn = bch2_write_index_default; +} + +static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, + struct disk_reservation res, + struct bch_devs_mask *devs, + struct write_point_specifier write_point, + struct bpos pos, + u64 *journal_seq, unsigned flags) +{ + __bch2_write_op_init(op, c); + op->flags = flags; + op->nr_replicas = res.nr_replicas; + op->pos = pos; + op->res = res; + op->devs = devs; + op->write_point = write_point; + + if (journal_seq) { + op->journal_seq_p = journal_seq; + op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR; + } +} + void bch2_write(struct closure *); static inline struct bch_write_bio *wbio_init(struct bio *bio) @@ -51,14 +106,13 @@ static inline struct bch_write_bio *wbio_init(struct bio *bio) return wbio; } -void bch2_wake_delayed_writes(unsigned long data); - struct bch_devs_mask; struct cache_promote_op; struct extent_pick_ptr; int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - struct bkey_s_c k, struct extent_pick_ptr *, unsigned); + struct bkey_s_c_extent e, struct extent_pick_ptr *, + unsigned); void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, u64, struct bch_devs_mask *, unsigned); @@ -66,21 +120,22 @@ enum bch_read_flags { 
BCH_READ_RETRY_IF_STALE = 1 << 0, BCH_READ_MAY_PROMOTE = 1 << 1, BCH_READ_USER_MAPPED = 1 << 2, + BCH_READ_NODECODE = 1 << 3, /* internal: */ - BCH_READ_MUST_BOUNCE = 1 << 3, - BCH_READ_MUST_CLONE = 1 << 4, - BCH_READ_IN_RETRY = 1 << 5, + BCH_READ_MUST_BOUNCE = 1 << 4, + BCH_READ_MUST_CLONE = 1 << 5, + BCH_READ_IN_RETRY = 1 << 6, }; static inline void bch2_read_extent(struct bch_fs *c, struct bch_read_bio *rbio, - struct bkey_s_c k, + struct bkey_s_c_extent e, struct extent_pick_ptr *pick, unsigned flags) { rbio->_state = 0; - __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, pick, flags); + __bch2_read_extent(c, rbio, rbio->bio.bi_iter, e, pick, flags); } static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h index f77106b..ed9a4bb 100644 --- a/libbcachefs/io_types.h +++ b/libbcachefs/io_types.h @@ -1,20 +1,16 @@ #ifndef _BCACHEFS_IO_TYPES_H #define _BCACHEFS_IO_TYPES_H +#include "alloc_types.h" #include "btree_types.h" #include "buckets_types.h" +#include "extents_types.h" #include "keylist_types.h" #include "super_types.h" #include #include -struct extent_pick_ptr { - struct bch_extent_crc128 crc; - struct bch_extent_ptr ptr; - struct bch_dev *ca; -}; - struct bch_read_bio { struct bch_fs *c; @@ -44,26 +40,22 @@ struct bch_read_bio { struct { u8 bounce:1, split:1, - process_context:1, - retry:2; + narrow_crcs:1, + retry:2, + context:2; }; u8 _state; }; + struct bch_devs_list devs_have; + struct extent_pick_ptr pick; + /* start pos of data we read (may not be pos of data we want) */ + struct bpos pos; struct bversion version; struct promote_op *promote; - /* - * If we have to retry the read (IO error, checksum failure, read stale - * data (raced with allocator), we retry the portion of the parent bio - * that failed (i.e. this bio's portion, bvec_iter). 
- * - * But we need to stash the inode somewhere: - */ - u64 inode; - struct work_struct work; struct bio bio; @@ -98,36 +90,33 @@ struct bch_write_op { struct bch_fs *c; struct workqueue_struct *io_wq; - unsigned written; /* sectors */ - - short error; - u16 flags; + u16 written; /* sectors */ + s8 error; + unsigned csum_type:4; unsigned compression_type:4; unsigned nr_replicas:4; + unsigned nr_replicas_required:4; unsigned alloc_reserve:4; - unsigned nonce:14; + + u8 open_buckets_nr; + struct bch_devs_list devs_have; + u16 target; + u16 nonce; struct bpos pos; struct bversion version; - /* For BCH_WRITE_DATA_COMPRESSED: */ - struct bch_extent_crc128 crc; - unsigned size; + /* For BCH_WRITE_DATA_ENCODED: */ + struct bch_extent_crc_unpacked crc; struct bch_devs_mask *devs; - unsigned long write_point; + struct write_point_specifier write_point; struct disk_reservation res; - union { u8 open_buckets[16]; - struct { - struct bch_write_op *next; - unsigned long expires; - }; - }; /* * If caller wants to flush but hasn't passed us a journal_seq ptr, we diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 37b342b..5d9a298 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -464,7 +464,8 @@ static int journal_validate_key(struct bch_fs *c, struct jset *j, if (invalid) { bch2_bkey_val_to_text(c, key_type, buf, sizeof(buf), bkey_i_to_s_c(k)); - mustfix_fsck_err(c, "invalid %s in journal: %s", type, buf); + mustfix_fsck_err(c, "invalid %s in journal: %s\n%s", + type, invalid, buf); le16_add_cpu(&entry->u64s, -k->k.u64s); memmove(k, bkey_next(k), next - (void *) bkey_next(k)); @@ -1568,35 +1569,31 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); swap(new_buckets, ja->buckets); swap(new_bucket_seq, ja->bucket_seq); + spin_unlock(&j->lock); while (ja->nr < nr) { - /* must happen under journal lock, to avoid racing with gc: */ - long b = bch2_bucket_alloc(c, ca, RESERVE_ALLOC); - if (b < 0) { - if (!closure_wait(&c->freelist_wait, &cl)) { - spin_unlock(&j->lock); + struct open_bucket *ob; + size_t bucket; + int ob_idx; + + ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, &cl); + if (ob_idx < 0) { + if (!closure_wait(&c->freelist_wait, &cl)) closure_sync(&cl); - spin_lock(&j->lock); - } continue; } - bch2_mark_metadata_bucket(ca, &ca->buckets[b], - BUCKET_JOURNAL, false); - bch2_mark_alloc_bucket(ca, &ca->buckets[b], false); + ob = c->open_buckets + ob_idx; + bucket = sector_to_bucket(ca, ob->ptr.offset); - memmove(ja->buckets + ja->last_idx + 1, - ja->buckets + ja->last_idx, - (ja->nr - ja->last_idx) * sizeof(u64)); - memmove(ja->bucket_seq + ja->last_idx + 1, - ja->bucket_seq + ja->last_idx, - (ja->nr - ja->last_idx) * sizeof(u64)); - memmove(journal_buckets->buckets + ja->last_idx + 1, - journal_buckets->buckets + ja->last_idx, - (ja->nr - ja->last_idx) * sizeof(u64)); + spin_lock(&j->lock); + __array_insert_item(ja->buckets, ja->nr, ja->last_idx); + __array_insert_item(ja->bucket_seq, ja->nr, ja->last_idx); + __array_insert_item(journal_buckets->buckets, ja->nr, ja->last_idx); - ja->buckets[ja->last_idx] = b; - journal_buckets->buckets[ja->last_idx] = cpu_to_le64(b); + ja->buckets[ja->last_idx] = bucket; + ja->bucket_seq[ja->last_idx] = 0; + journal_buckets->buckets[ja->last_idx] = cpu_to_le64(bucket); if (ja->last_idx < ja->nr) { if (ja->cur_idx >= ja->last_idx) @@ -1604,9 +1601,14 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, 
ja->last_idx++; } ja->nr++; + spin_unlock(&j->lock); + + bch2_mark_metadata_bucket(c, ca, &ca->buckets[bucket], + BUCKET_JOURNAL, + gc_phase(GC_PHASE_SB), 0); + bch2_open_bucket_put(c, ob); } - spin_unlock(&j->lock); BUG_ON(bch2_sb_validate_journal(ca->disk_sb.sb, ca->mi)); @@ -1623,6 +1625,8 @@ err: if (!ret) bch2_dev_allocator_add(c, ca); + closure_sync(&cl); + return ret; } diff --git a/libbcachefs/keylist.h b/libbcachefs/keylist.h index ea65f8e..b7c8a86 100644 --- a/libbcachefs/keylist.h +++ b/libbcachefs/keylist.h @@ -7,8 +7,7 @@ int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t); void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *); void bch2_keylist_pop_front(struct keylist *); -static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys, - size_t nr_inline_u64s) +static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys) { l->top_p = l->keys_p = inline_keys; } @@ -17,7 +16,7 @@ static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys) { if (l->keys_p != inline_keys) kfree(l->keys_p); - memset(l, 0, sizeof(*l)); + bch2_keylist_init(l, inline_keys); } static inline void bch2_keylist_push(struct keylist *l) diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index d7f27a3..8d1c0ee 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -13,31 +13,16 @@ #include "move.h" #include "super-io.h" -static int issue_migration_move(struct bch_dev *ca, - struct moving_context *ctxt, - struct bch_devs_mask *devs, - struct bkey_s_c k) +static bool migrate_pred(void *arg, struct bkey_s_c_extent e) { - struct bch_fs *c = ca->fs; - struct disk_reservation res; + struct bch_dev *ca = arg; const struct bch_extent_ptr *ptr; - int ret; - - if (bch2_disk_reservation_get(c, &res, k.k->size, 0)) - return -ENOSPC; - extent_for_each_ptr(bkey_s_c_to_extent(k), ptr) + extent_for_each_ptr(e, ptr) if (ptr->dev == ca->dev_idx) - goto found; + return true; - BUG(); -found: - /* XXX: we need to be doing something with the disk reservation */ - - ret = bch2_data_move(c, ctxt, devs, k, ptr); - if (ret) - bch2_disk_reservation_put(c, &res); - return ret; + return false; } #define MAX_DATA_OFF_ITER 10 @@ -58,10 +43,11 @@ found: int bch2_move_data_off_device(struct bch_dev *ca) { - struct moving_context ctxt; struct bch_fs *c = ca->fs; + struct btree_iter iter; + struct bkey_s_c k; + u64 keys_moved, sectors_moved; unsigned pass = 0; - u64 seen_key_count; int ret = 0; BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW); @@ -69,12 +55,6 @@ int bch2_move_data_off_device(struct bch_dev *ca) if (!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_USER))) return 0; - mutex_lock(&c->replicas_gc_lock); - bch2_replicas_gc_start(c, 1 << BCH_DATA_USER); - - bch2_move_ctxt_init(&ctxt, NULL, SECTORS_IN_FLIGHT_PER_DEVICE); - __set_bit(ca->dev_idx, ctxt.avoid.d); - /* * In theory, only one pass should be necessary as we've * quiesced all writes before calling this. @@ -91,69 +71,43 @@ int bch2_move_data_off_device(struct bch_dev *ca) * Thus this scans the tree one more time than strictly necessary, * but that can be viewed as a verification pass. 
*/ - do { - struct btree_iter iter; - struct bkey_s_c k; - - seen_key_count = 0; - atomic_set(&ctxt.error_count, 0); - atomic_set(&ctxt.error_flags, 0); - - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, - BTREE_ITER_PREFETCH); - - while (!bch2_move_ctxt_wait(&ctxt) && - (k = bch2_btree_iter_peek(&iter)).k && - !(ret = btree_iter_err(k))) { - if (!bkey_extent_is_data(k.k) || - !bch2_extent_has_device(bkey_s_c_to_extent(k), - ca->dev_idx)) - goto next; - - ret = issue_migration_move(ca, &ctxt, NULL, k); - if (ret == -ENOMEM) { - bch2_btree_iter_unlock(&iter); - - /* - * memory allocation failure, wait for some IO - * to finish - */ - bch2_move_ctxt_wait_for_io(&ctxt); - continue; - } - if (ret == -ENOSPC) - break; - BUG_ON(ret); + ret = bch2_move_data(c, NULL, + SECTORS_IN_FLIGHT_PER_DEVICE, + NULL, + writepoint_hashed((unsigned long) current), + 0, + ca->dev_idx, + migrate_pred, ca, + &keys_moved, + §ors_moved); + if (ret) { + bch_err(c, "error migrating data: %i", ret); + return ret; + } + } while (keys_moved && pass++ < MAX_DATA_OFF_ITER); - seen_key_count++; - continue; -next: - if (bkey_extent_is_data(k.k)) { - ret = bch2_check_mark_super(c, bkey_s_c_to_extent(k), - BCH_DATA_USER); - if (ret) - break; - } - bch2_btree_iter_advance_pos(&iter); - bch2_btree_iter_cond_resched(&iter); + if (keys_moved) { + bch_err(c, "unable to migrate all data in %d iterations", + MAX_DATA_OFF_ITER); + return -1; + } - } - bch2_btree_iter_unlock(&iter); - bch2_move_ctxt_exit(&ctxt); + mutex_lock(&c->replicas_gc_lock); + bch2_replicas_gc_start(c, 1 << BCH_DATA_USER); - if (ret) - goto err; - } while (seen_key_count && pass++ < MAX_DATA_OFF_ITER); + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, BTREE_ITER_PREFETCH, k) { + if (!bkey_extent_is_data(k.k)) + continue; - if (seen_key_count) { - pr_err("Unable to migrate all data in %d iterations.", - MAX_DATA_OFF_ITER); - ret = -1; - goto err; + ret = bch2_check_mark_super(c, bkey_s_c_to_extent(k), + BCH_DATA_USER); + if (ret) { + bch_err(c, "error migrating data %i from check_mark_super()", ret); + break; + } } -err: bch2_replicas_gc_end(c, ret); mutex_unlock(&c->replicas_gc_lock); return ret; @@ -167,14 +121,11 @@ static int bch2_move_btree_off(struct bch_fs *c, struct bch_dev *ca, enum btree_id id) { struct btree_iter iter; - struct closure cl; struct btree *b; int ret; BUG_ON(ca->mi.state == BCH_MEMBER_STATE_RW); - closure_init_stack(&cl); - for_each_btree_node(&iter, c, id, POS_MIN, BTREE_ITER_PREFETCH, b) { struct bkey_s_c_extent e = bkey_i_to_s_c_extent(&b->key); diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 0c5b924..5eaf0cf 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -9,41 +9,38 @@ #include "keylist.h" #include +#include #include -static struct bch_extent_ptr *bkey_find_ptr(struct bch_fs *c, - struct bkey_s_extent e, - struct bch_extent_ptr ptr) -{ - struct bch_extent_ptr *ptr2; - struct bch_dev *ca = c->devs[ptr.dev]; +struct moving_io { + struct list_head list; + struct closure cl; + bool read_completed; + unsigned sectors; - extent_for_each_ptr(e, ptr2) - if (ptr2->dev == ptr.dev && - ptr2->gen == ptr.gen && - PTR_BUCKET_NR(ca, ptr2) == - PTR_BUCKET_NR(ca, &ptr)) - return ptr2; + struct bch_read_bio rbio; - return NULL; -} + struct migrate_write write; + /* Must be last since it is variable size */ + struct bio_vec bi_inline_vecs[0]; +}; -static struct bch_extent_ptr *bch2_migrate_matching_ptr(struct migrate_write *m, - struct bkey_s_extent e) -{ - const struct bch_extent_ptr *ptr; - struct 
bch_extent_ptr *ret; +struct moving_context { + /* Closure for waiting on all reads and writes to complete */ + struct closure cl; - if (m->move) - ret = bkey_find_ptr(m->op.c, e, m->move_ptr); - else - extent_for_each_ptr(bkey_i_to_s_c_extent(&m->key), ptr) - if ((ret = bkey_find_ptr(m->op.c, e, *ptr))) - break; + /* Key and sector moves issued, updated from submission context */ + u64 keys_moved; + u64 sectors_moved; + atomic64_t sectors_raced; - return ret; -} + struct list_head reads; + + atomic_t sectors_in_flight; + + wait_queue_head_t wait; +}; static int bch2_migrate_index_update(struct bch_write_op *op) { @@ -59,71 +56,78 @@ static int bch2_migrate_index_update(struct bch_write_op *op) BTREE_ITER_INTENT); while (1) { - struct bkey_s_extent insert = - bkey_i_to_s_extent(bch2_keylist_front(keys)); struct bkey_s_c k = bch2_btree_iter_peek_with_holes(&iter); + struct bkey_i_extent *insert, *new = + bkey_i_to_extent(bch2_keylist_front(keys)); + BKEY_PADDED(k) _new, _insert; struct bch_extent_ptr *ptr; - struct bkey_s_extent e; - BKEY_PADDED(k) new; + struct bch_extent_crc_unpacked crc; + bool did_work = false; - if (!k.k) { + if (btree_iter_err(k)) { ret = bch2_btree_iter_unlock(&iter); break; } - if (!bkey_extent_is_data(k.k)) + if (bversion_cmp(k.k->version, new->k.version) || + !bkey_extent_is_data(k.k) || + !bch2_extent_matches_ptr(c, bkey_s_c_to_extent(k), + m->ptr, m->offset)) goto nomatch; - bkey_reassemble(&new.k, k); - bch2_cut_front(iter.pos, &new.k); - bch2_cut_back(insert.k->p, &new.k.k); - e = bkey_i_to_s_extent(&new.k); - - /* hack - promotes can race: */ - if (m->promote) - extent_for_each_ptr(insert, ptr) - if (bch2_extent_has_device(e.c, ptr->dev)) - goto nomatch; - - ptr = bch2_migrate_matching_ptr(m, e); - if (ptr) { - int nr_new_dirty = bch2_extent_nr_dirty_ptrs(insert.s_c); - unsigned insert_flags = - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOFAIL; + bkey_reassemble(&_insert.k, k); + insert = bkey_i_to_extent(&_insert.k); + + bkey_copy(&_new.k, bch2_keylist_front(keys)); + new = bkey_i_to_extent(&_new.k); + + bch2_cut_front(iter.pos, &insert->k_i); + bch2_cut_back(new->k.p, &insert->k); + bch2_cut_back(insert->k.p, &new->k); + + if (m->move_dev >= 0 && + (ptr = (struct bch_extent_ptr *) + bch2_extent_has_device(extent_i_to_s_c(insert), + m->move_dev))) + bch2_extent_drop_ptr(extent_i_to_s(insert), ptr); - /* copygc uses btree node reserve: */ - if (m->move) - insert_flags |= BTREE_INSERT_USE_RESERVE; - if (m->move) { - nr_new_dirty -= !ptr->cached; - __bch2_extent_drop_ptr(e, ptr); + extent_for_each_ptr_crc(extent_i_to_s(new), ptr, crc) { + if (bch2_extent_has_device(extent_i_to_s_c(insert), ptr->dev)) { + /* + * raced with another move op? 
extent already + * has a pointer to the device we just wrote + * data to + */ + continue; } - BUG_ON(nr_new_dirty < 0); - - memcpy_u64s(extent_entry_last(e), - insert.v, - bkey_val_u64s(insert.k)); - e.k->u64s += bkey_val_u64s(insert.k); - - bch2_extent_narrow_crcs(e); - bch2_extent_drop_redundant_crcs(e); - bch2_extent_normalize(c, e.s); - bch2_extent_mark_replicas_cached(c, e, nr_new_dirty); - - ret = bch2_btree_insert_at(c, &op->res, - NULL, op_journal_seq(op), - insert_flags, - BTREE_INSERT_ENTRY(&iter, &new.k)); - if (ret && ret != -EINTR) - break; - } else { -nomatch: - bch2_btree_iter_advance_pos(&iter); + bch2_extent_crc_append(insert, crc); + extent_ptr_append(insert, *ptr); + did_work = true; } + if (!did_work) + goto nomatch; + + bch2_extent_narrow_crcs(insert, + (struct bch_extent_crc_unpacked) { 0 }); + bch2_extent_normalize(c, extent_i_to_s(insert).s); + bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert)); + + ret = bch2_btree_insert_at(c, &op->res, + NULL, op_journal_seq(op), + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + m->btree_insert_flags, + BTREE_INSERT_ENTRY(&iter, &insert->k_i)); + if (!ret) + atomic_long_inc(&c->extent_migrate_done); + if (ret == -EINTR) + ret = 0; + if (ret) + break; +next: while (bkey_cmp(iter.pos, bch2_keylist_front(keys)->k.p) >= 0) { bch2_keylist_pop_front(keys); if (bch2_keylist_empty(keys)) @@ -131,96 +135,83 @@ nomatch: } bch2_cut_front(iter.pos, bch2_keylist_front(keys)); + continue; +nomatch: + if (m->ctxt) + atomic64_add(k.k->p.offset - iter.pos.offset, + &m->ctxt->sectors_raced); + atomic_long_inc(&c->extent_migrate_raced); + trace_move_race(&new->k); + bch2_btree_iter_advance_pos(&iter); + goto next; } out: bch2_btree_iter_unlock(&iter); return ret; } -void bch2_migrate_write_init(struct bch_fs *c, - struct migrate_write *m, - struct bch_devs_mask *devs, - struct bkey_s_c k, - const struct bch_extent_ptr *move_ptr, - unsigned flags) +void bch2_migrate_write_init(struct migrate_write *m, + struct bch_read_bio *rbio) { - bkey_reassemble(&m->key, k); - - m->promote = false; - m->move = move_ptr != NULL; - if (move_ptr) - m->move_ptr = *move_ptr; - - if (bkey_extent_is_cached(k.k) || - (move_ptr && move_ptr->cached)) - flags |= BCH_WRITE_CACHED; + /* write bio must own pages: */ + BUG_ON(!m->op.wbio.bio.bi_vcnt); + + m->ptr = rbio->pick.ptr; + m->offset = rbio->pos.offset - rbio->pick.crc.offset; + m->op.devs_have = rbio->devs_have; + m->op.pos = rbio->pos; + m->op.version = rbio->version; + m->op.crc = rbio->pick.crc; + + if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) { + m->op.nonce = m->op.crc.nonce + m->op.crc.offset; + m->op.csum_type = m->op.crc.csum_type; + } - bch2_write_op_init(&m->op, c, (struct disk_reservation) { 0 }, - devs, (unsigned long) current, - bkey_start_pos(k.k), NULL, - flags|BCH_WRITE_ONLY_SPECIFIED_DEVS); + if (m->move_dev >= 0) + bch2_dev_list_drop_dev(&m->op.devs_have, m->move_dev); - if (m->move) + if (m->btree_insert_flags & BTREE_INSERT_USE_RESERVE) m->op.alloc_reserve = RESERVE_MOVINGGC; - m->op.nonce = extent_current_nonce(bkey_s_c_to_extent(k)); + m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS| + BCH_WRITE_PAGES_STABLE| + BCH_WRITE_PAGES_OWNED| + BCH_WRITE_DATA_ENCODED; + + m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; m->op.nr_replicas = 1; + m->op.nr_replicas_required = 1; m->op.index_update_fn = bch2_migrate_index_update; } -static void migrate_bio_init(struct moving_io *io, struct bio *bio, - unsigned sectors) +static void move_free(struct closure *cl) { - bio_init(bio, 
io->bi_inline_vecs, - DIV_ROUND_UP(sectors, PAGE_SECTORS)); - bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); - - bio->bi_iter.bi_size = sectors << 9; - bio->bi_private = &io->cl; - bch2_bio_map(bio, NULL); -} - -static void moving_io_free(struct moving_io *io) -{ - struct moving_context *ctxt = io->ctxt; + struct moving_io *io = container_of(cl, struct moving_io, cl); + struct moving_context *ctxt = io->write.ctxt; struct bio_vec *bv; int i; - atomic_sub(io->write.key.k.size, &ctxt->sectors_in_flight); - wake_up(&ctxt->wait); - bio_for_each_segment_all(bv, &io->write.op.wbio.bio, i) if (bv->bv_page) __free_page(bv->bv_page); - kfree(io); -} - -static void moving_error(struct moving_context *ctxt, unsigned flag) -{ - atomic_inc(&ctxt->error_count); - //atomic_or(flag, &ctxt->error_flags); -} -static void moving_write_done(struct closure *cl) -{ - struct moving_io *io = container_of(cl, struct moving_io, cl); - - if (io->write.op.error) - moving_error(io->ctxt, MOVING_FLAG_WRITE); - - //if (io->replace.failures) - // trace_copy_collision(q, &io->key.k); + atomic_sub(io->sectors, &ctxt->sectors_in_flight); + wake_up(&ctxt->wait); - moving_io_free(io); + kfree(io); } -static void write_moving(struct closure *cl) +static void move_write(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); - struct bch_write_op *op = &io->write.op; - closure_call(&op->cl, bch2_write, NULL, &io->cl); - closure_return_with_destructor(&io->cl, moving_write_done); + if (likely(!io->rbio.bio.bi_error)) { + bch2_migrate_write_init(&io->write, &io->rbio); + closure_call(&io->write.op.cl, bch2_write, NULL, cl); + } + + closure_return_with_destructor(cl, move_free); } static inline struct moving_io *next_pending_write(struct moving_context *ctxt) @@ -231,16 +222,10 @@ static inline struct moving_io *next_pending_write(struct moving_context *ctxt) return io && io->read_completed ? io : NULL; } -static void read_moving_endio(struct bio *bio) +static void move_read_endio(struct bio *bio) { - struct closure *cl = bio->bi_private; - struct moving_io *io = container_of(cl, struct moving_io, cl); - struct moving_context *ctxt = io->ctxt; - - trace_move_read_done(&io->write.key.k); - - if (bio->bi_error) - moving_error(io->ctxt, MOVING_FLAG_READ); + struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); + struct moving_context *ctxt = io->write.ctxt; io->read_completed = true; if (next_pending_write(ctxt)) @@ -249,58 +234,81 @@ static void read_moving_endio(struct bio *bio) closure_put(&ctxt->cl); } -int bch2_data_move(struct bch_fs *c, - struct moving_context *ctxt, - struct bch_devs_mask *devs, - struct bkey_s_c k, - const struct bch_extent_ptr *move_ptr) +static int bch2_move_extent(struct bch_fs *c, + struct moving_context *ctxt, + struct bch_devs_mask *devs, + struct write_point_specifier wp, + int btree_insert_flags, + int move_device, + struct bkey_s_c k) { struct extent_pick_ptr pick; struct moving_io *io; + const struct bch_extent_ptr *ptr; + struct bch_extent_crc_unpacked crc; + unsigned sectors = k.k->size, pages; - bch2_extent_pick_ptr(c, k, &ctxt->avoid, &pick); + bch2_extent_pick_ptr(c, k, NULL, &pick); if (IS_ERR_OR_NULL(pick.ca)) return pick.ca ? 
PTR_ERR(pick.ca) : 0; - io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) * - DIV_ROUND_UP(k.k->size, PAGE_SECTORS), GFP_KERNEL); - if (!io) - return -ENOMEM; + /* write path might have to decompress data: */ + extent_for_each_ptr_crc(bkey_s_c_to_extent(k), ptr, crc) + sectors = max_t(unsigned, sectors, crc.uncompressed_size); - io->ctxt = ctxt; + pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); + io = kzalloc(sizeof(struct moving_io) + + sizeof(struct bio_vec) * pages, GFP_KERNEL); + if (!io) + goto err; - migrate_bio_init(io, &io->rbio.bio, k.k->size); + io->write.ctxt = ctxt; + io->sectors = k.k->size; - bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0); - io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); - io->rbio.bio.bi_end_io = read_moving_endio; + bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages); + bio_set_prio(&io->write.op.wbio.bio, + IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + io->write.op.wbio.bio.bi_iter.bi_size = sectors << 9; - if (bio_alloc_pages(&io->rbio.bio, GFP_KERNEL)) { + bch2_bio_map(&io->write.op.wbio.bio, NULL); + if (bio_alloc_pages(&io->write.op.wbio.bio, GFP_KERNEL)) { kfree(io); - return -ENOMEM; + goto err; } - migrate_bio_init(io, &io->write.op.wbio.bio, k.k->size); + bio_init(&io->rbio.bio, io->bi_inline_vecs, pages); + bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + io->rbio.bio.bi_iter.bi_size = sectors << 9; - bch2_migrate_write_init(c, &io->write, devs, k, move_ptr, 0); + bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0); + io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); + io->rbio.bio.bi_end_io = move_read_endio; - trace_move_read(&io->write.key.k); + __bch2_write_op_init(&io->write.op, c); + io->write.btree_insert_flags = btree_insert_flags; + io->write.move_dev = move_device; + io->write.op.devs = devs; + io->write.op.write_point = wp; ctxt->keys_moved++; ctxt->sectors_moved += k.k->size; - if (ctxt->rate) - bch2_ratelimit_increment(ctxt->rate, k.k->size); - atomic_add(k.k->size, &ctxt->sectors_in_flight); + trace_move_extent(k.k); + + atomic_add(io->sectors, &ctxt->sectors_in_flight); list_add_tail(&io->list, &ctxt->reads); /* - * dropped by read_moving_endio() - guards against use after free of + * dropped by move_read_endio() - guards against use after free of * ctxt when doing wakeup */ - closure_get(&io->ctxt->cl); - bch2_read_extent(c, &io->rbio, k, &pick, 0); + closure_get(&ctxt->cl); + bch2_read_extent(c, &io->rbio, bkey_s_c_to_extent(k), + &pick, BCH_READ_NODECODE); return 0; +err: + trace_move_alloc_fail(k.k); + return -ENOMEM; } static void do_pending_writes(struct moving_context *ctxt) @@ -309,14 +317,7 @@ static void do_pending_writes(struct moving_context *ctxt) while ((io = next_pending_write(ctxt))) { list_del(&io->list); - - if (io->rbio.bio.bi_error) { - moving_io_free(io); - continue; - } - - trace_move_write(&io->write.key.k); - closure_call(&io->cl, write_moving, NULL, &ctxt->cl); + closure_call(&io->cl, move_write, NULL, &ctxt->cl); } } @@ -330,18 +331,7 @@ do { \ next_pending_write(_ctxt) || (_cond)); \ } while (1) -int bch2_move_ctxt_wait(struct moving_context *ctxt) -{ - move_ctxt_wait_event(ctxt, - atomic_read(&ctxt->sectors_in_flight) < - ctxt->max_sectors_in_flight); - - return ctxt->rate - ? 
bch2_ratelimit_wait_freezable_stoppable(ctxt->rate) - : 0; -} - -void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) +static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) { unsigned sectors_pending = atomic_read(&ctxt->sectors_in_flight); @@ -350,7 +340,7 @@ void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) atomic_read(&ctxt->sectors_in_flight) != sectors_pending); } -void bch2_move_ctxt_exit(struct moving_context *ctxt) +static void bch2_move_ctxt_exit(struct moving_context *ctxt) { move_ctxt_wait_event(ctxt, !atomic_read(&ctxt->sectors_in_flight)); closure_sync(&ctxt->cl); @@ -359,16 +349,92 @@ void bch2_move_ctxt_exit(struct moving_context *ctxt) EBUG_ON(atomic_read(&ctxt->sectors_in_flight)); } -void bch2_move_ctxt_init(struct moving_context *ctxt, - struct bch_ratelimit *rate, - unsigned max_sectors_in_flight) +static void bch2_move_ctxt_init(struct moving_context *ctxt) { memset(ctxt, 0, sizeof(*ctxt)); closure_init_stack(&ctxt->cl); - ctxt->rate = rate; - ctxt->max_sectors_in_flight = max_sectors_in_flight; - INIT_LIST_HEAD(&ctxt->reads); init_waitqueue_head(&ctxt->wait); } + +int bch2_move_data(struct bch_fs *c, + struct bch_ratelimit *rate, + unsigned sectors_in_flight, + struct bch_devs_mask *devs, + struct write_point_specifier wp, + int btree_insert_flags, + int move_device, + move_pred_fn pred, void *arg, + u64 *keys_moved, + u64 *sectors_moved) +{ + bool kthread = (current->flags & PF_KTHREAD) != 0; + struct moving_context ctxt; + struct btree_iter iter; + BKEY_PADDED(k) tmp; + struct bkey_s_c k; + int ret = 0; + + bch2_move_ctxt_init(&ctxt); + bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, + BTREE_ITER_PREFETCH); + + if (rate) + bch2_ratelimit_reset(rate); + + while (!kthread || !(ret = kthread_should_stop())) { + if (atomic_read(&ctxt.sectors_in_flight) >= sectors_in_flight) { + bch2_btree_iter_unlock(&iter); + move_ctxt_wait_event(&ctxt, + atomic_read(&ctxt.sectors_in_flight) < + sectors_in_flight); + } + + if (rate && + bch2_ratelimit_delay(rate) && + (bch2_btree_iter_unlock(&iter), + (ret = bch2_ratelimit_wait_freezable_stoppable(rate)))) + break; + + k = bch2_btree_iter_peek(&iter); + if (!k.k) + break; + ret = btree_iter_err(k); + if (ret) + break; + + if (!bkey_extent_is_data(k.k) || + !pred(arg, bkey_s_c_to_extent(k))) + goto next; + + /* unlock before doing IO: */ + bkey_reassemble(&tmp.k, k); + k = bkey_i_to_s_c(&tmp.k); + bch2_btree_iter_unlock(&iter); + + if (bch2_move_extent(c, &ctxt, devs, wp, + btree_insert_flags, + move_device, k)) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(&ctxt); + continue; + } + + if (rate) + bch2_ratelimit_increment(rate, k.k->size); +next: + bch2_btree_iter_advance_pos(&iter); + bch2_btree_iter_cond_resched(&iter); + } + + bch2_btree_iter_unlock(&iter); + bch2_move_ctxt_exit(&ctxt); + + trace_move_data(c, ctxt.sectors_moved, ctxt.keys_moved); + + *keys_moved = ctxt.keys_moved; + *sectors_moved = ctxt.sectors_moved; + + return ret; +} diff --git a/libbcachefs/move.h b/libbcachefs/move.h index a756a46..2e884ce 100644 --- a/libbcachefs/move.h +++ b/libbcachefs/move.h @@ -4,77 +4,31 @@ #include "buckets.h" #include "io_types.h" -enum moving_flag_bitnos { - MOVING_FLAG_BITNO_READ = 0, - MOVING_FLAG_BITNO_WRITE, -}; - -#define MOVING_FLAG_READ (1U << MOVING_FLAG_BITNO_READ) -#define MOVING_FLAG_WRITE (1U << MOVING_FLAG_BITNO_WRITE) +struct bch_read_bio; +struct moving_context; struct migrate_write { - BKEY_PADDED(key); - bool promote; - bool move; 
- struct bch_extent_ptr move_ptr; - struct bch_write_op op; -}; - -void bch2_migrate_write_init(struct bch_fs *, struct migrate_write *, - struct bch_devs_mask *, struct bkey_s_c, - const struct bch_extent_ptr *, unsigned); - -#define SECTORS_IN_FLIGHT_PER_DEVICE 2048 - -struct moving_context { - /* Closure for waiting on all reads and writes to complete */ - struct closure cl; - - /* Number and types of errors reported */ - atomic_t error_count; - atomic_t error_flags; - - /* Key and sector moves issued, updated from submission context */ - u64 keys_moved; - u64 sectors_moved; - - /* Rate-limiter counting submitted reads */ - struct bch_ratelimit *rate; - - /* Try to avoid reading the following device */ - struct bch_devs_mask avoid; - - struct list_head reads; + struct moving_context *ctxt; - /* Configuration */ - unsigned max_sectors_in_flight; - atomic_t sectors_in_flight; + /* what we read: */ + struct bch_extent_ptr ptr; + u64 offset; - wait_queue_head_t wait; + int move_dev; + int btree_insert_flags; + struct bch_write_op op; }; -struct moving_io { - struct list_head list; - struct rb_node node; - struct closure cl; - struct moving_context *ctxt; - struct migrate_write write; - bool read_completed; - - struct bch_read_bio rbio; - /* Must be last since it is variable size */ - struct bio_vec bi_inline_vecs[0]; -}; +void bch2_migrate_write_init(struct migrate_write *, struct bch_read_bio *); -int bch2_data_move(struct bch_fs *, struct moving_context *, - struct bch_devs_mask *, struct bkey_s_c, - const struct bch_extent_ptr *); +#define SECTORS_IN_FLIGHT_PER_DEVICE 2048 -int bch2_move_ctxt_wait(struct moving_context *); -void bch2_move_ctxt_wait_for_io(struct moving_context *); +typedef bool (*move_pred_fn)(void *, struct bkey_s_c_extent); -void bch2_move_ctxt_exit(struct moving_context *); -void bch2_move_ctxt_init(struct moving_context *, struct bch_ratelimit *, - unsigned); +int bch2_move_data(struct bch_fs *, struct bch_ratelimit *, + unsigned, struct bch_devs_mask *, + struct write_point_specifier, + int, int, move_pred_fn, void *, + u64 *, u64 *); #endif /* _BCACHEFS_MOVE_H */ diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 125159e..728be2b 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -6,6 +6,7 @@ #include "bcachefs.h" #include "btree_iter.h" +#include "btree_update.h" #include "buckets.h" #include "clock.h" #include "extents.h" @@ -23,137 +24,63 @@ #include #include -/* Moving GC - IO loop */ - -static int bucket_idx_cmp(const void *_l, const void *_r, size_t size) -{ - const struct bucket_heap_entry *l = _l; - const struct bucket_heap_entry *r = _r; +/* + * We can't use the entire copygc reserve in one iteration of copygc: we may + * need the buckets we're freeing up to go back into the copygc reserve to make + * forward progress, but if the copygc reserve is full they'll be available for + * any allocation - and it's possible that in a given iteration, we free up most + * of the buckets we're going to free before we allocate most of the buckets + * we're going to allocate. 
+ * + * If we only use half of the reserve per iteration, then in steady state we'll + * always have room in the reserve for the buckets we're going to need in the + * next iteration: + */ +#define COPYGC_BUCKETS_PER_ITER(ca) \ + ((ca)->free[RESERVE_MOVINGGC].size / 2) - if (l->bucket < r->bucket) - return -1; - if (l->bucket > r->bucket) - return 1; - return 0; -} +/* + * Max sectors to move per iteration: Have to take into account internal + * fragmentation from the multiple write points for each generation: + */ +#define COPYGC_SECTORS_PER_ITER(ca) \ + ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca)) -static const struct bch_extent_ptr *moving_pred(struct bch_dev *ca, - struct bkey_s_c k) +static inline int sectors_used_cmp(copygc_heap *heap, + struct copygc_heap_entry l, + struct copygc_heap_entry r) { - bucket_heap *h = &ca->copygc_heap; - const struct bch_extent_ptr *ptr; - - if (bkey_extent_is_data(k.k) && - (ptr = bch2_extent_has_device(bkey_s_c_to_extent(k), - ca->dev_idx))) { - struct bucket_heap_entry search = { - .bucket = PTR_BUCKET_NR(ca, ptr) - }; - - size_t i = eytzinger0_find(h->data, h->used, - sizeof(h->data[0]), - bucket_idx_cmp, &search); - - if (i < h->used) - return ptr; - } - - return NULL; + return bucket_sectors_used(l.mark) - bucket_sectors_used(r.mark); } -static int issue_moving_gc_move(struct bch_dev *ca, - struct moving_context *ctxt, - struct bkey_s_c k) +static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) { - struct bch_fs *c = ca->fs; - const struct bch_extent_ptr *ptr; - int ret; + const struct copygc_heap_entry *l = _l; + const struct copygc_heap_entry *r = _r; - ptr = moving_pred(ca, k); - if (!ptr) /* We raced - bucket's been reused */ - return 0; - - ret = bch2_data_move(c, ctxt, &ca->self, k, ptr); - if (!ret) - trace_gc_copy(k.k); - else - trace_moving_gc_alloc_fail(c, k.k->size); - return ret; + return (l->offset > r->offset) - (l->offset < r->offset); } -static void read_moving(struct bch_dev *ca, size_t buckets_to_move, - u64 sectors_to_move) +static bool copygc_pred(void *arg, struct bkey_s_c_extent e) { - struct bch_fs *c = ca->fs; - bucket_heap *h = &ca->copygc_heap; - struct moving_context ctxt; - struct btree_iter iter; - struct bkey_s_c k; - u64 sectors_not_moved = 0; - size_t buckets_not_moved = 0; - struct bucket_heap_entry *i; - - bch2_ratelimit_reset(&ca->moving_gc_pd.rate); - bch2_move_ctxt_init(&ctxt, &ca->moving_gc_pd.rate, - SECTORS_IN_FLIGHT_PER_DEVICE); - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, - BTREE_ITER_PREFETCH); - - while (1) { - if (kthread_should_stop()) - goto out; - if (bch2_move_ctxt_wait(&ctxt)) - goto out; - k = bch2_btree_iter_peek(&iter); - if (!k.k) - break; - if (btree_iter_err(k)) - goto out; - - if (!moving_pred(ca, k)) - goto next; + struct bch_dev *ca = arg; + copygc_heap *h = &ca->copygc_heap; + const struct bch_extent_ptr *ptr = + bch2_extent_has_device(e, ca->dev_idx); - if (issue_moving_gc_move(ca, &ctxt, k)) { - bch2_btree_iter_unlock(&iter); + if (ptr) { + struct copygc_heap_entry search = { .offset = ptr->offset }; - /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(&ctxt); - continue; - } -next: - bch2_btree_iter_advance_pos(&iter); - //bch2_btree_iter_cond_resched(&iter); + size_t i = eytzinger0_find_le(h->data, h->used, + sizeof(h->data[0]), + bucket_offset_cmp, &search); - /* unlock before calling moving_context_wait() */ - bch2_btree_iter_unlock(&iter); - cond_resched(); + return (i >= 0 && + ptr->offset < 
h->data[i].offset + ca->mi.bucket_size && + ptr->gen == h->data[i].mark.gen); } - bch2_btree_iter_unlock(&iter); - bch2_move_ctxt_exit(&ctxt); - trace_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved, - buckets_to_move); - - /* don't check this if we bailed out early: */ - for (i = h->data; i < h->data + h->used; i++) { - struct bucket_mark m = READ_ONCE(ca->buckets[i->bucket].mark); - - if (i->mark.gen == m.gen && bucket_sectors_used(m)) { - sectors_not_moved += bucket_sectors_used(m); - buckets_not_moved++; - } - } - - if (sectors_not_moved) - bch_warn(c, "copygc finished but %llu/%llu sectors, %zu/%zu buckets not moved", - sectors_not_moved, sectors_to_move, - buckets_not_moved, buckets_to_move); - return; -out: - bch2_btree_iter_unlock(&iter); - bch2_move_ctxt_exit(&ctxt); - trace_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved, - buckets_to_move); + return false; } static bool have_copygc_reserve(struct bch_dev *ca) @@ -168,38 +95,17 @@ static bool have_copygc_reserve(struct bch_dev *ca) return ret; } -static inline int sectors_used_cmp(bucket_heap *heap, - struct bucket_heap_entry l, - struct bucket_heap_entry r) -{ - return bucket_sectors_used(l.mark) - bucket_sectors_used(r.mark); -} - -static void bch2_moving_gc(struct bch_dev *ca) +static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) { - struct bch_fs *c = ca->fs; + copygc_heap *h = &ca->copygc_heap; + struct copygc_heap_entry e, *i; struct bucket *g; - u64 sectors_to_move = 0; - size_t buckets_to_move, buckets_unused = 0; - struct bucket_heap_entry e, *i; - int reserve_sectors; - - if (!have_copygc_reserve(ca)) { - struct closure cl; - - closure_init_stack(&cl); - while (1) { - closure_wait(&c->freelist_wait, &cl); - if (have_copygc_reserve(ca)) - break; - closure_sync(&cl); - } - closure_wake_up(&c->freelist_wait); - } - - reserve_sectors = COPYGC_SECTORS_PER_ITER(ca); + u64 keys_moved, sectors_moved; + u64 sectors_to_move = 0, sectors_not_moved = 0; + u64 buckets_to_move, buckets_not_moved = 0; + int ret; - trace_moving_gc_start(ca); + closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); /* * Find buckets with lowest sector counts, skipping completely @@ -213,48 +119,73 @@ static void bch2_moving_gc(struct bch_dev *ca) * them: */ down_read(&c->gc_lock); - ca->copygc_heap.used = 0; + h->used = 0; for_each_bucket(g, ca) { struct bucket_mark m = READ_ONCE(g->mark); - struct bucket_heap_entry e = { g - ca->buckets, m }; - - if (bucket_unused(m)) { - buckets_unused++; - continue; - } + struct copygc_heap_entry e; if (m.owned_by_allocator || - m.data_type != BUCKET_DATA) + m.data_type != BUCKET_DATA || + !bucket_sectors_used(m) || + bucket_sectors_used(m) >= ca->mi.bucket_size) continue; - if (bucket_sectors_used(m) >= ca->mi.bucket_size) - continue; - - heap_add_or_replace(&ca->copygc_heap, e, -sectors_used_cmp); + e = (struct copygc_heap_entry) { + .offset = bucket_to_sector(ca, g - ca->buckets), + .mark = m + }; + heap_add_or_replace(h, e, -sectors_used_cmp); } up_read(&c->gc_lock); - for (i = ca->copygc_heap.data; - i < ca->copygc_heap.data + ca->copygc_heap.used; - i++) + for (i = h->data; i < h->data + h->used; i++) sectors_to_move += bucket_sectors_used(i->mark); while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) { - BUG_ON(!heap_pop(&ca->copygc_heap, e, -sectors_used_cmp)); + BUG_ON(!heap_pop(h, e, -sectors_used_cmp)); sectors_to_move -= bucket_sectors_used(e.mark); } - buckets_to_move = ca->copygc_heap.used; + buckets_to_move = h->used; + + if (!buckets_to_move) + return; + + 
eytzinger0_sort(h->data, h->used, + sizeof(h->data[0]), + bucket_offset_cmp, NULL); + + ret = bch2_move_data(c, &ca->copygc_pd.rate, + SECTORS_IN_FLIGHT_PER_DEVICE, + &ca->self, + writepoint_ptr(&ca->copygc_write_point), + BTREE_INSERT_USE_RESERVE, + ca->dev_idx, + copygc_pred, ca, + &keys_moved, + §ors_moved); + + for (i = h->data; i < h->data + h->used; i++) { + size_t bucket = sector_to_bucket(ca, i->offset); + struct bucket_mark m = READ_ONCE(ca->buckets[bucket].mark); + + if (i->mark.gen == m.gen && bucket_sectors_used(m)) { + sectors_not_moved += bucket_sectors_used(m); + buckets_not_moved++; + } + } - eytzinger0_sort(ca->copygc_heap.data, - ca->copygc_heap.used, - sizeof(ca->copygc_heap.data[0]), - bucket_idx_cmp, NULL); + if (sectors_not_moved && !ret) + bch_warn(c, "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved", + sectors_not_moved, sectors_to_move, + buckets_not_moved, buckets_to_move); - read_moving(ca, buckets_to_move, sectors_to_move); + trace_copygc(ca, + sectors_moved, sectors_not_moved, + buckets_to_move, buckets_not_moved); } -static int bch2_moving_gc_thread(void *arg) +static int bch2_copygc_thread(void *arg) { struct bch_dev *ca = arg; struct bch_fs *c = ca->fs; @@ -273,7 +204,7 @@ static int bch2_moving_gc_thread(void *arg) * don't start copygc until less than half the gc reserve is * available: */ - available = dev_buckets_available(ca); + available = dev_buckets_available(c, ca); want = div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) * c->opts.gc_reserve_percent, 200); if (available > want) { @@ -283,46 +214,46 @@ static int bch2_moving_gc_thread(void *arg) continue; } - bch2_moving_gc(ca); + bch2_copygc(c, ca); } return 0; } -void bch2_moving_gc_stop(struct bch_dev *ca) +void bch2_copygc_stop(struct bch_dev *ca) { - ca->moving_gc_pd.rate.rate = UINT_MAX; - bch2_ratelimit_reset(&ca->moving_gc_pd.rate); + ca->copygc_pd.rate.rate = UINT_MAX; + bch2_ratelimit_reset(&ca->copygc_pd.rate); - if (ca->moving_gc_read) - kthread_stop(ca->moving_gc_read); - ca->moving_gc_read = NULL; + if (ca->copygc_thread) + kthread_stop(ca->copygc_thread); + ca->copygc_thread = NULL; } -int bch2_moving_gc_start(struct bch_dev *ca) +int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca) { struct task_struct *t; - BUG_ON(ca->moving_gc_read); + BUG_ON(ca->copygc_thread); - if (ca->fs->opts.nochanges) + if (c->opts.nochanges) return 0; - if (bch2_fs_init_fault("moving_gc_start")) + if (bch2_fs_init_fault("copygc_start")) return -ENOMEM; - t = kthread_create(bch2_moving_gc_thread, ca, "bch_copygc_read"); + t = kthread_create(bch2_copygc_thread, ca, "bch_copygc"); if (IS_ERR(t)) return PTR_ERR(t); - ca->moving_gc_read = t; - wake_up_process(ca->moving_gc_read); + ca->copygc_thread = t; + wake_up_process(ca->copygc_thread); return 0; } -void bch2_dev_moving_gc_init(struct bch_dev *ca) +void bch2_dev_copygc_init(struct bch_dev *ca) { - bch2_pd_controller_init(&ca->moving_gc_pd); - ca->moving_gc_pd.d_term = 0; + bch2_pd_controller_init(&ca->copygc_pd); + ca->copygc_pd.d_term = 0; } diff --git a/libbcachefs/movinggc.h b/libbcachefs/movinggc.h index d835d13..c46fa1f 100644 --- a/libbcachefs/movinggc.h +++ b/libbcachefs/movinggc.h @@ -1,30 +1,8 @@ #ifndef _BCACHEFS_MOVINGGC_H #define _BCACHEFS_MOVINGGC_H -/* - * We can't use the entire copygc reserve in one iteration of copygc: we may - * need the buckets we're freeing up to go back into the copygc reserve to make - * forward progress, but if the copygc reserve is full they'll be available for - * any allocation - and it's 
possible that in a given iteration, we free up most - * of the buckets we're going to free before we allocate most of the buckets - * we're going to allocate. - * - * If we only use half of the reserve per iteration, then in steady state we'll - * always have room in the reserve for the buckets we're going to need in the - * next iteration: - */ -#define COPYGC_BUCKETS_PER_ITER(ca) \ - ((ca)->free[RESERVE_MOVINGGC].size / 2) - -/* - * Max sectors to move per iteration: Have to take into account internal - * fragmentation from the multiple write points for each generation: - */ -#define COPYGC_SECTORS_PER_ITER(ca) \ - ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca)) - -void bch2_moving_gc_stop(struct bch_dev *); -int bch2_moving_gc_start(struct bch_dev *); -void bch2_dev_moving_gc_init(struct bch_dev *); +void bch2_copygc_stop(struct bch_dev *); +int bch2_copygc_start(struct bch_fs *, struct bch_dev *); +void bch2_dev_copygc_init(struct bch_dev *); #endif /* _BCACHEFS_MOVINGGC_H */ diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index 1e4eafb..a3ecfb9 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -425,6 +425,11 @@ const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) if (err) return err; + if (le64_to_cpu(sb->version) < BCH_SB_VERSION_EXTENT_NONCE_V1 && + bch2_sb_get_crypt(sb) && + BCH_SB_INITIALIZED(sb)) + return "Incompatible extent nonces"; + sb->version = cpu_to_le64(BCH_SB_VERSION_MAX); return NULL; diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 0342778..4e8b0a5 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -20,6 +20,7 @@ #include "debug.h" #include "error.h" #include "fs.h" +#include "fs-io.h" #include "fsck.h" #include "inode.h" #include "io.h" @@ -209,7 +210,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch2_tiering_stop(c); for_each_member_device(ca, c, i) - bch2_moving_gc_stop(ca); + bch2_copygc_stop(ca); bch2_gc_thread_stop(c); @@ -258,12 +259,8 @@ void bch2_fs_read_only(struct bch_fs *c) */ percpu_ref_kill(&c->writes); - del_timer(&c->foreground_write_wakeup); cancel_delayed_work(&c->pd_controllers_update); - c->foreground_write_pd.rate.rate = UINT_MAX; - bch2_wake_delayed_writes((unsigned long) c); - /* * If we're not doing an emergency shutdown, we want to wait on * outstanding writes to complete so they don't see spurious errors due @@ -348,9 +345,9 @@ const char *bch2_fs_read_write(struct bch_fs *c) if (bch2_gc_thread_start(c)) goto err; - err = "error starting moving GC thread"; + err = "error starting copygc thread"; for_each_rw_member(ca, c, i) - if (bch2_moving_gc_start(ca)) { + if (bch2_copygc_start(c, ca)) { percpu_ref_put(&ca->io_ref); goto err; } @@ -375,6 +372,7 @@ err: static void bch2_fs_free(struct bch_fs *c) { + bch2_fs_fsio_exit(c); bch2_fs_encryption_exit(c); bch2_fs_btree_cache_exit(c); bch2_fs_journal_exit(&c->journal); @@ -411,7 +409,6 @@ static void bch2_fs_exit(struct bch_fs *c) { unsigned i; - del_timer_sync(&c->foreground_write_wakeup); cancel_delayed_work_sync(&c->pd_controllers_update); cancel_work_sync(&c->read_only_work); @@ -535,8 +532,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->tiering_enabled = 1; c->tiering_percent = 10; - c->foreground_target_percent = 20; - c->journal.write_time = &c->journal_write_time; c->journal.delay_time = &c->journal_delay_time; c->journal.blocked_time = &c->journal_blocked_time; @@ -600,7 +595,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) 
bch2_fs_btree_cache_init(c) || bch2_fs_encryption_init(c) || bch2_fs_compress_init(c) || - bch2_check_set_has_compressed_data(c, c->opts.compression)) + bch2_check_set_has_compressed_data(c, c->opts.compression) || + bch2_fs_fsio_init(c)) goto err; c->bdi.ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; @@ -1105,8 +1101,10 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ca->dev_idx = dev_idx; __set_bit(ca->dev_idx, ca->self.d); + writepoint_init(&ca->copygc_write_point, BCH_DATA_USER); + spin_lock_init(&ca->freelist_lock); - bch2_dev_moving_gc_init(ca); + bch2_dev_copygc_init(ca); INIT_WORK(&ca->io_error_work, bch2_io_error_work); @@ -1224,10 +1222,7 @@ static int __bch2_dev_online(struct bch_fs *c, struct bch_sb_handle *sb) if (bch2_dev_sysfs_online(ca)) pr_warn("error creating sysfs objects"); - lg_local_lock(&c->usage_lock); - if (!gc_will_visit(c, gc_phase(GC_PHASE_SB_METADATA))) - bch2_mark_dev_metadata(c, ca); - lg_local_unlock(&c->usage_lock); + bch2_mark_dev_superblock(c, ca, 0); if (ca->mi.state == BCH_MEMBER_STATE_RW) bch2_dev_allocator_add(c, ca); @@ -1324,7 +1319,7 @@ static bool bch2_fs_may_start(struct bch_fs *c) static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) { - bch2_moving_gc_stop(ca); + bch2_copygc_stop(ca); /* * This stops new data writes (e.g. to existing open data @@ -1347,8 +1342,8 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) if (bch2_dev_allocator_start(ca)) return "error starting allocator thread"; - if (bch2_moving_gc_start(ca)) - return "error starting moving GC thread"; + if (bch2_copygc_start(c, ca)) + return "error starting copygc thread"; if (bch2_tiering_start(c)) return "error starting tiering thread"; diff --git a/libbcachefs/super.h b/libbcachefs/super.h index 18e36c0..eb1d2f3 100644 --- a/libbcachefs/super.h +++ b/libbcachefs/super.h @@ -35,6 +35,30 @@ static inline unsigned dev_mask_nr(struct bch_devs_mask *devs) return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); } +static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, + unsigned dev) +{ + unsigned i; + + for (i = 0; i < devs.nr; i++) + if (devs.devs[i] == dev) + return true; + + return false; +} + +static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, + unsigned dev) +{ + unsigned i; + + for (i = 0; i < devs->nr; i++) + if (devs->devs[i] == dev) { + array_remove_item(devs->devs, devs->nr, i); + return; + } +} + static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, struct bch_devs_mask *mask) { diff --git a/libbcachefs/super_types.h b/libbcachefs/super_types.h index 756dfeb..35c8beb 100644 --- a/libbcachefs/super_types.h +++ b/libbcachefs/super_types.h @@ -13,4 +13,33 @@ struct bch_devs_mask { unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)]; }; +struct bch_devs_list { + u8 nr; + u8 devs[BCH_REPLICAS_MAX]; +}; + +struct bch_member_cpu { + u64 nbuckets; /* device size */ + u16 first_bucket; /* index of first bucket used */ + u16 bucket_size; /* sectors */ + u8 state; + u8 tier; + u8 replacement; + u8 discard; + u8 data_allowed; + u8 valid; +}; + +struct bch_replicas_cpu_entry { + u8 data_type; + u8 devs[BCH_SB_MEMBERS_MAX / 8]; +}; + +struct bch_replicas_cpu { + struct rcu_head rcu; + unsigned nr; + unsigned entry_size; + struct bch_replicas_cpu_entry entries[]; +}; + #endif /* _BCACHEFS_SUPER_TYPES_H */ diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index c20769b..35f1e56 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -161,8 +161,11 @@ 
read_attribute(meta_buckets); read_attribute(alloc_buckets); read_attribute(has_data); read_attribute(alloc_debug); +write_attribute(wake_allocator); read_attribute(read_realloc_races); +read_attribute(extent_migrate_done); +read_attribute(extent_migrate_raced); rw_attribute(journal_write_delay_ms); rw_attribute(journal_reclaim_delay_ms); @@ -170,7 +173,6 @@ rw_attribute(journal_reclaim_delay_ms); rw_attribute(discard); rw_attribute(cache_replacement_policy); -rw_attribute(foreground_write_ratelimit_enabled); rw_attribute(copy_gc_enabled); sysfs_pd_controller_attribute(copy_gc); @@ -179,12 +181,9 @@ rw_attribute(tiering_enabled); rw_attribute(tiering_percent); sysfs_pd_controller_attribute(tiering); -sysfs_pd_controller_attribute(foreground_write); rw_attribute(pd_controllers_update_seconds); -rw_attribute(foreground_target_percent); - read_attribute(meta_replicas_have); read_attribute(data_replicas_have); @@ -272,18 +271,18 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) if (k.k->type == BCH_EXTENT) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); const struct bch_extent_ptr *ptr; - const union bch_extent_crc *crc; + struct bch_extent_crc_unpacked crc; extent_for_each_ptr_crc(e, ptr, crc) { - if (crc_compression_type(crc) == BCH_COMPRESSION_NONE) { + if (crc.compression_type == BCH_COMPRESSION_NONE) { nr_uncompressed_extents++; uncompressed_sectors += e.k->size; } else { nr_compressed_extents++; compressed_sectors_compressed += - crc_compressed_size(e.k, crc); + crc.compressed_size; compressed_sectors_uncompressed += - crc_uncompressed_size(e.k, crc); + crc.uncompressed_size; } /* only looking at the first ptr */ @@ -323,17 +322,17 @@ SHOW(bch2_fs) sysfs_print(read_realloc_races, atomic_long_read(&c->read_realloc_races)); + sysfs_print(extent_migrate_done, + atomic_long_read(&c->extent_migrate_done)); + sysfs_print(extent_migrate_raced, + atomic_long_read(&c->extent_migrate_raced)); sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); - sysfs_printf(foreground_write_ratelimit_enabled, "%i", - c->foreground_write_ratelimit_enabled); sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); - sysfs_pd_controller_show(foreground_write, &c->foreground_write_pd); sysfs_print(pd_controllers_update_seconds, c->pd_controllers_update_seconds); - sysfs_print(foreground_target_percent, c->foreground_target_percent); sysfs_printf(tiering_enabled, "%i", c->tiering_enabled); sysfs_print(tiering_percent, c->tiering_percent); @@ -371,9 +370,6 @@ STORE(__bch2_fs) sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms); sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); - sysfs_strtoul(foreground_write_ratelimit_enabled, - c->foreground_write_ratelimit_enabled); - if (attr == &sysfs_btree_gc_periodic) { ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) ?: (ssize_t) size; @@ -389,8 +385,8 @@ STORE(__bch2_fs) ?: (ssize_t) size; for_each_member_device(ca, c, i) - if (ca->moving_gc_read) - wake_up_process(ca->moving_gc_read); + if (ca->copygc_thread) + wake_up_process(ca->copygc_thread); return ret; } @@ -402,11 +398,8 @@ STORE(__bch2_fs) return ret; } - sysfs_pd_controller_store(foreground_write, &c->foreground_write_pd); - sysfs_strtoul(pd_controllers_update_seconds, c->pd_controllers_update_seconds); - sysfs_strtoul(foreground_target_percent, c->foreground_target_percent); sysfs_strtoul(tiering_percent, c->tiering_percent); sysfs_pd_controller_store(tiering, &c->tiers[1].pd); /* XXX */ @@ -466,7 +459,6 @@ struct attribute *bch2_fs_files[] 
= { &sysfs_journal_write_delay_ms, &sysfs_journal_reclaim_delay_ms, - &sysfs_foreground_target_percent, &sysfs_tiering_percent, &sysfs_compression_stats, @@ -494,17 +486,17 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_journal_pins, &sysfs_read_realloc_races, + &sysfs_extent_migrate_done, + &sysfs_extent_migrate_raced, &sysfs_trigger_journal_flush, &sysfs_trigger_btree_coalesce, &sysfs_trigger_gc, &sysfs_prune_cache, - &sysfs_foreground_write_ratelimit_enabled, &sysfs_copy_gc_enabled, &sysfs_tiering_enabled, sysfs_pd_controller_files(tiering), - sysfs_pd_controller_files(foreground_write), &sysfs_internal_uuid, #define BCH_DEBUG_PARAM(name, description) &sysfs_##name, @@ -710,17 +702,23 @@ static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf) static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) { struct bch_fs *c = ca->fs; - struct bch_dev_usage stats = bch2_dev_usage_read(ca); + struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); return scnprintf(buf, PAGE_SIZE, "free_inc: %zu/%zu\n" "free[RESERVE_BTREE]: %zu/%zu\n" "free[RESERVE_MOVINGGC]: %zu/%zu\n" "free[RESERVE_NONE]: %zu/%zu\n" - "alloc: %llu/%llu\n" - "meta: %llu/%llu\n" - "dirty: %llu/%llu\n" - "available: %llu/%llu\n" + "buckets:\n" + " capacity: %llu\n" + " alloc: %llu\n" + " meta: %llu\n" + " dirty: %llu\n" + " available: %llu\n" + "sectors:\n" + " meta: %llu\n" + " dirty: %llu\n" + " cached: %llu\n" "freelist_wait: %s\n" "open buckets: %u/%u (reserved %u)\n" "open_buckets_wait: %s\n", @@ -728,10 +726,14 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size, fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, - stats.buckets_alloc, ca->mi.nbuckets - ca->mi.first_bucket, - stats.buckets[S_META], ca->mi.nbuckets - ca->mi.first_bucket, - stats.buckets[S_DIRTY], ca->mi.nbuckets - ca->mi.first_bucket, - __dev_buckets_available(ca, stats), ca->mi.nbuckets - ca->mi.first_bucket, + ca->mi.nbuckets - ca->mi.first_bucket, + stats.buckets_alloc, + stats.buckets[S_META], + stats.buckets[S_DIRTY], + __dev_buckets_available(ca, stats), + stats.sectors[S_META], + stats.sectors[S_DIRTY], + stats.sectors_cached, c->freelist_wait.list.first ? "waiting" : "empty", c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, BTREE_NODE_RESERVE, c->open_buckets_wait.list.first ? 
"waiting" : "empty"); @@ -769,7 +771,7 @@ SHOW(bch2_dev) { struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); struct bch_fs *c = ca->fs; - struct bch_dev_usage stats = bch2_dev_usage_read(ca); + struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); char *out = buf, *end = buf + PAGE_SIZE; sysfs_printf(uuid, "%pU\n", ca->uuid.b); @@ -788,8 +790,8 @@ SHOW(bch2_dev) sysfs_print(cached_buckets, stats.buckets_cached); sysfs_print(meta_buckets, stats.buckets[S_META]); sysfs_print(alloc_buckets, stats.buckets_alloc); - sysfs_print(available_buckets, dev_buckets_available(ca)); - sysfs_print(free_buckets, dev_buckets_free(ca)); + sysfs_print(available_buckets, __dev_buckets_available(ca, stats)); + sysfs_print(free_buckets, __dev_buckets_free(ca, stats)); if (attr == &sysfs_has_data) { out += bch2_scnprint_flag_list(out, end - out, @@ -799,7 +801,7 @@ SHOW(bch2_dev) return out - buf; } - sysfs_pd_controller_show(copy_gc, &ca->moving_gc_pd); + sysfs_pd_controller_show(copy_gc, &ca->copygc_pd); if (attr == &sysfs_cache_replacement_policy) { out += bch2_scnprint_string_list(out, end - out, @@ -843,7 +845,7 @@ STORE(bch2_dev) struct bch_fs *c = ca->fs; struct bch_member *mi; - sysfs_pd_controller_store(copy_gc, &ca->moving_gc_pd); + sysfs_pd_controller_store(copy_gc, &ca->copygc_pd); if (attr == &sysfs_discard) { bool v = strtoul_or_return(buf); @@ -899,6 +901,9 @@ STORE(bch2_dev) bch2_tiering_start(c); } + if (attr == &sysfs_wake_allocator) + bch2_wake_allocator(ca); + return size; } SYSFS_OPS(bch2_dev); @@ -942,6 +947,7 @@ struct attribute *bch2_dev_files[] = { /* debug: */ &sysfs_alloc_debug, + &sysfs_wake_allocator, sysfs_pd_controller_files(copy_gc), NULL diff --git a/libbcachefs/tier.c b/libbcachefs/tier.c index cbfcfcc..2e29f74 100644 --- a/libbcachefs/tier.c +++ b/libbcachefs/tier.c @@ -15,105 +15,23 @@ #include #include -struct tiering_state { - struct bch_tier *tier; - unsigned sectors; - unsigned stripe_size; - unsigned dev_idx; - struct bch_dev *ca; -}; - -static bool tiering_pred(struct bch_fs *c, - struct bch_tier *tier, - struct bkey_s_c k) +static bool tiering_pred(void *arg, struct bkey_s_c_extent e) { - if (bkey_extent_is_data(k.k)) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr; - unsigned replicas = 0; - - /* Make sure we have room to add a new pointer: */ - if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX > - BKEY_EXTENT_VAL_U64s_MAX) - return false; - - extent_for_each_ptr(e, ptr) - if (c->devs[ptr->dev]->mi.tier >= tier->idx) - replicas++; - - return replicas < c->opts.data_replicas; - } - - return false; -} - -static int issue_tiering_move(struct bch_fs *c, - struct bch_tier *tier, - struct moving_context *ctxt, - struct bkey_s_c k) -{ - int ret; - - ret = bch2_data_move(c, ctxt, &tier->devs, k, NULL); - if (!ret) - trace_tiering_copy(k.k); - else - trace_tiering_alloc_fail(c, k.k->size); - - return ret; -} - -/** - * tiering_next_cache - issue a move to write an extent to the next cache - * device in round robin order - */ -static s64 read_tiering(struct bch_fs *c, struct bch_tier *tier) -{ - struct moving_context ctxt; - struct btree_iter iter; - struct bkey_s_c k; - unsigned nr_devices = dev_mask_nr(&tier->devs); - int ret; - - if (!nr_devices) - return 0; - - trace_tiering_start(c); - - bch2_move_ctxt_init(&ctxt, &tier->pd.rate, - nr_devices * SECTORS_IN_FLIGHT_PER_DEVICE); - bch2_btree_iter_init(&iter, c, BTREE_ID_EXTENTS, POS_MIN, - BTREE_ITER_PREFETCH); - - while (!kthread_should_stop() && - 
!bch2_move_ctxt_wait(&ctxt) && - (k = bch2_btree_iter_peek(&iter)).k && - !btree_iter_err(k)) { - if (!tiering_pred(c, tier, k)) - goto next; - - ret = issue_tiering_move(c, tier, &ctxt, k); - if (ret) { - bch2_btree_iter_unlock(&iter); - - /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(&ctxt); - continue; - } -next: - bch2_btree_iter_advance_pos(&iter); - //bch2_btree_iter_cond_resched(&iter); + struct bch_tier *tier = arg; + struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]); + const struct bch_extent_ptr *ptr; + unsigned replicas = 0; - /* unlock before calling moving_context_wait() */ - bch2_btree_iter_unlock(&iter); - cond_resched(); - } + /* Make sure we have room to add a new pointer: */ + if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX > + BKEY_EXTENT_VAL_U64s_MAX) + return false; - bch2_btree_iter_unlock(&iter); - bch2_move_ctxt_exit(&ctxt); - trace_tiering_end(c, ctxt.sectors_moved, ctxt.keys_moved); + extent_for_each_ptr(e, ptr) + if (c->devs[ptr->dev]->mi.tier >= tier->idx) + replicas++; - return ctxt.sectors_moved; + return replicas < c->opts.data_replicas; } static int bch2_tiering_thread(void *arg) @@ -122,15 +40,15 @@ static int bch2_tiering_thread(void *arg) struct bch_fs *c = container_of(tier, struct bch_fs, tiers[tier->idx]); struct io_clock *clock = &c->io_clock[WRITE]; struct bch_dev *ca; - u64 tier_capacity, available_sectors; + u64 tier_capacity, available_sectors, keys_moved, sectors_moved; unsigned long last; - unsigned i; + unsigned i, nr_devices; set_freezable(); while (!kthread_should_stop()) { if (kthread_wait_freezable(c->tiering_enabled && - dev_mask_nr(&tier->devs))) + (nr_devices = dev_mask_nr(&tier->devs)))) break; while (1) { @@ -151,7 +69,7 @@ static int bch2_tiering_thread(void *arg) ca->mi.first_bucket); available_sectors += bucket_to_sector(ca, - dev_buckets_available(ca)); + dev_buckets_available(c, ca)); } rcu_read_unlock(); } @@ -167,7 +85,15 @@ static int bch2_tiering_thread(void *arg) return 0; } - read_tiering(c, tier); + bch2_move_data(c, &tier->pd.rate, + SECTORS_IN_FLIGHT_PER_DEVICE * nr_devices, + &tier->devs, + writepoint_ptr(&tier->wp), + 0, + -1, + tiering_pred, tier, + &keys_moved, + §ors_moved); } return 0; diff --git a/libbcachefs/util.c b/libbcachefs/util.c index 2eb8ca7..fa85375 100644 --- a/libbcachefs/util.c +++ b/libbcachefs/util.c @@ -291,13 +291,15 @@ void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done) int bch2_ratelimit_wait_freezable_stoppable(struct bch_ratelimit *d) { + bool kthread = (current->flags & PF_KTHREAD) != 0; + while (1) { u64 delay = bch2_ratelimit_delay(d); if (delay) set_current_state(TASK_INTERRUPTIBLE); - if (kthread_should_stop()) + if (kthread && kthread_should_stop()) return 1; if (!delay) @@ -434,8 +436,11 @@ size_t bch2_rand_range(size_t max) { size_t rand; + if (!max) + return 0; + do { - get_random_bytes(&rand, sizeof(rand)); + rand = get_random_long(); rand &= roundup_pow_of_two(max) - 1; } while (rand >= max); @@ -642,3 +647,129 @@ void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) return vpmalloc(size, gfp_mask); } + +#if 0 +void eytzinger1_test(void) +{ + unsigned inorder, eytz, size; + + pr_info("1 based eytzinger test:"); + + for (size = 2; + size < 65536; + size++) { + unsigned extra = eytzinger1_extra(size); + + if (!(size % 4096)) + pr_info("tree size %u", size); + + BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size)); + BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size)); + + 
BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0); + BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0); + + inorder = 1; + eytzinger1_for_each(eytz, size) { + BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz); + BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder); + BUG_ON(eytz != eytzinger1_last(size) && + eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz); + + inorder++; + } + } +} + +void eytzinger0_test(void) +{ + + unsigned inorder, eytz, size; + + pr_info("0 based eytzinger test:"); + + for (size = 1; + size < 65536; + size++) { + unsigned extra = eytzinger0_extra(size); + + if (!(size % 4096)) + pr_info("tree size %u", size); + + BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size)); + BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size)); + + BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1); + BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1); + + inorder = 0; + eytzinger0_for_each(eytz, size) { + BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz); + BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder); + BUG_ON(eytz != eytzinger0_last(size) && + eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz); + + inorder++; + } + } +} + +static inline int cmp_u16(const void *_l, const void *_r, size_t size) +{ + const u16 *l = _l, *r = _r; + + return (*l > *r) - (*r - *l); +} + +static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) +{ + int i, c1 = -1, c2 = -1; + ssize_t r; + + r = eytzinger0_find_le(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); + if (r >= 0) + c1 = test_array[r]; + + for (i = 0; i < nr; i++) + if (test_array[i] <= search && test_array[i] > c2) + c2 = test_array[i]; + + if (c1 != c2) { + eytzinger0_for_each(i, nr) + pr_info("[%3u] = %12u", i, test_array[i]); + pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i", + i, r, c1, c2); + } +} + +void eytzinger0_find_test(void) +{ + unsigned i, nr, allocated = 1 << 12; + u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL); + + for (nr = 1; nr < allocated; nr++) { + pr_info("testing %u elems", nr); + + get_random_bytes(test_array, nr * sizeof(test_array[0])); + eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); + + /* verify array is sorted correctly: */ + eytzinger0_for_each(i, nr) + BUG_ON(i != eytzinger0_last(nr) && + test_array[i] > test_array[eytzinger0_next(i, nr)]); + + for (i = 0; i < U16_MAX; i += 1 << 12) + eytzinger0_find_test_val(test_array, nr, i); + + for (i = 0; i < nr; i++) { + eytzinger0_find_test_val(test_array, nr, test_array[i] - 1); + eytzinger0_find_test_val(test_array, nr, test_array[i]); + eytzinger0_find_test_val(test_array, nr, test_array[i] + 1); + } + } + + kfree(test_array); +} +#endif diff --git a/libbcachefs/util.h b/libbcachefs/util.h index b91b2dc..a251bf9 100644 --- a/libbcachefs/util.h +++ b/libbcachefs/util.h @@ -789,4 +789,28 @@ void sort_cmp_size(void *base, size_t num, size_t size, int (*cmp_func)(const void *, const void *, size_t), void (*swap_func)(void *, void *, size_t)); +/* just the memmove, doesn't update @_nr */ +#define __array_insert_item(_array, _nr, _pos) \ + memmove(&(_array)[(_pos) + 1], \ + &(_array)[(_pos)], \ + sizeof((_array)[0]) * ((_nr) - (_pos))) + +#define array_insert_item(_array, _nr, _pos, _new_item) \ +do { \ + __array_insert_item(_array, _nr, _pos); \ + (_nr)++; \ + (_array)[(_pos)] = (_new_item); \ +} while (0) + +#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ 
+do { \ + (_nr) -= (_nr_to_remove); \ + memmove(&(_array)[(_pos)], \ + &(_array)[(_pos) + (_nr_to_remove)], \ + sizeof((_array)[0]) * ((_nr) - (_pos))); \ +} while (0) + +#define array_remove_item(_array, _nr, _pos) \ + array_remove_items(_array, _nr, _pos, 1) + #endif /* _BCACHEFS_UTIL_H */ -- 2.39.2
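The common thread in the migrate.c, movinggc.c and tier.c hunks is that three hand-rolled move loops are replaced by one walker, bch2_move_data(), driven by a move_pred_fn callback (migrate_pred, copygc_pred, tiering_pred). The user-space sketch below only illustrates that callback split; walk_extents(), struct demo_extent and evacuate_pred() are invented stand-ins for the real iterator and predicates, not bcachefs code.

#include <stdbool.h>
#include <stdio.h>

/* invented stand-in for a bkey/extent; not a bcachefs type */
struct demo_extent {
	unsigned dev;		/* device the (single) pointer lives on */
	unsigned sectors;
};

/* same shape as move_pred_fn in move.h: bool (*)(void *, extent) */
typedef bool (*demo_pred_fn)(void *arg, const struct demo_extent *e);

/*
 * invented walker playing the role of bch2_move_data(): it owns the loop
 * and only asks the caller's predicate whether a given extent should move
 */
static unsigned walk_extents(const struct demo_extent *extents, unsigned nr,
			     demo_pred_fn pred, void *arg)
{
	unsigned sectors_moved = 0;
	unsigned i;

	for (i = 0; i < nr; i++)
		if (pred(arg, &extents[i]))
			sectors_moved += extents[i].sectors; /* would be queued for rewrite */

	return sectors_moved;
}

/* analogous to migrate_pred(): move everything on the device being evacuated */
static bool evacuate_pred(void *arg, const struct demo_extent *e)
{
	unsigned *dev_idx = arg;

	return e->dev == *dev_idx;
}

int main(void)
{
	struct demo_extent extents[] = {
		{ .dev = 0, .sectors = 8  },
		{ .dev = 1, .sectors = 16 },
		{ .dev = 0, .sectors = 32 },
	};
	unsigned evacuate_dev = 0;

	printf("would move %u sectors off dev %u\n",
	       walk_extents(extents, 3, evacuate_pred, &evacuate_dev),
	       evacuate_dev);
	return 0;
}

The real bch2_move_data() additionally takes a rate limiter, a device mask, a write point and btree insert flags, but each caller's policy now lives entirely in its predicate.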
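The array_insert_item()/array_remove_item() helpers added to util.h (used via __array_insert_item() in the journal bucket resize path) are plain memmove wrappers. A stand-alone user-space check, with the macro bodies taken from the hunk and an invented main() plus test array as the driver:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* from the util.h hunk: just the memmove, doesn't update @_nr */
#define __array_insert_item(_array, _nr, _pos)				\
	memmove(&(_array)[(_pos) + 1],					\
		&(_array)[(_pos)],					\
		sizeof((_array)[0]) * ((_nr) - (_pos)))

#define array_insert_item(_array, _nr, _pos, _new_item)			\
do {									\
	__array_insert_item(_array, _nr, _pos);				\
	(_nr)++;							\
	(_array)[(_pos)] = (_new_item);					\
} while (0)

#define array_remove_items(_array, _nr, _pos, _nr_to_remove)		\
do {									\
	(_nr) -= (_nr_to_remove);					\
	memmove(&(_array)[(_pos)],					\
		&(_array)[(_pos) + (_nr_to_remove)],			\
		sizeof((_array)[0]) * ((_nr) - (_pos)));		\
} while (0)

#define array_remove_item(_array, _nr, _pos)				\
	array_remove_items(_array, _nr, _pos, 1)

int main(void)
{
	uint64_t buckets[8] = { 10, 20, 30 };	/* invented test data */
	unsigned nr = 3, i;

	array_insert_item(buckets, nr, 1, 15);	/* 10 15 20 30 */
	array_remove_item(buckets, nr, 2);	/* 10 15 30 */

	for (i = 0; i < nr; i++)
		printf("%llu ", (unsigned long long) buckets[i]);
	printf("\n");
	return 0;
}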
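The reworked bch2_rand_range() in util.c draws an unbiased value in [0, max) by masking a random word down to the next power of two and retrying out-of-range draws, and now tolerates max == 0. A rough user-space equivalent under assumed names (rand_range(), next_pow2(), and libc random() standing in for get_random_long() and roundup_pow_of_two()):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* smallest power of two >= n, for n >= 1 (stand-in for roundup_pow_of_two) */
static unsigned long next_pow2(unsigned long n)
{
	unsigned long p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

/*
 * same mask-and-reject scheme as bch2_rand_range(): the mask restricts the
 * draw to [0, next_pow2(max)), and out-of-range values are retried rather
 * than folded with %, which would bias the result when max isn't a power
 * of two
 */
static unsigned long rand_range(unsigned long max)
{
	unsigned long r;

	if (!max)
		return 0;

	do {
		r = (unsigned long) random();
		r &= next_pow2(max) - 1;
	} while (r >= max);

	return r;
}

int main(void)
{
	unsigned i;

	srandom((unsigned) time(NULL));
	for (i = 0; i < 5; i++)
		printf("%lu\n", rand_range(1000));
	return 0;
}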