From cd9892e543845e045142ed29e4a5a9ce446a205e Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Mon, 24 Aug 2020 16:05:04 -0400
Subject: [PATCH] Update bcachefs sources to 10ab39f2fa bcachefs: Improvements
 to the journal read error paths

---
 .bcachefs_revision                  |   2 +-
 cmd_fs.c                            |  10 +-
 include/linux/blkdev.h              |   1 +
 include/linux/vmalloc.h             |  26 +-
 include/trace/events/bcachefs.h     |   6 +-
 libbcachefs.c                       |   4 +-
 libbcachefs/alloc_background.c      |  36 +-
 libbcachefs/alloc_foreground.c      | 264 ++++++-------
 libbcachefs/alloc_foreground.h      |  23 +-
 libbcachefs/alloc_types.h           |   1 +
 libbcachefs/bcachefs.h              |  27 +-
 libbcachefs/bcachefs_format.h       |  19 +-
 libbcachefs/bset.c                  |  61 ---
 libbcachefs/bset.h                  |  34 +-
 libbcachefs/btree_cache.c           |  59 +--
 libbcachefs/btree_cache.h           |   7 +-
 libbcachefs/btree_gc.c              |  27 +-
 libbcachefs/btree_io.c              | 117 +++---
 libbcachefs/btree_io.h              |  34 +-
 libbcachefs/btree_key_cache.c       |   2 +-
 libbcachefs/btree_types.h           |   6 +-
 libbcachefs/btree_update_interior.c |  46 +--
 libbcachefs/btree_update_interior.h |   4 +-
 libbcachefs/btree_update_leaf.c     |  18 +-
 libbcachefs/buckets.c               | 447 +++++++++++-----------
 libbcachefs/buckets.h               |  21 +-
 libbcachefs/buckets_types.h         |   2 +
 libbcachefs/chardev.c               |   2 +-
 libbcachefs/clock.c                 |   7 +-
 libbcachefs/clock.h                 |   2 +-
 libbcachefs/compress.c              |   6 +-
 libbcachefs/disk_groups.c           |  11 +-
 libbcachefs/disk_groups.h           |   3 +
 libbcachefs/ec.c                    | 561 +++++++++++++++++++++-------
 libbcachefs/ec.h                    |  18 +-
 libbcachefs/ec_types.h              |   1 +
 libbcachefs/extents.c               |  16 +-
 libbcachefs/fs-io.c                 |  25 +-
 libbcachefs/fs-ioctl.c              |   6 +-
 libbcachefs/fs.c                    |   9 +-
 libbcachefs/fsck.c                  |   4 +
 libbcachefs/io.c                    |  54 ++-
 libbcachefs/io.h                    |   2 +
 libbcachefs/io_types.h              |   1 -
 libbcachefs/journal.c               |  40 +-
 libbcachefs/journal.h               |   4 +-
 libbcachefs/journal_io.c            | 102 +++--
 libbcachefs/journal_io.h            |   2 +
 libbcachefs/journal_reclaim.c       |   8 +-
 libbcachefs/journal_seq_blacklist.c |   9 -
 libbcachefs/journal_seq_blacklist.h |   9 +
 libbcachefs/move.c                  |  20 +-
 libbcachefs/movinggc.c              | 245 ++++++------
 libbcachefs/movinggc.h              |   6 +-
 libbcachefs/opts.c                  |   9 +-
 libbcachefs/opts.h                  |   7 +-
 libbcachefs/rebalance.c             |  19 +-
 libbcachefs/rebalance.h             |   2 +-
 libbcachefs/recovery.c              |  21 +-
 libbcachefs/replicas.c              | 101 ++---
 libbcachefs/replicas.h              |  10 +-
 libbcachefs/super-io.c              |   7 +-
 libbcachefs/super.c                 |  62 +--
 libbcachefs/super.h                 |   9 +
 libbcachefs/sysfs.c                 | 271 +++++++------
 libbcachefs/util.c                  |  25 +-
 libbcachefs/util.h                  |   4 +-
 libbcachefs/xattr.c                 |   6 +-
 linux/bio.c                         |   9 +
 69 files changed, 1692 insertions(+), 1347 deletions(-)

diff --git a/.bcachefs_revision b/.bcachefs_revision
index ac11557..7f15ea6 100644
--- a/.bcachefs_revision
+++ b/.bcachefs_revision
@@ -1 +1 @@
-f385d13bf8510277eedcc24de4a6a3c3bf8f334b
+10ab39f2faede817eebfd04a4990e739d0cedcb8
diff --git a/cmd_fs.c b/cmd_fs.c
index 499a2c4..f0b67b6 100644
--- a/cmd_fs.c
+++ b/cmd_fs.c
@@ -42,7 +42,7 @@ static void print_dev_usage(struct bchfs_handle fs,
 	printf("%-20s%12s%12s%12s\n",
 	       "", "data", "buckets", "fragmented");
 
-	for (i = BCH_DATA_SB; i < BCH_DATA_NR; i++) {
+	for (i = BCH_DATA_sb; i < BCH_DATA_NR; i++) {
 		print_dev_usage_type(bch2_data_types[i],
 				     u.bucket_size,
 				     u.buckets[i],
@@ -162,21 +162,21 @@ static void print_fs_usage(const char *path, enum units units)
 	struct bch_replicas_usage *r;
 
 	for_each_usage_replica(u, r)
-		if (r->r.data_type < BCH_DATA_USER)
+		if (r->r.data_type < BCH_DATA_user)
 			print_replicas_usage(r, &dev_names, units);
 
 	for_each_usage_replica(u, r)
-		if (r->r.data_type == BCH_DATA_USER &&
+		if (r->r.data_type == BCH_DATA_user &&
 		    r->r.nr_required <= 1)
 			print_replicas_usage(r, &dev_names, units);
 
 	for_each_usage_replica(u, r)
-		if (r->r.data_type == BCH_DATA_USER &&
+		if (r->r.data_type == BCH_DATA_user &&
 		    r->r.nr_required > 1)
 			print_replicas_usage(r, &dev_names, units);
 
 	for_each_usage_replica(u, r)
-		if (r->r.data_type > BCH_DATA_USER)
+		if (r->r.data_type > BCH_DATA_user)
 			print_replicas_usage(r, &dev_names, units);
 
 	free(u);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e4982f9..318bcfa 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -201,6 +201,7 @@ static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx)
 
 int blk_status_to_errno(blk_status_t status);
 blk_status_t errno_to_blk_status(int errno);
+const char *blk_status_to_str(blk_status_t status);
 
 #endif /* __TOOLS_LINUX_BLKDEV_H */
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index c91e3a8..efcc191 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -12,7 +12,7 @@
 
 #define vfree(p)	free(p)
 
-static inline void *__vmalloc(unsigned long size, gfp_t gfp_mask, unsigned prot)
+static inline void *__vmalloc(unsigned long size, gfp_t gfp_mask)
 {
 	void *p;
 
@@ -22,26 +22,36 @@ static inline void *__vmalloc(unsigned long size, gfp_t gfp_mask, unsigned prot)
 	if (!p)
 		return NULL;
 
-	if (prot == PAGE_KERNEL_EXEC &&
-	    mprotect(p, size, PROT_READ|PROT_WRITE|PROT_EXEC)) {
+	if (gfp_mask & __GFP_ZERO)
+		memset(p, 0, size);
+
+	return p;
+}
+
+static inline void *vmalloc_exec(unsigned long size, gfp_t gfp_mask)
+{
+	void *p;
+
+	p = __vmalloc(size, gfp_mask);
+	if (!p)
+		return NULL;
+
+	if (mprotect(p, size, PROT_READ|PROT_WRITE|PROT_EXEC)) {
 		vfree(p);
 		return NULL;
 	}
 
-	if (gfp_mask & __GFP_ZERO)
-		memset(p, 0, size);
-
 	return p;
 }
 
 static inline void *vmalloc(unsigned long size)
 {
-	return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
+	return __vmalloc(size, GFP_KERNEL);
 }
 
 static inline void *vzalloc(unsigned long size)
 {
-	return __vmalloc(size, GFP_KERNEL|__GFP_ZERO, PAGE_KERNEL);
+	return __vmalloc(size, GFP_KERNEL|__GFP_ZERO);
 }
 
 #endif /* __TOOLS_LINUX_VMALLOC_H */
diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h
index bafbcca..9b4e829 100644
--- a/include/trace/events/bcachefs.h
+++ b/include/trace/events/bcachefs.h
@@ -470,10 +470,10 @@ TRACE_EVENT(move_data,
 );
 
 TRACE_EVENT(copygc,
-	TP_PROTO(struct bch_dev *ca,
+	TP_PROTO(struct bch_fs *c,
 		 u64 sectors_moved, u64 sectors_not_moved,
 		 u64 buckets_moved, u64 buckets_not_moved),
-	TP_ARGS(ca,
+	TP_ARGS(c,
 		sectors_moved, sectors_not_moved,
 		buckets_moved, buckets_not_moved),
 
@@ -486,7 +486,7 @@ TRACE_EVENT(copygc,
 	),
 
 	TP_fast_assign(
-		memcpy(__entry->uuid, ca->uuid.b, 16);
+		memcpy(__entry->uuid, c->sb.user_uuid.b, 16);
 		__entry->sectors_moved		= sectors_moved;
 		__entry->sectors_not_moved	= sectors_not_moved;
 		__entry->buckets_moved		= buckets_moved;
diff --git a/libbcachefs.c b/libbcachefs.c
index 3122723..260e6c7 100644
--- a/libbcachefs.c
+++ b/libbcachefs.c
@@ -948,8 +948,8 @@ int bchu_data(struct bchfs_handle fs, struct bch_ioctl_data cmd)
 			       bch2_data_types[e.p.data_type]);
 
 			switch (e.p.data_type) {
-			case BCH_DATA_BTREE:
-			case BCH_DATA_USER:
+			case BCH_DATA_btree:
+			case BCH_DATA_user:
 				printf(" %s:%llu:%llu",
 				       bch2_btree_ids[e.p.btree_id],
 				       e.p.pos.inode,
diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c
index cb720ee..9aa0b42 100644
--- a/libbcachefs/alloc_background.c
+++ b/libbcachefs/alloc_background.c
@@ -41,29 +41,26 @@ static void pd_controllers_update(struct work_struct *work)
 					   struct bch_fs,
 					   pd_controllers_update);
 	struct bch_dev *ca;
+	s64 free = 0, fragmented = 0;
 	unsigned i;
 
 	for_each_member_device(ca, c, i) {
-		struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
+		struct bch_dev_usage stats = bch2_dev_usage_read(ca);
 
-		u64 free = bucket_to_sector(ca,
+		free += bucket_to_sector(ca,
 				__dev_buckets_free(ca, stats)) << 9;
 		/*
 		 * Bytes of internal fragmentation, which can be
 		 * reclaimed by copy GC
 		 */
-		s64 fragmented = (bucket_to_sector(ca,
-					stats.buckets[BCH_DATA_USER] +
-					stats.buckets[BCH_DATA_CACHED]) -
-				  (stats.sectors[BCH_DATA_USER] +
-				   stats.sectors[BCH_DATA_CACHED])) << 9;
-
-		fragmented = max(0LL, fragmented);
-
-		bch2_pd_controller_update(&ca->copygc_pd,
-					  free, fragmented, -1);
+		fragmented += max_t(s64, 0, (bucket_to_sector(ca,
+					stats.buckets[BCH_DATA_user] +
+					stats.buckets[BCH_DATA_cached]) -
+				  (stats.sectors[BCH_DATA_user] +
+				   stats.sectors[BCH_DATA_cached])) << 9);
 	}
 
+	bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1);
 	schedule_delayed_work(&c->pd_controllers_update,
 			      c->pd_controllers_update_seconds * HZ);
 }
@@ -353,6 +350,8 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote)
 		bch2_btree_iter_set_pos(iter, POS(i, first_bucket));
 
 		while (1) {
+			bch2_trans_cond_resched(&trans);
+
 			ret = bch2_alloc_write_key(&trans, iter, flags);
 			if (ret < 0 || ret == ALLOC_END)
 				break;
@@ -517,11 +516,13 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca)
 		if (gc_count != c->gc_count)
 			ca->inc_gen_really_needs_gc = 0;
 
-		available = max_t(s64, 0, dev_buckets_available(c, ca) -
+		available = max_t(s64, 0, dev_buckets_available(ca) -
 				  ca->inc_gen_really_needs_gc);
 
 		if (available > fifo_free(&ca->free_inc) ||
-		    (available && !fifo_full(&ca->free[RESERVE_BTREE])))
+		    (available &&
+		     (!fifo_full(&ca->free[RESERVE_BTREE]) ||
+		      !fifo_full(&ca->free[RESERVE_MOVINGGC]))))
 			break;
 
 		up_read(&c->gc_lock);
@@ -1191,7 +1192,7 @@ stop:
 void bch2_recalc_capacity(struct bch_fs *c)
 {
 	struct bch_dev *ca;
-	u64 capacity = 0, reserved_sectors = 0, gc_reserve;
+	u64 capacity = 0, reserved_sectors = 0, gc_reserve, copygc_threshold = 0;
 	unsigned bucket_size_max = 0;
 	unsigned long ra_pages = 0;
 	unsigned i, j;
@@ -1234,7 +1235,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
 
 		dev_reserve *= ca->mi.bucket_size;
 
-		ca->copygc_threshold = dev_reserve;
+		copygc_threshold += dev_reserve;
 
 		capacity += bucket_to_sector(ca, ca->mi.nbuckets -
 					     ca->mi.first_bucket);
@@ -1253,6 +1254,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
 
 	reserved_sectors = min(reserved_sectors, capacity);
 
+	c->copygc_threshold = copygc_threshold;
 	c->capacity = capacity - reserved_sectors;
 
 	c->bucket_size_max = bucket_size_max;
@@ -1312,7 +1314,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
 	for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
 		bch2_writepoint_stop(c, ca, &c->write_points[i]);
 
-	bch2_writepoint_stop(c, ca, &ca->copygc_write_point);
+	bch2_writepoint_stop(c, ca, &c->copygc_write_point);
 	bch2_writepoint_stop(c, ca, &c->rebalance_write_point);
 	bch2_writepoint_stop(c, ca, &c->btree_write_point);
diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c
index 979aba3..4a04882 100644
--- a/libbcachefs/alloc_foreground.c
+++ b/libbcachefs/alloc_foreground.c
@@ -70,12 +70,6 @@
 #include
 #include
 
-enum bucket_alloc_ret {
-	ALLOC_SUCCESS,
-	OPEN_BUCKETS_EMPTY,
-	FREELIST_EMPTY,	/* Allocator thread not keeping up */
-};
-
 /*
  * Open buckets represent a bucket that's currently being allocated from.  They
  * serve two purposes:
@@ -150,12 +144,13 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c)
 }
 
 static void open_bucket_free_unused(struct bch_fs *c,
-				    struct open_bucket *ob,
-				    bool may_realloc)
+				    struct write_point *wp,
+				    struct open_bucket *ob)
 {
 	struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev);
+	bool may_realloc = wp->type == BCH_DATA_user;
 
-	BUG_ON(ca->open_buckets_partial_nr >=
+	BUG_ON(ca->open_buckets_partial_nr >
 	       ARRAY_SIZE(ca->open_buckets_partial));
 
 	if (ca->open_buckets_partial_nr <
@@ -234,13 +229,22 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
 
 	spin_lock(&c->freelist_lock);
 
-	if (may_alloc_partial &&
-	    ca->open_buckets_partial_nr) {
-		ob = c->open_buckets +
-			ca->open_buckets_partial[--ca->open_buckets_partial_nr];
-		ob->on_partial_list = false;
-		spin_unlock(&c->freelist_lock);
-		return ob;
+	if (may_alloc_partial) {
+		int i;
+
+		for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) {
+			ob = c->open_buckets + ca->open_buckets_partial[i];
+
+			if (reserve <= ob->alloc_reserve) {
+				array_remove_item(ca->open_buckets_partial,
+						  ca->open_buckets_partial_nr,
+						  i);
+				ob->on_partial_list = false;
+				ob->alloc_reserve = reserve;
+				spin_unlock(&c->freelist_lock);
+				return ob;
+			}
+		}
 	}
 
 	if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) {
@@ -297,6 +301,7 @@ out:
 
 	ob->valid	= true;
 	ob->sectors_free = ca->mi.bucket_size;
+	ob->alloc_reserve = reserve;
 	ob->ptr		= (struct bch_extent_ptr) {
 		.type	= 1 << BCH_EXTENT_ENTRY_ptr,
 		.gen	= buckets->b[bucket].mark.gen,
@@ -344,21 +349,20 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c,
 			  struct bch_devs_mask *devs)
 {
 	struct dev_alloc_list ret = { .nr = 0 };
-	struct bch_dev *ca;
 	unsigned i;
 
-	for_each_member_device_rcu(ca, c, i, devs)
+	for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX)
 		ret.devs[ret.nr++] = i;
 
 	bubble_sort(ret.devs, ret.nr, dev_stripe_cmp);
 	return ret;
 }
 
-void bch2_dev_stripe_increment(struct bch_fs *c, struct bch_dev *ca,
+void bch2_dev_stripe_increment(struct bch_dev *ca,
 			       struct dev_stripe_state *stripe)
 {
 	u64 *v = stripe->next_alloc + ca->dev_idx;
-	u64 free_space = dev_buckets_free(c, ca);
+	u64 free_space = dev_buckets_free(ca);
 	u64 free_space_inv = free_space ?
 		div64_u64(1ULL << 48, free_space) : 1ULL << 48;
@@ -396,21 +400,22 @@ static void add_new_bucket(struct bch_fs *c,
 	ob_push(c, ptrs, ob);
 }
 
-static int bch2_bucket_alloc_set(struct bch_fs *c,
-				 struct open_buckets *ptrs,
-				 struct dev_stripe_state *stripe,
-				 struct bch_devs_mask *devs_may_alloc,
-				 unsigned nr_replicas,
-				 unsigned *nr_effective,
-				 bool *have_cache,
-				 enum alloc_reserve reserve,
-				 unsigned flags,
-				 struct closure *cl)
+enum bucket_alloc_ret
+bch2_bucket_alloc_set(struct bch_fs *c,
+		      struct open_buckets *ptrs,
+		      struct dev_stripe_state *stripe,
+		      struct bch_devs_mask *devs_may_alloc,
+		      unsigned nr_replicas,
+		      unsigned *nr_effective,
+		      bool *have_cache,
+		      enum alloc_reserve reserve,
+		      unsigned flags,
+		      struct closure *cl)
 {
 	struct dev_alloc_list devs_sorted =
 		bch2_dev_alloc_list(c, stripe, devs_may_alloc);
 	struct bch_dev *ca;
-	bool alloc_failure = false;
+	enum bucket_alloc_ret ret = INSUFFICIENT_DEVICES;
 	unsigned i;
 
 	BUG_ON(*nr_effective >= nr_replicas);
@@ -428,101 +433,27 @@ static int bch2_bucket_alloc_set(struct bch_fs *c,
 		ob = bch2_bucket_alloc(c, ca, reserve,
 				       flags & BUCKET_MAY_ALLOC_PARTIAL, cl);
 		if (IS_ERR(ob)) {
-			enum bucket_alloc_ret ret = -PTR_ERR(ob);
-
-			WARN_ON(reserve == RESERVE_MOVINGGC &&
-				ret != OPEN_BUCKETS_EMPTY);
+			ret = -PTR_ERR(ob);
 
 			if (cl)
-				return -EAGAIN;
-			if (ret == OPEN_BUCKETS_EMPTY)
-				return -ENOSPC;
-			alloc_failure = true;
+				return ret;
 			continue;
 		}
 
 		add_new_bucket(c, ptrs, devs_may_alloc,
 			       nr_effective, have_cache, flags, ob);
 
-		bch2_dev_stripe_increment(c, ca, stripe);
+		bch2_dev_stripe_increment(ca, stripe);
 
 		if (*nr_effective >= nr_replicas)
-			return 0;
+			return ALLOC_SUCCESS;
 	}
 
-	return alloc_failure ? -ENOSPC : -EROFS;
+	return ret;
 }
 
 /* Allocate from stripes: */
 
-/*
- * XXX: use a higher watermark for allocating open buckets here:
- */
-static int ec_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h)
-{
-	struct bch_devs_mask devs;
-	struct open_bucket *ob;
-	unsigned i, nr_have = 0, nr_data =
-		min_t(unsigned, h->nr_active_devs,
-		      EC_STRIPE_MAX) - h->redundancy;
-	bool have_cache = true;
-	int ret = 0;
-
-	BUG_ON(h->blocks.nr > nr_data);
-	BUG_ON(h->parity.nr > h->redundancy);
-
-	devs = h->devs;
-
-	open_bucket_for_each(c, &h->parity, ob, i)
-		__clear_bit(ob->ptr.dev, devs.d);
-	open_bucket_for_each(c, &h->blocks, ob, i)
-		__clear_bit(ob->ptr.dev, devs.d);
-
-	percpu_down_read(&c->mark_lock);
-	rcu_read_lock();
-
-	if (h->parity.nr < h->redundancy) {
-		nr_have = h->parity.nr;
-
-		ret = bch2_bucket_alloc_set(c, &h->parity,
-					    &h->parity_stripe,
-					    &devs,
-					    h->redundancy,
-					    &nr_have,
-					    &have_cache,
-					    RESERVE_NONE,
-					    0,
-					    NULL);
-		if (ret)
-			goto err;
-	}
-
-	if (h->blocks.nr < nr_data) {
-		nr_have = h->blocks.nr;
-
-		ret = bch2_bucket_alloc_set(c, &h->blocks,
-					    &h->block_stripe,
-					    &devs,
-					    nr_data,
-					    &nr_have,
-					    &have_cache,
-					    RESERVE_NONE,
-					    0,
-					    NULL);
-		if (ret)
-			goto err;
-	}
-
-	rcu_read_unlock();
-	percpu_up_read(&c->mark_lock);
-
-	return bch2_ec_stripe_new_alloc(c, h);
-err:
-	rcu_read_unlock();
-	percpu_up_read(&c->mark_lock);
-	return -1;
-}
-
 /*
  * if we can't allocate a new stripe because there are already too many
  * partially filled stripes, force allocating from an existing stripe even when
@@ -555,34 +486,30 @@ static void bucket_alloc_from_stripe(struct bch_fs *c,
 	if (ec_open_bucket(c, ptrs))
 		return;
 
-	h = bch2_ec_stripe_head_get(c, target, erasure_code, nr_replicas - 1);
+	h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1);
 	if (!h)
 		return;
 
-	if (!h->s && ec_stripe_alloc(c, h))
-		goto out_put_head;
-
-	rcu_read_lock();
 	devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc);
-	rcu_read_unlock();
 
 	for (i = 0; i < devs_sorted.nr; i++)
 		open_bucket_for_each(c, &h->s->blocks, ob, ec_idx)
 			if (ob->ptr.dev == devs_sorted.devs[i] &&
-			    !test_and_set_bit(ec_idx, h->s->blocks_allocated))
+			    !test_and_set_bit(h->s->data_block_idx[ec_idx],
+					      h->s->blocks_allocated))
 				goto got_bucket;
 	goto out_put_head;
got_bucket:
 	ca = bch_dev_bkey_exists(c, ob->ptr.dev);
 
-	ob->ec_idx	= ec_idx;
+	ob->ec_idx	= h->s->data_block_idx[ec_idx];
 	ob->ec		= h->s;
 
 	add_new_bucket(c, ptrs, devs_may_alloc,
 		       nr_effective, have_cache, flags, ob);
 	atomic_inc(&h->s->pin);
out_put_head:
-	bch2_ec_stripe_head_put(h);
+	bch2_ec_stripe_head_put(c, h);
 }
 
 /* Sector allocator */
@@ -607,7 +534,7 @@ static void get_buckets_from_writepoint(struct bch_fs *c,
 		if (*nr_effective < nr_replicas &&
 		    test_bit(ob->ptr.dev, devs_may_alloc->d) &&
 		    (ca->mi.durability ||
-		     (wp->type == BCH_DATA_USER && !*have_cache)) &&
+		     (wp->type == BCH_DATA_user && !*have_cache)) &&
 		    (ob->ec || !need_ec)) {
 			add_new_bucket(c, ptrs, devs_may_alloc,
 				       nr_effective, have_cache,
@@ -619,24 +546,25 @@ static void get_buckets_from_writepoint(struct bch_fs *c,
 	wp->ptrs = ptrs_skip;
 }
 
-static int open_bucket_add_buckets(struct bch_fs *c,
-				   struct open_buckets *ptrs,
-				   struct write_point *wp,
-				   struct bch_devs_list *devs_have,
-				   u16 target,
-				   unsigned erasure_code,
-				   unsigned nr_replicas,
-				   unsigned *nr_effective,
-				   bool *have_cache,
-				   enum alloc_reserve reserve,
-				   unsigned flags,
-				   struct closure *_cl)
+static enum bucket_alloc_ret
+open_bucket_add_buckets(struct bch_fs *c,
+			struct open_buckets *ptrs,
+			struct write_point *wp,
+			struct bch_devs_list *devs_have,
+			u16 target,
+			unsigned erasure_code,
+			unsigned nr_replicas,
+			unsigned *nr_effective,
+			bool *have_cache,
+			enum alloc_reserve reserve,
+			unsigned flags,
+			struct closure *_cl)
 {
 	struct bch_devs_mask devs;
 	struct open_bucket *ob;
 	struct closure *cl = NULL;
+	enum bucket_alloc_ret ret;
 	unsigned i;
-	int ret;
 
 	rcu_read_lock();
 	devs = target_rw_devs(c, wp->type, target);
@@ -650,18 +578,22 @@ static int open_bucket_add_buckets(struct bch_fs *c,
 		__clear_bit(ob->ptr.dev, devs.d);
 
 	if (erasure_code) {
-		get_buckets_from_writepoint(c, ptrs, wp, &devs,
-					    nr_replicas, nr_effective,
-					    have_cache, flags, true);
-		if (*nr_effective >= nr_replicas)
-			return 0;
+		if (!ec_open_bucket(c, ptrs)) {
+			get_buckets_from_writepoint(c, ptrs, wp, &devs,
+						    nr_replicas, nr_effective,
+						    have_cache, flags, true);
+			if (*nr_effective >= nr_replicas)
+				return 0;
+		}
 
-		bucket_alloc_from_stripe(c, ptrs, wp, &devs,
-					 target, erasure_code,
-					 nr_replicas, nr_effective,
-					 have_cache, flags);
-		if (*nr_effective >= nr_replicas)
-			return 0;
+		if (!ec_open_bucket(c, ptrs)) {
+			bucket_alloc_from_stripe(c, ptrs, wp, &devs,
+						 target, erasure_code,
+						 nr_replicas, nr_effective,
+						 have_cache, flags);
+			if (*nr_effective >= nr_replicas)
+				return 0;
+		}
 	}
 
 	get_buckets_from_writepoint(c, ptrs, wp, &devs,
@@ -681,7 +613,7 @@ retry_blocking:
 	ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs,
 				nr_replicas, nr_effective, have_cache,
 				reserve, flags, cl);
-	if (ret && ret != -EROFS && !cl && _cl) {
+	if (ret && ret != INSUFFICIENT_DEVICES && !cl && _cl) {
 		cl = _cl;
 		goto retry_blocking;
 	}
@@ -872,7 +804,8 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c,
 	unsigned nr_effective, write_points_nr;
 	unsigned ob_flags = 0;
 	bool have_cache;
-	int ret, i;
+	enum bucket_alloc_ret ret;
+	int i;
 
 	if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS))
 		ob_flags |= BUCKET_ALLOC_USE_DURABILITY;
@@ -886,11 +819,11 @@ retry:
 
 	wp = writepoint_find(c, write_point.v);
 
-	if (wp->type == BCH_DATA_USER)
+	if (wp->type == BCH_DATA_user)
 		ob_flags |= BUCKET_MAY_ALLOC_PARTIAL;
 
 	/* metadata may not allocate on cache devices: */
-	if (wp->type != BCH_DATA_USER)
+	if (wp->type != BCH_DATA_user)
 		have_cache = true;
 
 	if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) {
@@ -920,7 +853,7 @@ alloc_done:
 	if (erasure_code && !ec_open_bucket(c, &ptrs))
 		pr_debug("failed to get ec bucket: ret %u", ret);
 
-	if (ret == -EROFS &&
+	if (ret == INSUFFICIENT_DEVICES &&
 	    nr_effective >= nr_replicas_required)
 		ret = 0;
 
@@ -929,7 +862,7 @@ alloc_done:
 
 	/* Free buckets we didn't use: */
 	open_bucket_for_each(c, &wp->ptrs, ob, i)
-		open_bucket_free_unused(c, ob, wp->type == BCH_DATA_USER);
+		open_bucket_free_unused(c, wp, ob);
 
 	wp->ptrs = ptrs;
 
@@ -948,17 +881,24 @@ err:
 		if (ptrs.nr < ARRAY_SIZE(ptrs.v))
 			ob_push(c, &ptrs, ob);
 		else
-			open_bucket_free_unused(c, ob,
-					wp->type == BCH_DATA_USER);
+			open_bucket_free_unused(c, wp, ob);
 	wp->ptrs = ptrs;
 
 	mutex_unlock(&wp->lock);
 
-	if (ret == -ENOSPC &&
+	if (ret == FREELIST_EMPTY &&
 	    try_decrease_writepoints(c, write_points_nr))
 		goto retry;
 
-	return ERR_PTR(ret);
+	switch (ret) {
+	case OPEN_BUCKETS_EMPTY:
+	case FREELIST_EMPTY:
+		return cl ? ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC);
+	case INSUFFICIENT_DEVICES:
+		return ERR_PTR(-EROFS);
+	default:
+		BUG();
+	}
 }
 
 /*
@@ -980,7 +920,7 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp,
 		struct bch_extent_ptr tmp = ob->ptr;
 
 		tmp.cached = !ca->mi.durability &&
-			wp->type == BCH_DATA_USER;
+			wp->type == BCH_DATA_user;
 
 		tmp.offset += ca->mi.bucket_size - ob->sectors_free;
 		bch2_bkey_append_ptr(k, tmp);
@@ -1009,6 +949,13 @@ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp)
 	bch2_open_buckets_put(c, &ptrs);
 }
 
+static inline void writepoint_init(struct write_point *wp,
+				   enum bch_data_type type)
+{
+	mutex_init(&wp->lock);
+	wp->type = type;
+}
+
 void bch2_fs_allocator_foreground_init(struct bch_fs *c)
 {
 	struct open_bucket *ob;
@@ -1029,12 +976,13 @@ void bch2_fs_allocator_foreground_init(struct bch_fs *c)
 		c->open_buckets_freelist = ob - c->open_buckets;
 	}
 
-	writepoint_init(&c->btree_write_point, BCH_DATA_BTREE);
-	writepoint_init(&c->rebalance_write_point, BCH_DATA_USER);
+	writepoint_init(&c->btree_write_point, BCH_DATA_btree);
+	writepoint_init(&c->rebalance_write_point, BCH_DATA_user);
+	writepoint_init(&c->copygc_write_point, BCH_DATA_user);
 
 	for (wp = c->write_points;
 	     wp < c->write_points + c->write_points_nr; wp++) {
 		writepoint_init(wp, BCH_DATA_user);
 
 		wp->last_used	= sched_clock();
 		wp->write_point	= (unsigned long) wp;
diff --git a/libbcachefs/alloc_foreground.h b/libbcachefs/alloc_foreground.h
index 687f973..c658295 100644
--- a/libbcachefs/alloc_foreground.h
+++ b/libbcachefs/alloc_foreground.h
@@ -12,6 +12,13 @@ struct bch_dev;
 struct bch_fs;
 struct bch_devs_List;
 
+enum bucket_alloc_ret {
+	ALLOC_SUCCESS,
+	OPEN_BUCKETS_EMPTY,
+	FREELIST_EMPTY,	/* Allocator thread not keeping up */
+	INSUFFICIENT_DEVICES,
+};
+
 struct dev_alloc_list {
 	unsigned	nr;
 	u8		devs[BCH_SB_MEMBERS_MAX];
@@ -20,8 +27,7 @@ struct dev_alloc_list {
 struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *,
 					  struct dev_stripe_state *,
 					  struct bch_devs_mask *);
-void bch2_dev_stripe_increment(struct bch_fs *, struct bch_dev *,
-			       struct dev_stripe_state *);
+void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *);
 
 long bch2_bucket_alloc_new_fs(struct bch_dev *);
 
@@ -92,6 +98,12 @@ static inline void bch2_open_bucket_get(struct bch_fs *c,
 	}
 }
 
+enum bucket_alloc_ret
+bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *,
+		      struct dev_stripe_state *, struct bch_devs_mask *,
+		      unsigned, unsigned *, bool *, enum alloc_reserve,
+		      unsigned, struct closure *);
+
 struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
 					     unsigned, unsigned,
 					     struct write_point_specifier,
@@ -121,13 +133,6 @@ static inline struct write_point_specifier writepoint_ptr(struct write_point *wp
 	return (struct write_point_specifier) { .v = (unsigned long) wp };
 }
 
-static inline void writepoint_init(struct write_point *wp,
-				   enum bch_data_type type)
-{
-	mutex_init(&wp->lock);
-	wp->type = type;
-}
-
 void bch2_fs_allocator_foreground_init(struct bch_fs *);
 
 #endif /* _BCACHEFS_ALLOC_FOREGROUND_H */
diff --git a/libbcachefs/alloc_types.h b/libbcachefs/alloc_types.h
index 4f14650..2070546 100644
--- a/libbcachefs/alloc_types.h
+++ b/libbcachefs/alloc_types.h
@@ -66,6 +66,7 @@ struct open_bucket {
 	u8			type;
 	unsigned		valid:1;
 	unsigned		on_partial_list:1;
+	int			alloc_reserve:3;
 	unsigned		sectors_free;
 	struct bch_extent_ptr	ptr;
 	struct ec_stripe_new	*ec;
diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h
index 893c89d..3a5a00e 100644
--- a/libbcachefs/bcachefs.h
+++ b/libbcachefs/bcachefs.h
@@ -202,7 +202,8 @@
 #include "opts.h"
 #include "util.h"
 
-#include
+#define dynamic_fault(...)	0
+#define race_fault(...)		0
 
 #define bch2_fs_init_fault(name)					\
 	dynamic_fault("bcachefs:bch_fs_init:" name)
@@ -451,13 +452,6 @@ struct bch_dev {
 
 	alloc_heap		alloc_heap;
 
-	/* Copying GC: */
-	struct task_struct	*copygc_thread;
-	copygc_heap		copygc_heap;
-	struct bch_pd_controller copygc_pd;
-	struct write_point	copygc_write_point;
-	u64			copygc_threshold;
-
 	atomic64_t		rebalance_work;
 
 	struct journal_device	journal;
@@ -751,16 +745,27 @@ struct bch_fs {
 	/* REBALANCE */
 	struct bch_fs_rebalance	rebalance;
 
+	/* COPYGC */
+	struct task_struct	*copygc_thread;
+	copygc_heap		copygc_heap;
+	struct bch_pd_controller copygc_pd;
+	struct write_point	copygc_write_point;
+	u64			copygc_threshold;
+
 	/* STRIPES: */
 	GENRADIX(struct stripe) stripes[2];
-	struct mutex		ec_stripe_create_lock;
 
 	ec_stripes_heap		ec_stripes_heap;
 	spinlock_t		ec_stripes_heap_lock;
 
 	/* ERASURE CODING */
-	struct list_head	ec_new_stripe_list;
-	struct mutex		ec_new_stripe_lock;
+	struct list_head	ec_stripe_head_list;
+	struct mutex		ec_stripe_head_lock;
+
+	struct list_head	ec_stripe_new_list;
+	struct mutex		ec_stripe_new_lock;
+
+	struct work_struct	ec_stripe_create_work;
 	u64			ec_stripe_hint;
 
 	struct bio_set		ec_bioset;
diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h
index f808e63..d5a2230 100644
--- a/libbcachefs/bcachefs_format.h
+++ b/libbcachefs/bcachefs_format.h
@@ -1026,14 +1026,19 @@ LE64_BITMASK(BCH_KDF_SCRYPT_P,	struct bch_sb_field_crypt, kdf_flags, 32, 48);
 
 /* BCH_SB_FIELD_replicas: */
 
+#define BCH_DATA_TYPES()	\
+	x(none,		0)	\
+	x(sb,		1)	\
+	x(journal,	2)	\
+	x(btree,	3)	\
+	x(user,		4)	\
+	x(cached,	5)
+
 enum bch_data_type {
-	BCH_DATA_NONE		= 0,
-	BCH_DATA_SB		= 1,
-	BCH_DATA_JOURNAL	= 2,
-	BCH_DATA_BTREE		= 3,
-	BCH_DATA_USER		= 4,
-	BCH_DATA_CACHED		= 5,
-	BCH_DATA_NR		= 6,
+#define x(t, n) BCH_DATA_##t,
+	BCH_DATA_TYPES()
+#undef x
+	BCH_DATA_NR
 };
 
 struct bch_replicas_entry_v0 {
diff --git a/libbcachefs/bset.c b/libbcachefs/bset.c
index 6fc91e6..f7c2841 100644
--- a/libbcachefs/bset.c
+++ b/libbcachefs/bset.c
@@ -313,44 +313,6 @@ struct rw_aux_tree {
 	struct bpos	k;
 };
 
-/*
- * BSET_CACHELINE was originally intended to match the hardware cacheline size -
- * it used to be 64, but I realized the lookup code would touch slightly less
- * memory if it was 128.
- *
- * It definites the number of bytes (in struct bset) per struct bkey_float in
- * the auxiliar search tree - when we're done searching the bset_float tree we
- * have this many bytes left that we do a linear search over.
- *
- * Since (after level 5) every level of the bset_tree is on a new cacheline,
- * we're touching one fewer cacheline in the bset tree in exchange for one more
- * cacheline in the linear search - but the linear search might stop before it
- * gets to the second cacheline.
- */
-
-#define BSET_CACHELINE		128
-
-/* Space required for the btree node keys */
-static inline size_t btree_keys_bytes(struct btree *b)
-{
-	return PAGE_SIZE << b->page_order;
-}
-
-static inline size_t btree_keys_cachelines(struct btree *b)
-{
-	return btree_keys_bytes(b) / BSET_CACHELINE;
-}
-
-static inline size_t btree_aux_data_bytes(struct btree *b)
-{
-	return btree_keys_cachelines(b) * 8;
-}
-
-static inline size_t btree_aux_data_u64s(struct btree *b)
-{
-	return btree_aux_data_bytes(b) / sizeof(u64);
-}
-
 static unsigned bset_aux_tree_buf_end(const struct bset_tree *t)
 {
 	BUG_ON(t->aux_data_offset == U16_MAX);
@@ -426,29 +388,6 @@ static void bset_aux_tree_verify(struct btree *b)
 #endif
 }
 
-/* Memory allocation */
-
-void bch2_btree_keys_free(struct btree *b)
-{
-	vfree(b->aux_data);
-	b->aux_data = NULL;
-}
-
-#ifndef PAGE_KERNEL_EXEC
-# define PAGE_KERNEL_EXEC PAGE_KERNEL
-#endif
-
-int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp)
-{
-	b->page_order	= page_order;
-	b->aux_data	= __vmalloc(btree_aux_data_bytes(b), gfp,
-				    PAGE_KERNEL_EXEC);
-	if (!b->aux_data)
-		return -ENOMEM;
-
-	return 0;
-}
-
 void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks)
 {
 	unsigned i;
diff --git a/libbcachefs/bset.h b/libbcachefs/bset.h
index 652ffed..5921cf6 100644
--- a/libbcachefs/bset.h
+++ b/libbcachefs/bset.h
@@ -184,6 +184,38 @@ static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree
 	}
 }
 
+/*
+ * BSET_CACHELINE was originally intended to match the hardware cacheline size -
+ * it used to be 64, but I realized the lookup code would touch slightly less
+ * memory if it was 128.
+ *
+ * It definites the number of bytes (in struct bset) per struct bkey_float in
+ * the auxiliar search tree - when we're done searching the bset_float tree we
+ * have this many bytes left that we do a linear search over.
+ *
+ * Since (after level 5) every level of the bset_tree is on a new cacheline,
+ * we're touching one fewer cacheline in the bset tree in exchange for one more
+ * cacheline in the linear search - but the linear search might stop before it
+ */ + +#define BSET_CACHELINE 128 + +static inline size_t btree_keys_cachelines(struct btree *b) +{ + return (1U << b->byte_order) / BSET_CACHELINE; +} + +static inline size_t btree_aux_data_bytes(struct btree *b) +{ + return btree_keys_cachelines(b) * 8; +} + +static inline size_t btree_aux_data_u64s(struct btree *b) +{ + return btree_aux_data_bytes(b) / sizeof(u64); +} + typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); static inline void @@ -334,8 +366,6 @@ static inline struct bset *bset_next_set(struct btree *b, return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); } -void bch2_btree_keys_free(struct btree *); -int bch2_btree_keys_alloc(struct btree *, unsigned, gfp_t); void bch2_btree_keys_init(struct btree *, bool *); void bch2_bset_init_first(struct btree *, struct bset *); diff --git a/libbcachefs/btree_cache.c b/libbcachefs/btree_cache.c index d3addd3..7366711 100644 --- a/libbcachefs/btree_cache.c +++ b/libbcachefs/btree_cache.c @@ -44,7 +44,8 @@ static void __btree_node_data_free(struct bch_fs *c, struct btree *b) kvpfree(b->data, btree_bytes(c)); b->data = NULL; - bch2_btree_keys_free(b); + vfree(b->aux_data); + b->aux_data = NULL; } static void btree_node_data_free(struct bch_fs *c, struct btree *b) @@ -72,7 +73,7 @@ static const struct rhashtable_params bch_btree_cache_params = { .obj_cmpfn = bch2_btree_cache_cmp_fn, }; -static int __btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) +static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) { BUG_ON(b->data || b->aux_data); @@ -80,7 +81,8 @@ static int __btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) if (!b->data) return -ENOMEM; - if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp)) { + b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp); + if (!b->aux_data) { kvpfree(b->data, btree_bytes(c)); b->data = NULL; return -ENOMEM; @@ -89,21 +91,9 @@ static int __btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) return 0; } -static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) +static struct btree *__btree_node_mem_alloc(struct bch_fs *c) { - struct btree_cache *bc = &c->btree_cache; - - if (!__btree_node_data_alloc(c, b, gfp)) { - bc->used++; - list_move(&b->list, &bc->freeable); - } else { - list_move(&b->list, &bc->freed); - } -} - -static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) -{ - struct btree *b = kzalloc(sizeof(struct btree), gfp); + struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL); if (!b) return NULL; @@ -111,9 +101,25 @@ static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) six_lock_init(&b->c.lock); INIT_LIST_HEAD(&b->list); INIT_LIST_HEAD(&b->write_blocked); + b->byte_order = ilog2(btree_bytes(c)); + return b; +} + +static struct btree *btree_node_mem_alloc(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b = __btree_node_mem_alloc(c); + if (!b) + return NULL; - btree_node_data_alloc(c, b, gfp); - return b->data ? 
b : NULL; + if (btree_node_data_alloc(c, b, GFP_KERNEL)) { + kfree(b); + return NULL; + } + + bc->used++; + list_add(&b->list, &bc->freeable); + return b; } /* Btree in memory cache - hash table */ @@ -124,6 +130,8 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) /* Cause future lookups for this node to fail: */ b->hash_val = 0; + + six_lock_wakeup_all(&b->c.lock); } int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) @@ -402,7 +410,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) bch2_recalc_btree_reserve(c); for (i = 0; i < bc->reserve; i++) - if (!btree_node_mem_alloc(c, GFP_KERNEL)) { + if (!btree_node_mem_alloc(c)) { ret = -ENOMEM; goto out; } @@ -418,7 +426,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) goto out; } - c->verify_data = btree_node_mem_alloc(c, GFP_KERNEL); + c->verify_data = btree_node_mem_alloc(c); if (!c->verify_data) { ret = -ENOMEM; goto out; @@ -550,21 +558,16 @@ got_node: mutex_unlock(&bc->lock); if (!b) { - b = kzalloc(sizeof(struct btree), GFP_KERNEL); + b = __btree_node_mem_alloc(c); if (!b) goto err; - bkey_btree_ptr_init(&b->key); - six_lock_init(&b->c.lock); - INIT_LIST_HEAD(&b->list); - INIT_LIST_HEAD(&b->write_blocked); - BUG_ON(!six_trylock_intent(&b->c.lock)); BUG_ON(!six_trylock_write(&b->c.lock)); } if (!b->data) { - if (__btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) + if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) goto err; mutex_lock(&bc->lock); diff --git a/libbcachefs/btree_cache.h b/libbcachefs/btree_cache.h index 2160012..d0d3a85 100644 --- a/libbcachefs/btree_cache.h +++ b/libbcachefs/btree_cache.h @@ -79,14 +79,9 @@ static inline size_t btree_max_u64s(struct bch_fs *c) return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); } -static inline size_t btree_page_order(struct bch_fs *c) -{ - return get_order(btree_bytes(c)); -} - static inline size_t btree_pages(struct bch_fs *c) { - return 1 << btree_page_order(c); + return btree_bytes(c) / PAGE_SIZE; } static inline unsigned btree_blocks(struct bch_fs *c) diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index 8771ef1..4f58113 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -109,7 +109,7 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, atomic64_set(&c->key_version, k.k->version.lo); if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_bkey_replicas_marked(c, k, false), c, + fsck_err_on(!bch2_bkey_replicas_marked(c, k), c, "superblock not marked as containing replicas (type %u)", k.k->type)) { ret = bch2_mark_bkey_replicas(c, k); @@ -433,16 +433,16 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, if (offset == BCH_SB_SECTOR) mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, - BCH_DATA_SB, flags); + BCH_DATA_sb, flags); mark_metadata_sectors(c, ca, offset, offset + (1 << layout->sb_max_size_bits), - BCH_DATA_SB, flags); + BCH_DATA_sb, flags); } for (i = 0; i < ca->journal.nr; i++) { b = ca->journal.buckets[i]; - bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL, + bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, ca->mi.bucket_size, gc_phase(GC_PHASE_SB), flags); } @@ -617,8 +617,11 @@ static int bch2_gc_done(struct bch_fs *c, copy_stripe_field(block_sectors[i], "block_sectors[%u]", i); - if (dst->alive) + if (dst->alive) { + spin_lock(&c->ec_stripes_heap_lock); bch2_stripes_heap_insert(c, dst, dst_iter.pos); + spin_unlock(&c->ec_stripes_heap_lock); + } genradix_iter_advance(&dst_iter, &c->stripes[0]); 
 		genradix_iter_advance(&src_iter, &c->stripes[1]);
@@ -673,8 +676,8 @@ static int bch2_gc_done(struct bch_fs *c,
 			char buf[80];
 
 			if (metadata_only &&
-			    (e->data_type == BCH_DATA_USER ||
-			     e->data_type == BCH_DATA_CACHED))
+			    (e->data_type == BCH_DATA_user ||
+			     e->data_type == BCH_DATA_cached))
 				continue;
 
 			bch2_replicas_entry_to_text(&PBUF(buf), e);
@@ -759,8 +762,8 @@ static int bch2_gc_start(struct bch_fs *c,
 			d->gen_valid = s->gen_valid;
 
 			if (metadata_only &&
-			    (s->mark.data_type == BCH_DATA_USER ||
-			     s->mark.data_type == BCH_DATA_CACHED)) {
+			    (s->mark.data_type == BCH_DATA_user ||
+			     s->mark.data_type == BCH_DATA_cached)) {
 				d->_mark = s->mark;
 				d->_mark.owned_by_allocator = 0;
 			}
@@ -949,8 +952,10 @@ int bch2_gc_gens(struct bch_fs *c)
 	for (i = 0; i < BTREE_ID_NR; i++)
 		if (btree_node_type_needs_gc(i)) {
 			ret = bch2_gc_btree_gens(c, i);
-			if (ret)
+			if (ret) {
+				bch_err(c, "error recalculating oldest_gen: %i", ret);
 				goto err;
+			}
 		}
 
 	for_each_member_device(ca, c, i) {
@@ -961,6 +966,8 @@ int bch2_gc_gens(struct bch_fs *c)
 			g->oldest_gen = g->gc_gen;
 		up_read(&ca->bucket_lock);
 	}
+
+	c->gc_count++;
err:
 	up_read(&c->gc_lock);
 	return ret;
diff --git a/libbcachefs/btree_io.c b/libbcachefs/btree_io.c
index bb3aecc..2f50972 100644
--- a/libbcachefs/btree_io.c
+++ b/libbcachefs/btree_io.c
@@ -57,25 +57,25 @@ static void set_needs_whiteout(struct bset *i, int v)
 		k->needs_whiteout = v;
 }
 
-static void btree_bounce_free(struct bch_fs *c, unsigned order,
+static void btree_bounce_free(struct bch_fs *c, size_t size,
 			      bool used_mempool, void *p)
 {
 	if (used_mempool)
 		mempool_free(p, &c->btree_bounce_pool);
 	else
-		vpfree(p, PAGE_SIZE << order);
+		vpfree(p, size);
 }
 
-static void *btree_bounce_alloc(struct bch_fs *c, unsigned order,
+static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
 				bool *used_mempool)
 {
 	unsigned flags = memalloc_nofs_save();
 	void *p;
 
-	BUG_ON(order > btree_page_order(c));
+	BUG_ON(size > btree_bytes(c));
 
 	*used_mempool = false;
-	p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order);
+	p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
 	if (!p) {
 		*used_mempool = true;
 		p = mempool_alloc(&c->btree_bounce_pool, GFP_NOIO);
@@ -125,16 +125,14 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
 {
 	struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k;
 	bool used_mempool = false;
-	unsigned order;
+	size_t bytes = b->whiteout_u64s * sizeof(u64);
 
 	if (!b->whiteout_u64s)
 		return;
 
-	order = get_order(b->whiteout_u64s * sizeof(u64));
+	new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool);
 
-	new_whiteouts = btree_bounce_alloc(c, order, &used_mempool);
-
-	ptrs = ptrs_end = ((void *) new_whiteouts + (PAGE_SIZE << order));
+	ptrs = ptrs_end = ((void *) new_whiteouts + bytes);
 
 	for (k = unwritten_whiteouts_start(c, b);
 	     k != unwritten_whiteouts_end(c, b);
@@ -158,7 +156,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
 	memcpy_u64s(unwritten_whiteouts_start(c, b),
 		    new_whiteouts, b->whiteout_u64s);
 
-	btree_bounce_free(c, order, used_mempool, new_whiteouts);
+	btree_bounce_free(c, bytes, used_mempool, new_whiteouts);
 }
 
 static bool should_compact_bset(struct btree *b, struct bset_tree *t,
@@ -187,7 +185,7 @@ static bool bch2_compact_extent_whiteouts(struct bch_fs *c,
 	struct bkey_packed *whiteouts = NULL;
 	struct bkey_packed *u_start, *u_pos;
 	struct sort_iter sort_iter;
-	unsigned order, whiteout_u64s = 0, u64s;
+	unsigned bytes, whiteout_u64s = 0, u64s;
 	bool used_mempool, compacting = false;
 
 	BUG_ON(!btree_node_is_extents(b));
@@ -204,9 +202,9 @@ static bool bch2_compact_extent_whiteouts(struct bch_fs *c,
 	sort_iter_init(&sort_iter, b);
 
 	whiteout_u64s += b->whiteout_u64s;
-	order = get_order(whiteout_u64s * sizeof(u64));
+	bytes = whiteout_u64s * sizeof(u64);
 
-	whiteouts = btree_bounce_alloc(c, order, &used_mempool);
+	whiteouts = btree_bounce_alloc(c, bytes, &used_mempool);
 	u_start = u_pos = whiteouts;
 
 	memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b),
@@ -306,7 +304,7 @@ static bool bch2_compact_extent_whiteouts(struct bch_fs *c,
 			unwritten_whiteouts_end(c, b),
 			true);
 
-	btree_bounce_free(c, order, used_mempool, whiteouts);
+	btree_bounce_free(c, bytes, used_mempool, whiteouts);
 
 	bch2_btree_build_aux_trees(b);
 
@@ -401,7 +399,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
 	struct bset *start_bset = bset(b, &b->set[start_idx]);
 	bool used_mempool = false;
 	u64 start_time, seq = 0;
-	unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1;
+	unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1;
 	bool sorting_entire_node = start_idx == 0 &&
 		end_idx == b->nsets;
 
@@ -416,11 +414,11 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
 				btree_bkey_last(b, t));
 	}
 
-	order = sorting_entire_node
-		? btree_page_order(c)
-		: get_order(__vstruct_bytes(struct btree_node, u64s));
+	bytes = sorting_entire_node
+		? btree_bytes(c)
+		: __vstruct_bytes(struct btree_node, u64s);
 
-	out = btree_bounce_alloc(c, order, &used_mempool);
+	out = btree_bounce_alloc(c, bytes, &used_mempool);
 
 	start_time = local_clock();
 
@@ -435,7 +433,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
 
 	out->keys.u64s = cpu_to_le16(u64s);
 
-	BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order));
+	BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes);
 
 	if (sorting_entire_node)
 		bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort],
@@ -449,7 +447,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
 	if (sorting_entire_node) {
 		unsigned u64s = le16_to_cpu(out->keys.u64s);
 
-		BUG_ON(order != btree_page_order(c));
+		BUG_ON(bytes != btree_bytes(c));
 
 		/*
 		 * Our temporary buffer is the same size as the btree node's
@@ -484,7 +482,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
 	set_btree_bset_end(b, &b->set[start_idx]);
 	bch2_bset_set_no_aux_tree(b, &b->set[start_idx]);
 
-	btree_bounce_free(c, order, used_mempool, out);
+	btree_bounce_free(c, bytes, used_mempool, out);
 
 	bch2_verify_btree_nr_keys(b);
 }
@@ -599,34 +597,6 @@ void bch2_btree_init_next(struct bch_fs *c, struct btree *b,
 		bch2_btree_iter_reinit_node(iter, b);
 }
 
-static struct nonce btree_nonce(struct bset *i, unsigned offset)
-{
-	return (struct nonce) {{
-		[0] = cpu_to_le32(offset),
-		[1] = ((__le32 *) &i->seq)[0],
-		[2] = ((__le32 *) &i->seq)[1],
-		[3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE,
-	}};
-}
-
-static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
-{
-	struct nonce nonce = btree_nonce(i, offset);
-
-	if (!offset) {
-		struct btree_node *bn = container_of(i, struct btree_node, keys);
-		unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
-
-		bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
-			     bytes);
-
-		nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
-	}
-
-	bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
-		     vstruct_end(i) - (void *) i->_data);
-}
-
 static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
 			  struct btree *b, struct bset *i,
 			  unsigned offset, int write)
@@ -917,6 +887,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
 	struct sort_iter *iter;
 	struct btree_node *sorted;
 	struct bkey_packed *k;
+	struct bch_extent_ptr *ptr;
 	struct bset *i;
 	bool used_mempool, blacklisted;
 	unsigned u64s;
@@ -971,8 +942,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
 			bset_encrypt(c, i, b->written << 9);
 
 			if (btree_node_is_extents(b) &&
-			    !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data))
+			    !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) {
 				set_btree_node_old_extent_overwrite(b);
+				set_btree_node_need_rewrite(b);
+			}
 
 			sectors = vstruct_sectors(b->data, c->block_bits);
 		} else {
@@ -1040,7 +1013,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
 		     BTREE_ERR_WANT_RETRY, c, b, NULL,
 		     "found bset signature after last bset");
 
-	sorted = btree_bounce_alloc(c, btree_page_order(c), &used_mempool);
+	sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
 	sorted->keys.u64s = 0;
 
 	set_btree_bset(b, b->set, &b->data->keys);
@@ -1058,7 +1031,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
 
 	BUG_ON(b->nr.live_u64s != u64s);
 
-	btree_bounce_free(c, btree_page_order(c), used_mempool, sorted);
+	btree_bounce_free(c, btree_bytes(c), used_mempool, sorted);
 
 	i = &b->data->keys;
 	for (k = i->start; k != vstruct_last(i);) {
@@ -1098,6 +1071,13 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry
 
 	set_needs_whiteout(btree_bset_first(b), true);
 
 	btree_node_reset_sib_u64s(b);
+
+	bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) {
+		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+
+		if (ca->mi.state != BCH_MEMBER_STATE_RW)
+			set_btree_node_need_rewrite(b);
+	}
out:
 	mempool_free(iter, &c->fill_iter);
 	return retry_read;
@@ -1139,7 +1119,8 @@ static void btree_node_read_work(struct work_struct *work)
 			bio->bi_status = BLK_STS_REMOVED;
 		}
start:
-		bch2_dev_io_err_on(bio->bi_status, ca, "btree read");
+		bch2_dev_io_err_on(bio->bi_status, ca, "btree read: %s",
+				   bch2_blk_status_to_str(bio->bi_status));
 		if (rb->have_ioref)
 			percpu_ref_put(&ca->io_ref);
 		rb->have_ioref = false;
@@ -1220,7 +1201,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
 	set_btree_node_read_in_flight(b);
 
 	if (rb->have_ioref) {
-		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_BTREE],
+		this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
 			     bio_sectors(bio));
 		bio_set_dev(bio, ca->disk_sb.bdev);
@@ -1392,7 +1373,7 @@ static void btree_node_write_work(struct work_struct *work)
 	struct btree *b		= wbio->wbio.bio.bi_private;
 
 	btree_bounce_free(c,
-		wbio->wbio.order,
+		wbio->bytes,
 		wbio->wbio.used_mempool,
 		wbio->data);
 
@@ -1423,8 +1404,8 @@ static void btree_node_write_endio(struct bio *bio)
 	if (wbio->have_ioref)
 		bch2_latency_acct(ca, wbio->submit_time, WRITE);
 
-	if (bio->bi_status == BLK_STS_REMOVED ||
-	    bch2_dev_io_err_on(bio->bi_status, ca, "btree write") ||
+	if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write: %s",
+			       bch2_blk_status_to_str(bio->bi_status)) ||
 	    bch2_meta_write_fault("btree")) {
 		spin_lock_irqsave(&c->btree_write_error_lock, flags);
 		bch2_dev_list_add_dev(&orig->failed, wbio->dev);
@@ -1475,7 +1456,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 	struct bch_extent_ptr *ptr;
 	struct sort_iter sort_iter;
 	struct nonce nonce;
-	unsigned bytes_to_write, sectors_to_write, order, bytes, u64s;
+	unsigned bytes_to_write, sectors_to_write, bytes, u64s;
 	u64 seq = 0;
 	bool used_mempool;
 	unsigned long old, new;
@@ -1545,8 +1526,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 		seq = max(seq, le64_to_cpu(i->journal_seq));
 	}
 
-	order = get_order(bytes);
-	data = btree_bounce_alloc(c, order, &used_mempool);
+	data = btree_bounce_alloc(c, bytes, &used_mempool);
 
 	if (!b->written) {
 		bn = data;
@@ -1658,7 +1638,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 			       struct btree_write_bio, wbio.bio);
 	wbio_init(&wbio->wbio.bio);
 	wbio->data			= data;
-	wbio->wbio.order		= order;
+	wbio->bytes			= bytes;
 	wbio->wbio.used_mempool		= used_mempool;
 	wbio->wbio.bio.bi_opf		= REQ_OP_WRITE|REQ_META;
 	wbio->wbio.bio.bi_end_io	= btree_node_write_endio;
@@ -1689,13 +1669,13 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b,
 	b->written += sectors_to_write;
 
 	/* XXX: submitting IO with btree locks held: */
-	bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_BTREE, &k.key);
+	bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, &k.key);
 	return;
err:
 	set_btree_node_noevict(b);
 	b->written += sectors_to_write;
nowrite:
-	btree_bounce_free(c, order, used_mempool, data);
+	btree_bounce_free(c, bytes, used_mempool, data);
 	btree_node_write_done(c, b);
 }
 
@@ -1826,9 +1806,8 @@ void bch2_btree_verify_flushed(struct bch_fs *c)
 	rcu_read_unlock();
 }
 
-ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
+void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c)
 {
-	struct printbuf out = _PBUF(buf, PAGE_SIZE);
 	struct bucket_table *tbl;
 	struct rhash_head *pos;
 	struct btree *b;
@@ -1841,7 +1820,7 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
 		if (!(flags & (1 << BTREE_NODE_dirty)))
 			continue;
 
-		pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu\n",
+		pr_buf(out, "%p d %u n %u l %u w %u b %u r %u:%lu\n",
 		       b,
 		       (flags & (1 << BTREE_NODE_dirty)) != 0,
 		       (flags & (1 << BTREE_NODE_need_write)) != 0,
@@ -1852,6 +1831,4 @@ ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf)
 		       b->will_make_reachable & 1);
 	}
 	rcu_read_unlock();
-
-	return out.pos - buf;
 }
diff --git a/libbcachefs/btree_io.h b/libbcachefs/btree_io.h
index f3d7ec7..626d0f0 100644
--- a/libbcachefs/btree_io.h
+++ b/libbcachefs/btree_io.h
@@ -5,6 +5,7 @@
 #include "bkey_methods.h"
 #include "bset.h"
 #include "btree_locking.h"
+#include "checksum.h"
 #include "extents.h"
 #include "io_types.h"
 
@@ -23,8 +24,9 @@ struct btree_read_bio {
 };
 
 struct btree_write_bio {
-	void			*data;
 	struct work_struct	work;
+	void			*data;
+	unsigned		bytes;
 	struct bch_write_bio	wbio;
 };
 
@@ -81,6 +83,34 @@ static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *
 	return false;
 }
 
+static inline struct nonce btree_nonce(struct bset *i, unsigned offset)
+{
+	return (struct nonce) {{
+		[0] = cpu_to_le32(offset),
+		[1] = ((__le32 *) &i->seq)[0],
+		[2] = ((__le32 *) &i->seq)[1],
+		[3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE,
+	}};
+}
+
+static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset)
+{
+	struct nonce nonce = btree_nonce(i, offset);
+
+	if (!offset) {
+		struct btree_node *bn = container_of(i, struct btree_node, keys);
+		unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
+
+		bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags,
+			     bytes);
+
+		nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE));
+	}
+
+	bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data,
+		     vstruct_end(i) - (void *) i->_data);
+}
+
 void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *);
 
 void bch2_btree_build_aux_trees(struct btree *);
@@ -139,7 +169,7 @@ do {									\
 
 void bch2_btree_flush_all_reads(struct bch_fs *);
 void bch2_btree_flush_all_writes(struct bch_fs *);
 void bch2_btree_verify_flushed(struct bch_fs *);
-ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *);
+void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *);
 
 static inline void compat_bformat(unsigned level, enum btree_id btree_id,
 				  unsigned version, unsigned big_endian,
diff --git a/libbcachefs/btree_key_cache.c b/libbcachefs/btree_key_cache.c
index d73cc8d..6166275 100644
--- a/libbcachefs/btree_key_cache.c
+++ b/libbcachefs/btree_key_cache.c
@@ -391,7 +391,7 @@ static void btree_key_cache_journal_flush(struct journal *j,
 	struct btree_trans trans;
 
 	six_lock_read(&ck->c.lock, NULL, NULL);
-	key = READ_ONCE(ck->key);
+	key = ck->key;
 
 	if (ck->journal.seq != seq ||
 	    !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h
index 16c4d05..683b416 100644
--- a/libbcachefs/btree_types.h
+++ b/libbcachefs/btree_types.h
@@ -94,7 +94,7 @@ struct btree {
 	struct btree_nr_keys	nr;
 	u16			sib_u64s[2];
 	u16			whiteout_u64s;
-	u8			page_order;
+	u8			byte_order;
 	u8			unpack_fn_len;
 
 	/*
@@ -409,6 +409,7 @@ enum btree_flags {
 	BTREE_NODE_dying,
 	BTREE_NODE_fake,
 	BTREE_NODE_old_extent_overwrite,
+	BTREE_NODE_need_rewrite,
 };
 
 BTREE_FLAG(read_in_flight);
@@ -423,6 +424,7 @@ BTREE_FLAG(just_written);
 BTREE_FLAG(dying);
 BTREE_FLAG(fake);
 BTREE_FLAG(old_extent_overwrite);
+BTREE_FLAG(need_rewrite);
 
 static inline struct btree_write *btree_current_write(struct btree *b)
 {
@@ -593,7 +595,6 @@ static inline bool btree_iter_is_extents(struct btree_iter *iter)
 
 enum btree_trigger_flags {
 	__BTREE_TRIGGER_NORUN,		/* Don't run triggers at all */
-	__BTREE_TRIGGER_NOOVERWRITES,	/* Don't run triggers on overwrites */
 
 	__BTREE_TRIGGER_INSERT,
 	__BTREE_TRIGGER_OVERWRITE,
@@ -606,7 +607,6 @@ enum btree_trigger_flags {
 };
 
 #define BTREE_TRIGGER_NORUN		(1U << __BTREE_TRIGGER_NORUN)
-#define BTREE_TRIGGER_NOOVERWRITES	(1U << __BTREE_TRIGGER_NOOVERWRITES)
 #define BTREE_TRIGGER_INSERT		(1U << __BTREE_TRIGGER_INSERT)
 #define BTREE_TRIGGER_OVERWRITE		(1U << __BTREE_TRIGGER_OVERWRITE)
diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c
index a8cd6ff..a2604b0 100644
--- a/libbcachefs/btree_update_interior.c
+++ b/libbcachefs/btree_update_interior.c
@@ -26,7 +26,7 @@
 /*
  * Verify that child nodes correctly span parent node's range:
  */
-static void btree_node_interior_verify(struct btree *b)
+static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
 {
 #ifdef CONFIG_BCACHEFS_DEBUG
 	struct bpos next_node = b->data->min_key;
@@ -37,6 +37,9 @@ static void btree_node_interior_verify(struct btree *b)
 
 	BUG_ON(!b->c.level);
 
+	if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags))
+		return;
+
 	bch2_btree_node_iter_init_from_start(&iter, b);
 
 	while (1) {
@@ -135,8 +138,6 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b)
 
 	bch2_btree_node_hash_remove(&c->btree_cache, b);
 
-	six_lock_wakeup_all(&b->c.lock);
-
 	mutex_lock(&c->btree_cache.lock);
 	list_move(&b->list, &c->btree_cache.freeable);
 	mutex_unlock(&c->btree_cache.lock);
@@ -290,8 +291,10 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned lev
 		SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
 
 	if (btree_node_is_extents(b) &&
-	    !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data))
+	    !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) {
 		set_btree_node_old_extent_overwrite(b);
+		set_btree_node_need_rewrite(b);
+	}
 
 	bch2_btree_build_aux_trees(b);
 
@@ -1118,8 +1121,8 @@ static struct btree *__btree_split_node(struct btree_update *as,
 	bch2_verify_btree_nr_keys(n2);
 
 	if (n1->c.level) {
-		btree_node_interior_verify(n1);
-		btree_node_interior_verify(n2);
+		btree_node_interior_verify(as->c, n1);
+		btree_node_interior_verify(as->c, n2);
 	}
 
 	return n2;
@@ -1178,7 +1181,7 @@ static void btree_split_insert_keys(struct btree_update *as, struct btree *b,
 	BUG_ON(b->nsets != 1 ||
 	       b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));
 
-	btree_node_interior_verify(b);
+	btree_node_interior_verify(as->c, b);
 }
 
 static void btree_split(struct btree_update *as, struct btree *b,
@@ -1376,7 +1379,7 @@ void bch2_btree_insert_node(struct btree_update *as, struct btree *b,
 
 	bch2_btree_node_unlock_write(b, iter);
 
-	btree_node_interior_verify(b);
+	btree_node_interior_verify(c, b);
 
 	/*
 	 * when called from the btree_split path the new nodes aren't added to
@@ -1864,7 +1867,7 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
 
 		new_hash = bch2_btree_node_mem_alloc(c);
 	}
-
+retry:
 	as = bch2_btree_update_start(iter->trans, iter->btree_id,
 		parent ? btree_update_reserve_required(c, parent) : 0,
 		BTREE_INSERT_NOFAIL|
@@ -1877,16 +1880,17 @@ int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter,
 		if (ret == -EAGAIN)
 			ret = -EINTR;
 
-		if (ret != -EINTR)
-			goto err;
+		if (ret == -EINTR) {
+			bch2_trans_unlock(iter->trans);
+			up_read(&c->gc_lock);
+			closure_sync(&cl);
+			down_read(&c->gc_lock);
 
-		bch2_trans_unlock(iter->trans);
-		up_read(&c->gc_lock);
-		closure_sync(&cl);
-		down_read(&c->gc_lock);
+			if (bch2_trans_relock(iter->trans))
+				goto retry;
+		}
 
-		if (!bch2_trans_relock(iter->trans))
-			goto err;
+		goto err;
 	}
 
 	ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key));
@@ -1943,6 +1947,7 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
 	bch2_btree_cache_cannibalize_unlock(c);
 
 	set_btree_node_fake(b);
+	set_btree_node_need_rewrite(b);
 	b->c.level	= 0;
 	b->c.btree_id	= id;
 
@@ -1969,22 +1974,19 @@ void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
 	six_unlock_intent(&b->c.lock);
 }
 
-ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf)
+void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
 {
-	struct printbuf out = _PBUF(buf, PAGE_SIZE);
 	struct btree_update *as;
 
 	mutex_lock(&c->btree_interior_update_lock);
 	list_for_each_entry(as, &c->btree_interior_update_list, list)
-		pr_buf(&out, "%p m %u w %u r %u j %llu\n",
+		pr_buf(out, "%p m %u w %u r %u j %llu\n",
 		       as,
 		       as->mode,
 		       as->nodes_written,
 		       atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK,
 		       as->journal.seq);
 	mutex_unlock(&c->btree_interior_update_lock);
-
-	return out.pos - buf;
 }
 
 size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
diff --git a/libbcachefs/btree_update_interior.h b/libbcachefs/btree_update_interior.h
index 4a5b9dc..7668225 100644
--- a/libbcachefs/btree_update_interior.h
+++ b/libbcachefs/btree_update_interior.h
@@ -311,13 +311,13 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b,
 static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
 					       struct btree *b, unsigned u64s)
 {
-	if (unlikely(btree_node_fake(b)))
+	if (unlikely(btree_node_need_rewrite(b)))
 		return false;
 
 	return u64s <= bch_btree_keys_u64s_remaining(c, b);
 }
 
-ssize_t bch2_btree_updates_print(struct bch_fs *, char *);
+void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);
 
 size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *);
diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c
index 57c0311..cd699c2 100644
--- a/libbcachefs/btree_update_leaf.c
+++ b/libbcachefs/btree_update_leaf.c
@@ -264,23 +264,12 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans,
 static enum btree_insert_ret
 btree_key_can_insert(struct btree_trans *trans,
 		     struct btree_iter *iter,
-		     struct bkey_i *insert,
 		     unsigned u64s)
 {
 	struct bch_fs *c = trans->c;
 	struct btree *b = iter_l(iter)->b;
 
-	if (unlikely(btree_node_fake(b)))
-		return BTREE_INSERT_BTREE_NODE_FULL;
-
-	/*
-	 * old bch2_extent_sort_fix_overlapping() algorithm won't work with new
-	 * style extent updates:
-	 */
-	if (unlikely(btree_node_old_extent_overwrite(b)))
-		return BTREE_INSERT_BTREE_NODE_FULL;
-
-	if (unlikely(u64s > bch_btree_keys_u64s_remaining(c, b)))
+	if (!bch2_btree_node_insert_fits(c, b, u64s))
 		return BTREE_INSERT_BTREE_NODE_FULL;
 
 	return BTREE_INSERT_OK;
@@ -289,7 +278,6 @@ btree_key_can_insert(struct btree_trans *trans,
 static enum btree_insert_ret
 btree_key_can_insert_cached(struct btree_trans *trans,
 			    struct btree_iter *iter,
-			    struct bkey_i *insert,
 			    unsigned u64s)
 {
 	struct bkey_cached *ck = (void *) iter->l[0].b;
@@ -407,8 +395,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans,
 		u64s += i->k->k.u64s;
 		ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED
-			? btree_key_can_insert(trans, i->iter, i->k, u64s)
-			: btree_key_can_insert_cached(trans, i->iter, i->k, u64s);
+			? btree_key_can_insert(trans, i->iter, u64s)
+			: btree_key_can_insert_cached(trans, i->iter, u64s);
 		if (ret) {
 			*stopped_at = i;
 			return ret;
diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c
index 0ec194b..97a8af3 100644
--- a/libbcachefs/buckets.c
+++ b/libbcachefs/buckets.c
@@ -133,13 +133,13 @@ void bch2_fs_usage_initialize(struct bch_fs *c)
 			cpu_replicas_entry(&c->replicas, i);
 
 		switch (e->data_type) {
-		case BCH_DATA_BTREE:
+		case BCH_DATA_btree:
 			usage->btree	+= usage->replicas[i];
 			break;
-		case BCH_DATA_USER:
+		case BCH_DATA_user:
 			usage->data	+= usage->replicas[i];
 			break;
-		case BCH_DATA_CACHED:
+		case BCH_DATA_cached:
 			usage->cached	+= usage->replicas[i];
 			break;
 		}
@@ -179,7 +179,7 @@ out_pool:
 	return ret;
 }
 
-struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
+struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
 {
 	struct bch_dev_usage ret;
 
@@ -367,7 +367,7 @@ static inline int is_fragmented_bucket(struct bucket_mark m,
 				       struct bch_dev *ca)
 {
 	if (!m.owned_by_allocator &&
-	    m.data_type == BCH_DATA_USER &&
+	    m.data_type == BCH_DATA_user &&
 	    bucket_sectors_used(m))
 		return max_t(int, 0, (int) ca->mi.bucket_size -
			     bucket_sectors_used(m));
@@ -382,7 +382,7 @@ static inline int bucket_stripe_sectors(struct bucket_mark m)
 static inline enum bch_data_type bucket_type(struct bucket_mark m)
 {
 	return m.cached_sectors && !m.dirty_sectors
-		? BCH_DATA_CACHED
+		? BCH_DATA_cached
 		: m.data_type;
 }
 
@@ -435,7 +435,7 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage,
 				  enum bch_data_type type,
 				  int nr, s64 size)
 {
-	if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL)
+	if (type == BCH_DATA_sb || type == BCH_DATA_journal)
 		fs_usage->hidden	+= size;
 
 	dev_usage->buckets[type]	+= nr;
@@ -472,7 +472,7 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
 	u->sectors[old.data_type] -= old.dirty_sectors;
 	u->sectors[new.data_type] += new.dirty_sectors;
-	u->sectors[BCH_DATA_CACHED] +=
+	u->sectors[BCH_DATA_cached] +=
 		(int) new.cached_sectors - (int) old.cached_sectors;
 	u->sectors_fragmented +=
 		is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca);
@@ -520,13 +520,13 @@ static inline int update_replicas(struct bch_fs *c,
 		return 0;
 
 	switch (r->data_type) {
-	case BCH_DATA_BTREE:
+	case BCH_DATA_btree:
 		fs_usage->btree		+= sectors;
 		break;
-	case BCH_DATA_USER:
+	case BCH_DATA_user:
 		fs_usage->data		+= sectors;
 		break;
-	case BCH_DATA_CACHED:
+	case BCH_DATA_cached:
 		fs_usage->cached	+= sectors;
 		break;
 	}
@@ -713,7 +713,8 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
 	preempt_enable();
 }
 
-static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
+static int bch2_mark_alloc(struct bch_fs *c,
+			   struct bkey_s_c old, struct bkey_s_c new,
 			   struct bch_fs_usage *fs_usage,
 			   u64 journal_seq, unsigned flags)
 {
@@ -721,7 +722,11 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
 	struct bkey_alloc_unpacked u;
 	struct bch_dev *ca;
 	struct bucket *g;
-	struct bucket_mark old, m;
+	struct bucket_mark old_m, m;
+
+	/* We don't do anything for deletions - do we?: */
+	if (new.k->type != KEY_TYPE_alloc)
+		return 0;
 
 	/*
 	 * alloc btree is read in by bch2_alloc_read, not gc:
@@ -730,15 +735,15 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
 	    !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE))
 		return 0;
 
-	ca = bch_dev_bkey_exists(c, k.k->p.inode);
+	ca = bch_dev_bkey_exists(c, new.k->p.inode);
 
-	if (k.k->p.offset >= ca->mi.nbuckets)
+	if (new.k->p.offset >= ca->mi.nbuckets)
 		return 0;
 
-	g = __bucket(ca, k.k->p.offset, gc);
-	u = bch2_alloc_unpack(k);
+	g = __bucket(ca, new.k->p.offset, gc);
+	u = bch2_alloc_unpack(new);
 
-	old = bucket_cmpxchg(g, m, ({
+	old_m = bucket_cmpxchg(g, m, ({
 		m.gen			= u.gen;
 		m.data_type		= u.data_type;
 		m.dirty_sectors		= u.dirty_sectors;
@@ -751,7 +756,7 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
 	}));
 
 	if (!(flags & BTREE_TRIGGER_ALLOC_READ))
-		bch2_dev_usage_update(c, ca, fs_usage, old, m, gc);
+		bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc);
 
 	g->io_time[READ]	= u.read_time;
 	g->io_time[WRITE]	= u.write_time;
@@ -764,11 +769,11 @@ static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
 	 */
 
 	if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
-	    old.cached_sectors) {
+	    old_m.cached_sectors) {
 		update_cached_sectors(c, fs_usage, ca->dev_idx,
-				      -old.cached_sectors);
-		trace_invalidate(ca, bucket_to_sector(ca, k.k->p.offset),
-				 old.cached_sectors);
+				      -old_m.cached_sectors);
+		trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset),
+				 old_m.cached_sectors);
 	}
 
 	return 0;
@@ -792,8 +797,8 @@ static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
 	struct bucket_mark old, new;
 	bool overflow;
 
-	BUG_ON(data_type != BCH_DATA_SB &&
-	       data_type != BCH_DATA_JOURNAL);
+	BUG_ON(data_type != BCH_DATA_sb &&
+	       data_type != BCH_DATA_journal);
 
 	old = bucket_cmpxchg(g, new, ({
 		new.data_type	= data_type;
@@ -824,8 +829,8 @@ void bch2_
bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, unsigned sectors, struct gc_pos pos, unsigned flags) { - BUG_ON(type != BCH_DATA_SB && - type != BCH_DATA_JOURNAL); + BUG_ON(type != BCH_DATA_sb && + type != BCH_DATA_journal); preempt_disable(); @@ -878,51 +883,46 @@ static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, } static void bucket_set_stripe(struct bch_fs *c, - const struct bch_stripe *v, + const struct bch_extent_ptr *ptr, struct bch_fs_usage *fs_usage, u64 journal_seq, - unsigned flags) + unsigned flags, + bool enabled) { - bool enabled = !(flags & BTREE_TRIGGER_OVERWRITE); bool gc = flags & BTREE_TRIGGER_GC; - unsigned i; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_BUCKET(ca, ptr, gc); + struct bucket_mark new, old; - for (i = 0; i < v->nr_blocks; i++) { - const struct bch_extent_ptr *ptr = v->ptrs + i; - struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - struct bucket *g = PTR_BUCKET(ca, ptr, gc); - struct bucket_mark new, old; - - old = bucket_cmpxchg(g, new, ({ - new.stripe = enabled; - if (journal_seq) { - new.journal_seq_valid = 1; - new.journal_seq = journal_seq; - } - })); + old = bucket_cmpxchg(g, new, ({ + new.stripe = enabled; + if (journal_seq) { + new.journal_seq_valid = 1; + new.journal_seq = journal_seq; + } + })); - bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); + bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); - /* - * XXX write repair code for these, flag stripe as possibly bad - */ - if (old.gen != ptr->gen) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "stripe with stale pointer"); + /* + * XXX write repair code for these, flag stripe as possibly bad + */ + if (old.gen != ptr->gen) + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "stripe with stale pointer"); #if 0 - /* - * We'd like to check for these, but these checks don't work - * yet: - */ - if (old.stripe && enabled) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "multiple stripes using same bucket"); - - if (!old.stripe && !enabled) - bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, - "deleting stripe but bucket not marked as stripe bucket"); + /* + * We'd like to check for these, but these checks don't work + * yet: + */ + if (old.stripe && enabled) + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "multiple stripes using same bucket"); + + if (!old.stripe && !enabled) + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "deleting stripe but bucket not marked as stripe bucket"); #endif - } } static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, @@ -1064,8 +1064,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, { bool gc = flags & BTREE_TRIGGER_GC; struct stripe *m; - unsigned old, new; - int blocks_nonempty_delta; + unsigned i, blocks_nonempty = 0; m = genradix_ptr(&c->stripes[gc], p.idx); @@ -1084,31 +1083,30 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, *nr_parity = m->nr_redundant; *r = m->r; - old = m->block_sectors[p.block]; m->block_sectors[p.block] += sectors; - new = m->block_sectors[p.block]; - blocks_nonempty_delta = (int) !!new - (int) !!old; - if (blocks_nonempty_delta) { - m->blocks_nonempty += blocks_nonempty_delta; + for (i = 0; i < m->nr_blocks; i++) + blocks_nonempty += m->block_sectors[i] != 0; + if (m->blocks_nonempty != blocks_nonempty) { + m->blocks_nonempty = blocks_nonempty; if (!gc) bch2_stripes_heap_update(c, m, p.idx); } - m->dirty = true; - spin_unlock(&c->ec_stripes_heap_lock); return 0; } -static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, 
+static int bch2_mark_extent(struct bch_fs *c, + struct bkey_s_c old, struct bkey_s_c new, unsigned offset, s64 sectors, enum bch_data_type data_type, struct bch_fs_usage *fs_usage, unsigned journal_seq, unsigned flags) { + struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; @@ -1124,7 +1122,7 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, BUG_ON(!sectors); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors = data_type == BCH_DATA_BTREE + s64 disk_sectors = data_type == BCH_DATA_btree ? sectors : ptr_disk_sectors_delta(p, offset, sectors, flags); @@ -1177,72 +1175,98 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, return 0; } -static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, +static int bch2_mark_stripe(struct bch_fs *c, + struct bkey_s_c old, struct bkey_s_c new, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { bool gc = flags & BTREE_TRIGGER_GC; - struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); - size_t idx = s.k->p.offset; + size_t idx = new.k->p.offset; + const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(old).v : NULL; + const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(new).v : NULL; struct stripe *m = genradix_ptr(&c->stripes[gc], idx); unsigned i; - spin_lock(&c->ec_stripes_heap_lock); - - if (!m || ((flags & BTREE_TRIGGER_OVERWRITE) && !m->alive)) { - spin_unlock(&c->ec_stripes_heap_lock); + if (!m || (old_s && !m->alive)) { bch_err_ratelimited(c, "error marking nonexistent stripe %zu", idx); return -1; } - if (!(flags & BTREE_TRIGGER_OVERWRITE)) { - m->sectors = le16_to_cpu(s.v->sectors); - m->algorithm = s.v->algorithm; - m->nr_blocks = s.v->nr_blocks; - m->nr_redundant = s.v->nr_redundant; + if (!new_s) { + /* Deleting: */ + for (i = 0; i < old_s->nr_blocks; i++) + bucket_set_stripe(c, old_s->ptrs + i, fs_usage, + journal_seq, flags, false); - bch2_bkey_to_replicas(&m->r.e, k); + if (!gc && m->on_heap) { + spin_lock(&c->ec_stripes_heap_lock); + bch2_stripes_heap_del(c, m, idx); + spin_unlock(&c->ec_stripes_heap_lock); + } - /* - * XXX: account for stripes somehow here - */ -#if 0 - update_replicas(c, fs_usage, &m->r.e, stripe_sectors); -#endif + memset(m, 0, sizeof(*m)); + } else { + BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks); + BUG_ON(old_s && new_s->nr_redundant != old_s->nr_redundant); + + for (i = 0; i < new_s->nr_blocks; i++) { + if (!old_s || + memcmp(new_s->ptrs + i, + old_s->ptrs + i, + sizeof(struct bch_extent_ptr))) { + + if (old_s) + bucket_set_stripe(c, old_s->ptrs + i, fs_usage, + journal_seq, flags, false); + bucket_set_stripe(c, new_s->ptrs + i, fs_usage, + journal_seq, flags, true); + } + } + + m->alive = true; + m->sectors = le16_to_cpu(new_s->sectors); + m->algorithm = new_s->algorithm; + m->nr_blocks = new_s->nr_blocks; + m->nr_redundant = new_s->nr_redundant; + + bch2_bkey_to_replicas(&m->r.e, new); /* gc recalculates these fields: */ if (!(flags & BTREE_TRIGGER_GC)) { - for (i = 0; i < s.v->nr_blocks; i++) { + m->blocks_nonempty = 0; + + for (i = 0; i < new_s->nr_blocks; i++) { m->block_sectors[i] = - stripe_blockcount_get(s.v, i); + stripe_blockcount_get(new_s, i); m->blocks_nonempty += !!m->block_sectors[i]; } } - if (!gc) + if (!gc) { + spin_lock(&c->ec_stripes_heap_lock); bch2_stripes_heap_update(c, m, idx); - m->alive = true; - } else { - if (!gc) - 
bch2_stripes_heap_del(c, m, idx); - memset(m, 0, sizeof(*m)); + spin_unlock(&c->ec_stripes_heap_lock); + } } - spin_unlock(&c->ec_stripes_heap_lock); - - bucket_set_stripe(c, s.v, fs_usage, 0, flags); return 0; } static int bch2_mark_key_locked(struct bch_fs *c, - struct bkey_s_c k, + struct bkey_s_c old, + struct bkey_s_c new, unsigned offset, s64 sectors, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { + struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; int ret = 0; + BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE))); + preempt_disable(); if (!fs_usage || (flags & BTREE_TRIGGER_GC)) @@ -1251,7 +1275,7 @@ static int bch2_mark_key_locked(struct bch_fs *c, switch (k.k->type) { case KEY_TYPE_alloc: - ret = bch2_mark_alloc(c, k, fs_usage, journal_seq, flags); + ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags); break; case KEY_TYPE_btree_ptr: case KEY_TYPE_btree_ptr_v2: @@ -1259,16 +1283,16 @@ static int bch2_mark_key_locked(struct bch_fs *c, ? c->opts.btree_node_size : -c->opts.btree_node_size; - ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_BTREE, - fs_usage, journal_seq, flags); + ret = bch2_mark_extent(c, old, new, offset, sectors, + BCH_DATA_btree, fs_usage, journal_seq, flags); break; case KEY_TYPE_extent: case KEY_TYPE_reflink_v: - ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_USER, - fs_usage, journal_seq, flags); + ret = bch2_mark_extent(c, old, new, offset, sectors, + BCH_DATA_user, fs_usage, journal_seq, flags); break; case KEY_TYPE_stripe: - ret = bch2_mark_stripe(c, k, fs_usage, journal_seq, flags); + ret = bch2_mark_stripe(c, old, new, fs_usage, journal_seq, flags); break; case KEY_TYPE_inode: if (!(flags & BTREE_TRIGGER_OVERWRITE)) @@ -1294,82 +1318,38 @@ static int bch2_mark_key_locked(struct bch_fs *c, return ret; } -int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, +int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, unsigned offset, s64 sectors, struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { + struct bkey deleted; + struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; int ret; + bkey_init(&deleted); + percpu_down_read(&c->mark_lock); - ret = bch2_mark_key_locked(c, k, offset, sectors, - fs_usage, journal_seq, flags); + ret = bch2_mark_key_locked(c, old, new, offset, sectors, + fs_usage, journal_seq, + BTREE_TRIGGER_INSERT|flags); percpu_up_read(&c->mark_lock); return ret; } -inline int bch2_mark_overwrite(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c old, - struct bkey_i *new, - struct bch_fs_usage *fs_usage, - unsigned flags, - bool is_extents) -{ - struct bch_fs *c = trans->c; - unsigned offset = 0; - s64 sectors = -((s64) old.k->size); - - flags |= BTREE_TRIGGER_OVERWRITE; - - if (is_extents - ? 
bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0 - : bkey_cmp(new->k.p, old.k->p)) - return 0; - - if (is_extents) { - switch (bch2_extent_overlap(&new->k, old.k)) { - case BCH_EXTENT_OVERLAP_ALL: - offset = 0; - sectors = -((s64) old.k->size); - break; - case BCH_EXTENT_OVERLAP_BACK: - offset = bkey_start_offset(&new->k) - - bkey_start_offset(old.k); - sectors = bkey_start_offset(&new->k) - - old.k->p.offset; - break; - case BCH_EXTENT_OVERLAP_FRONT: - offset = 0; - sectors = bkey_start_offset(old.k) - - new->k.p.offset; - break; - case BCH_EXTENT_OVERLAP_MIDDLE: - offset = bkey_start_offset(&new->k) - - bkey_start_offset(old.k); - sectors = -((s64) new->k.size); - flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; - break; - } - - BUG_ON(sectors >= 0); - } - - return bch2_mark_key_locked(c, old, offset, sectors, fs_usage, - trans->journal_res.seq, flags) ?: 1; -} - int bch2_mark_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *insert, + struct bkey_i *new, struct bch_fs_usage *fs_usage, unsigned flags) { struct bch_fs *c = trans->c; struct btree *b = iter_l(iter)->b; struct btree_node_iter node_iter = iter_l(iter)->iter; - struct bkey_packed *_k; + struct bkey_packed *_old; + struct bkey_s_c old; + struct bkey unpacked; int ret = 0; if (unlikely(flags & BTREE_TRIGGER_NORUN)) @@ -1378,34 +1358,87 @@ int bch2_mark_update(struct btree_trans *trans, if (!btree_node_type_needs_gc(iter->btree_id)) return 0; - bch2_mark_key_locked(c, bkey_i_to_s_c(insert), - 0, insert->k.size, - fs_usage, trans->journal_res.seq, - BTREE_TRIGGER_INSERT|flags); + bkey_init(&unpacked); + old = (struct bkey_s_c) { &unpacked, NULL }; - if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES)) - return 0; + if (!btree_node_type_is_extents(iter->btree_id)) { + if (btree_iter_type(iter) != BTREE_ITER_CACHED) { + _old = bch2_btree_node_iter_peek(&node_iter, b); + if (_old) + old = bkey_disassemble(b, _old, &unpacked); + } else { + struct bkey_cached *ck = (void *) iter->l[0].b; - /* - * For non extents, we only mark the new key, not the key being - * overwritten - unless we're actually deleting: - */ - if ((iter->btree_id == BTREE_ID_ALLOC || - iter->btree_id == BTREE_ID_EC) && - !bkey_deleted(&insert->k)) - return 0; + if (ck->valid) + old = bkey_i_to_s_c(ck->k); + } - while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { - struct bkey unpacked; - struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); + if (old.k->type == new->k.type) { + bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, + fs_usage, trans->journal_res.seq, + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); - ret = bch2_mark_overwrite(trans, iter, k, insert, - fs_usage, flags, - btree_node_type_is_extents(iter->btree_id)); - if (ret <= 0) - break; + } else { + bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, + fs_usage, trans->journal_res.seq, + BTREE_TRIGGER_INSERT|flags); + bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, + fs_usage, trans->journal_res.seq, + BTREE_TRIGGER_OVERWRITE|flags); + } + } else { + BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); + bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), + 0, new->k.size, + fs_usage, trans->journal_res.seq, + BTREE_TRIGGER_INSERT|flags); - bch2_btree_node_iter_advance(&node_iter, b); + while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) { + unsigned offset = 0; + s64 sectors; + + old = bkey_disassemble(b, _old, &unpacked); + sectors = -((s64) old.k->size); + + flags |= BTREE_TRIGGER_OVERWRITE; + + if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) + return 0; 
+ + switch (bch2_extent_overlap(&new->k, old.k)) { + case BCH_EXTENT_OVERLAP_ALL: + offset = 0; + sectors = -((s64) old.k->size); + break; + case BCH_EXTENT_OVERLAP_BACK: + offset = bkey_start_offset(&new->k) - + bkey_start_offset(old.k); + sectors = bkey_start_offset(&new->k) - + old.k->p.offset; + break; + case BCH_EXTENT_OVERLAP_FRONT: + offset = 0; + sectors = bkey_start_offset(old.k) - + new->k.p.offset; + break; + case BCH_EXTENT_OVERLAP_MIDDLE: + offset = bkey_start_offset(&new->k) - + bkey_start_offset(old.k); + sectors = -((s64) new->k.size); + flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; + break; + } + + BUG_ON(sectors >= 0); + + ret = bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), + offset, sectors, fs_usage, + trans->journal_res.seq, flags) ?: 1; + if (ret <= 0) + break; + + bch2_btree_node_iter_advance(&node_iter, b); + } } return ret; @@ -1460,8 +1493,10 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, } else { struct bkey_cached *ck = (void *) i->iter->l[0].b; - bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); - pr_err("%s", buf); + if (ck->valid) { + bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); + pr_err("%s", buf); + } } } } @@ -1632,7 +1667,7 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, BUG_ON(!sectors); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors = data_type == BCH_DATA_BTREE + s64 disk_sectors = data_type == BCH_DATA_btree ? sectors : ptr_disk_sectors_delta(p, offset, sectors, flags); @@ -1774,11 +1809,11 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, : -c->opts.btree_node_size; return bch2_trans_mark_extent(trans, k, offset, sectors, - flags, BCH_DATA_BTREE); + flags, BCH_DATA_btree); case KEY_TYPE_extent: case KEY_TYPE_reflink_v: return bch2_trans_mark_extent(trans, k, offset, sectors, - flags, BCH_DATA_USER); + flags, BCH_DATA_user); case KEY_TYPE_inode: d = replicas_deltas_realloc(trans, 0); @@ -1829,9 +1864,6 @@ int bch2_trans_mark_update(struct btree_trans *trans, if (ret) return ret; - if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES)) - return 0; - if (btree_iter_type(iter) == BTREE_ITER_CACHED) { struct bkey_cached *ck = (void *) iter->l[0].b; @@ -1992,7 +2024,6 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) alloc_fifo free[RESERVE_NR]; alloc_fifo free_inc; alloc_heap alloc_heap; - copygc_heap copygc_heap; size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, ca->mi.bucket_size / c->opts.btree_node_size); @@ -2001,15 +2032,13 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7); size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), btree_reserve * 2); - bool resize = ca->buckets[0] != NULL, - start_copygc = ca->copygc_thread != NULL; + bool resize = ca->buckets[0] != NULL; int ret = -ENOMEM; unsigned i; memset(&free, 0, sizeof(free)); memset(&free_inc, 0, sizeof(free_inc)); memset(&alloc_heap, 0, sizeof(alloc_heap)); - memset(&copygc_heap, 0, sizeof(copygc_heap)); if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + nbuckets * sizeof(struct bucket), @@ -2022,14 +2051,13 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) copygc_reserve, GFP_KERNEL) || !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) || - !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL) || - !init_heap(&copygc_heap, copygc_reserve, GFP_KERNEL) + !init_heap(&alloc_heap, 
ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL)) goto err; buckets->first_bucket = ca->mi.first_bucket; buckets->nbuckets = nbuckets; - bch2_copygc_stop(ca); + bch2_copygc_stop(c); if (resize) { down_write(&c->gc_lock); @@ -2072,21 +2100,13 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) /* with gc lock held, alloc_heap can't be in use: */ swap(ca->alloc_heap, alloc_heap); - /* and we shut down copygc: */ - swap(ca->copygc_heap, copygc_heap); - nbuckets = ca->mi.nbuckets; if (resize) up_write(&ca->bucket_lock); - if (start_copygc && - bch2_copygc_start(c, ca)) - bch_err(ca, "error restarting copygc thread"); - ret = 0; err: - free_heap(&copygc_heap); free_heap(&alloc_heap); free_fifo(&free_inc); for (i = 0; i < RESERVE_NR; i++) @@ -2103,7 +2123,6 @@ void bch2_dev_buckets_free(struct bch_dev *ca) { unsigned i; - free_heap(&ca->copygc_heap); free_heap(&ca->alloc_heap); free_fifo(&ca->free_inc); for (i = 0; i < RESERVE_NR; i++) diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 97265fe..653f676 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -99,9 +99,9 @@ static inline enum bch_data_type ptr_data_type(const struct bkey *k, { if (k->type == KEY_TYPE_btree_ptr || k->type == KEY_TYPE_btree_ptr_v2) - return BCH_DATA_BTREE; + return BCH_DATA_btree; - return ptr->cached ? BCH_DATA_CACHED : BCH_DATA_USER; + return ptr->cached ? BCH_DATA_cached : BCH_DATA_user; } static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, @@ -182,7 +182,7 @@ static inline bool bucket_needs_journal_commit(struct bucket_mark m, /* Device usage: */ -struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *); +struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); void bch2_dev_usage_from_buckets(struct bch_fs *); @@ -202,9 +202,9 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca, /* * Number of reclaimable buckets - only for use by the allocator thread: */ -static inline u64 dev_buckets_available(struct bch_fs *c, struct bch_dev *ca) +static inline u64 dev_buckets_available(struct bch_dev *ca) { - return __dev_buckets_available(ca, bch2_dev_usage_read(c, ca)); + return __dev_buckets_available(ca, bch2_dev_usage_read(ca)); } static inline u64 __dev_buckets_free(struct bch_dev *ca, @@ -215,9 +215,9 @@ fifo_used(&ca->free_inc); } -static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca) +static inline u64 dev_buckets_free(struct bch_dev *ca) { - return __dev_buckets_free(ca, bch2_dev_usage_read(c, ca)); + return __dev_buckets_free(ca, bch2_dev_usage_read(ca)); } /* Filesystem usage: */ @@ -259,14 +259,11 @@ void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, size_t, enum bch_data_type, unsigned, struct gc_pos, unsigned); -int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64, - struct bch_fs_usage *, u64, unsigned); +int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, + s64, struct bch_fs_usage *, u64, unsigned); int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, struct disk_reservation *, unsigned); -int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, - struct bkey_s_c, struct bkey_i *, - struct bch_fs_usage *, unsigned, bool); int bch2_mark_update(struct btree_trans *, struct btree_iter *, struct bkey_i *, struct bch_fs_usage *, unsigned); diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 53f2272..d5215b1 100644 --- a/libbcachefs/buckets_types.h +++ 
b/libbcachefs/buckets_types.h @@ -123,7 +123,9 @@ struct disk_reservation { }; struct copygc_heap_entry { + u8 dev; u8 gen; + u16 fragmentation; u32 sectors; u64 offset; }; diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index 3af5219..0377f90 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -468,7 +468,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, if (IS_ERR(ca)) return PTR_ERR(ca); - src = bch2_dev_usage_read(c, ca); + src = bch2_dev_usage_read(ca); arg.state = ca->mi.state; arg.bucket_size = ca->mi.bucket_size; diff --git a/libbcachefs/clock.c b/libbcachefs/clock.c index a9f5d56..1d1590d 100644 --- a/libbcachefs/clock.c +++ b/libbcachefs/clock.c @@ -152,9 +152,8 @@ void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) timer->fn(timer); } -ssize_t bch2_io_timers_show(struct io_clock *clock, char *buf) +void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); unsigned long now; unsigned i; @@ -162,12 +161,10 @@ ssize_t bch2_io_timers_show(struct io_clock *clock, char *buf) now = atomic_long_read(&clock->now); for (i = 0; i < clock->timers.used; i++) - pr_buf(&out, "%ps:\t%li\n", + pr_buf(out, "%ps:\t%li\n", clock->timers.data[i]->fn, clock->timers.data[i]->expire - now); spin_unlock(&clock->timer_lock); - - return out.pos - buf; } void bch2_io_clock_exit(struct io_clock *clock) diff --git a/libbcachefs/clock.h b/libbcachefs/clock.h index da50afe..70a0f74 100644 --- a/libbcachefs/clock.h +++ b/libbcachefs/clock.h @@ -30,7 +30,7 @@ void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); __ret; \ }) -ssize_t bch2_io_timers_show(struct io_clock *, char *); +void bch2_io_timers_to_text(struct printbuf *, struct io_clock *); void bch2_io_clock_exit(struct io_clock *); int bch2_io_clock_init(struct io_clock *); diff --git a/libbcachefs/compress.c b/libbcachefs/compress.c index 3d75527..b50d2b0 100644 --- a/libbcachefs/compress.c +++ b/libbcachefs/compress.c @@ -7,7 +7,6 @@ #include "super-io.h" #include <linux/lz4.h> -#include <linux/sched/mm.h> #include <linux/zlib.h> #include <linux/zstd.h> @@ -64,7 +63,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, struct bbuf ret; struct bio_vec bv; struct bvec_iter iter; - unsigned nr_pages = 0, flags; + unsigned nr_pages = 0; struct page *stack_pages[16]; struct page **pages = NULL; void *data; @@ -104,10 +103,7 @@ static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, __bio_for_each_segment(bv, bio, iter, start) pages[nr_pages++] = bv.bv_page; - flags = memalloc_nofs_save(); data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); - memalloc_nofs_restore(flags); - if (pages != stack_pages) kfree(pages); diff --git a/libbcachefs/disk_groups.c b/libbcachefs/disk_groups.c index 4a4ec8f..c52b6fa 100644 --- a/libbcachefs/disk_groups.c +++ b/libbcachefs/disk_groups.c @@ -183,7 +183,7 @@ const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned targe case TARGET_GROUP: { struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); - return t.group < g->nr && !g->entries[t.group].deleted + return g && t.group < g->nr && !g->entries[t.group].deleted ? &g->entries[t.group].devs : NULL; } @@ -208,7 +208,7 @@ bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) rcu_read_lock(); g = rcu_dereference(c->disk_groups); - m = t.group < g->nr && !g->entries[t.group].deleted + m = g && t.group < g->nr && !g->entries[t.group].deleted ? 
&g->entries[t.group].devs : NULL; @@ -387,6 +387,7 @@ int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) { struct bch_member *mi; int v = -1; + int ret = 0; mutex_lock(&c->sb_lock); @@ -399,14 +400,18 @@ int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) return v; } + ret = bch2_sb_disk_groups_to_cpu(c); + if (ret) + goto unlock; write_sb: mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; SET_BCH_MEMBER_GROUP(mi, v + 1); bch2_write_super(c); +unlock: mutex_unlock(&c->sb_lock); - return 0; + return ret; } int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) diff --git a/libbcachefs/disk_groups.h b/libbcachefs/disk_groups.h index c8e0c37..3d84f23 100644 --- a/libbcachefs/disk_groups.h +++ b/libbcachefs/disk_groups.h @@ -71,7 +71,10 @@ static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); int bch2_disk_path_find(struct bch_sb_handle *, const char *); + +/* Exported for userspace bcachefs-tools: */ int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); + void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *, unsigned); diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index 9442d6e..5514f65 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -200,40 +200,6 @@ static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) return false; } -static void ec_stripe_key_init(struct bch_fs *c, - struct bkey_i_stripe *s, - struct open_buckets *blocks, - struct open_buckets *parity, - unsigned stripe_size) -{ - struct open_bucket *ob; - unsigned i, u64s; - - bkey_stripe_init(&s->k_i); - s->v.sectors = cpu_to_le16(stripe_size); - s->v.algorithm = 0; - s->v.nr_blocks = parity->nr + blocks->nr; - s->v.nr_redundant = parity->nr; - s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); - s->v.csum_type = BCH_CSUM_CRC32C; - s->v.pad = 0; - - open_bucket_for_each(c, blocks, ob, i) - s->v.ptrs[i] = ob->ptr; - - open_bucket_for_each(c, parity, ob, i) - s->v.ptrs[blocks->nr + i] = ob->ptr; - - while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { - BUG_ON(1 << s->v.csum_granularity_bits >= - le16_to_cpu(s->v.sectors) || - s->v.csum_granularity_bits == U8_MAX); - s->v.csum_granularity_bits++; - } - - set_bkey_val_u64s(&s->k, u64s); -} - /* Checksumming: */ static void ec_generate_checksums(struct ec_stripe_buf *buf) @@ -360,7 +326,9 @@ static void ec_block_endio(struct bio *bio) struct bch_dev *ca = ec_bio->ca; struct closure *cl = bio->bi_private; - if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding")) + if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s: %s", + bio_data_dir(bio) ? 
"write" : "read", + bch2_blk_status_to_str(bio->bi_status))) clear_bit(ec_bio->idx, ec_bio->buf->valid); bio_put(&ec_bio->bio); @@ -605,39 +573,16 @@ static void heap_verify_backpointer(struct bch_fs *c, size_t idx) BUG_ON(h->data[m->heap_idx].idx != idx); } -void bch2_stripes_heap_update(struct bch_fs *c, - struct stripe *m, size_t idx) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - size_t i; - - if (m->alive) { - heap_verify_backpointer(c, idx); - - h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; - - i = m->heap_idx; - heap_sift_up(h, i, ec_stripes_heap_cmp, - ec_stripes_heap_set_backpointer); - heap_sift_down(h, i, ec_stripes_heap_cmp, - ec_stripes_heap_set_backpointer); - - heap_verify_backpointer(c, idx); - } else { - bch2_stripes_heap_insert(c, m, idx); - } - - if (stripe_idx_to_delete(c) >= 0 && - !percpu_ref_is_dying(&c->writes)) - schedule_work(&c->ec_stripe_delete_work); -} - void bch2_stripes_heap_del(struct bch_fs *c, struct stripe *m, size_t idx) { + if (!m->on_heap) + return; + + m->on_heap = false; + heap_verify_backpointer(c, idx); - m->alive = false; heap_del(&c->ec_stripes_heap, m->heap_idx, ec_stripes_heap_cmp, ec_stripes_heap_set_backpointer); @@ -646,23 +591,54 @@ void bch2_stripes_heap_del(struct bch_fs *c, void bch2_stripes_heap_insert(struct bch_fs *c, struct stripe *m, size_t idx) { + if (m->on_heap) + return; + BUG_ON(heap_full(&c->ec_stripes_heap)); + m->on_heap = true; + heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { .idx = idx, .blocks_nonempty = m->blocks_nonempty, }), ec_stripes_heap_cmp, ec_stripes_heap_set_backpointer); - m->alive = true; heap_verify_backpointer(c, idx); } +void bch2_stripes_heap_update(struct bch_fs *c, + struct stripe *m, size_t idx) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + size_t i; + + if (!m->on_heap) + return; + + heap_verify_backpointer(c, idx); + + h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; + + i = m->heap_idx; + heap_sift_up(h, i, ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + heap_sift_down(h, i, ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + + heap_verify_backpointer(c, idx); + + if (stripe_idx_to_delete(c) >= 0 && + !percpu_ref_is_dying(&c->writes)) + schedule_work(&c->ec_stripe_delete_work); +} + /* stripe deletion */ static int ec_stripe_delete(struct bch_fs *c, size_t idx) { + //pr_info("deleting stripe %zu", idx); return bch2_btree_delete_range(c, BTREE_ID_EC, POS(0, idx), POS(0, idx + 1), @@ -675,23 +651,20 @@ static void ec_stripe_delete_work(struct work_struct *work) container_of(work, struct bch_fs, ec_stripe_delete_work); ssize_t idx; - down_read(&c->gc_lock); - mutex_lock(&c->ec_stripe_create_lock); - while (1) { spin_lock(&c->ec_stripes_heap_lock); idx = stripe_idx_to_delete(c); - spin_unlock(&c->ec_stripes_heap_lock); - - if (idx < 0) + if (idx < 0) { + spin_unlock(&c->ec_stripes_heap_lock); break; + } + + bch2_stripes_heap_del(c, genradix_ptr(&c->stripes[0], idx), idx); + spin_unlock(&c->ec_stripes_heap_lock); if (ec_stripe_delete(c, idx)) break; } - - mutex_unlock(&c->ec_stripe_create_lock); - up_read(&c->gc_lock); } /* stripe creation: */ @@ -784,6 +757,8 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + /* XXX this doesn't support the reflink btree */ + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, bkey_start_pos(pos), BTREE_ITER_INTENT); @@ -809,12 +784,9 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, bkey_on_stack_reassemble(&sk, c, k); e = 
bkey_i_to_s_extent(sk.k); - extent_for_each_ptr(e, ptr) { - if (ptr->dev == dev) - ec_ptr = ptr; - else - ptr->cached = true; - } + bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev); + ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev); + BUG_ON(!ec_ptr); extent_stripe_ptr_add(e, s, ec_ptr, idx); @@ -844,6 +816,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) struct bch_fs *c = s->c; struct open_bucket *ob; struct bkey_i *k; + struct stripe *m; struct bch_stripe *v = &s->stripe.key.v; unsigned i, nr_data = v->nr_blocks - v->nr_redundant; struct closure cl; @@ -854,10 +827,13 @@ static void ec_stripe_create(struct ec_stripe_new *s) closure_init_stack(&cl); if (s->err) { - bch_err(c, "error creating stripe: error writing data buckets"); + if (s->err != -EROFS) + bch_err(c, "error creating stripe: error writing data buckets"); goto err; } + BUG_ON(!s->allocated); + if (!percpu_ref_tryget(&c->writes)) goto err; @@ -880,22 +856,33 @@ static void ec_stripe_create(struct ec_stripe_new *s) goto err_put_writes; } - mutex_lock(&c->ec_stripe_create_lock); - - ret = ec_stripe_bkey_insert(c, &s->stripe.key); + ret = s->existing_stripe + ? bch2_btree_insert(c, BTREE_ID_EC, &s->stripe.key.k_i, + NULL, NULL, BTREE_INSERT_NOFAIL) + : ec_stripe_bkey_insert(c, &s->stripe.key); if (ret) { bch_err(c, "error creating stripe: error creating stripe key"); - goto err_unlock; + goto err_put_writes; } for_each_keylist_key(&s->keys, k) { ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k); - if (ret) + if (ret) { + bch_err(c, "error creating stripe: error updating pointers"); break; + } } -err_unlock: - mutex_unlock(&c->ec_stripe_create_lock); + spin_lock(&c->ec_stripes_heap_lock); + m = genradix_ptr(&c->stripes[0], s->stripe.key.k.p.offset); +#if 0 + pr_info("created a %s stripe %llu", + s->existing_stripe ? 
"existing" : "new", + s->stripe.key.k.p.offset); +#endif + BUG_ON(m->on_heap); + bch2_stripes_heap_insert(c, m, s->stripe.key.k.p.offset); + spin_unlock(&c->ec_stripes_heap_lock); err_put_writes: percpu_ref_put(&c->writes); err: @@ -908,30 +895,52 @@ err: bch2_keylist_free(&s->keys, s->inline_keys); - mutex_lock(&s->h->lock); - list_del(&s->list); - mutex_unlock(&s->h->lock); - for (i = 0; i < s->stripe.key.v.nr_blocks; i++) kvpfree(s->stripe.data[i], s->stripe.size << 9); kfree(s); } -static struct ec_stripe_new *ec_stripe_set_pending(struct ec_stripe_head *h) +static void ec_stripe_create_work(struct work_struct *work) { - struct ec_stripe_new *s = h->s; + struct bch_fs *c = container_of(work, + struct bch_fs, ec_stripe_create_work); + struct ec_stripe_new *s, *n; +restart: + mutex_lock(&c->ec_stripe_new_lock); + list_for_each_entry_safe(s, n, &c->ec_stripe_new_list, list) + if (!atomic_read(&s->pin)) { + list_del(&s->list); + mutex_unlock(&c->ec_stripe_new_lock); + ec_stripe_create(s); + goto restart; + } + mutex_unlock(&c->ec_stripe_new_lock); +} - list_add(&s->list, &h->stripes); - h->s = NULL; +static void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s) +{ + BUG_ON(atomic_read(&s->pin) <= 0); - return s; + if (atomic_dec_and_test(&s->pin)) { + BUG_ON(!s->pending); + queue_work(system_long_wq, &c->ec_stripe_create_work); + } } -static void ec_stripe_new_put(struct ec_stripe_new *s) +static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) { - BUG_ON(atomic_read(&s->pin) <= 0); - if (atomic_dec_and_test(&s->pin)) - ec_stripe_create(s); + struct ec_stripe_new *s = h->s; + + BUG_ON(!s->allocated && !s->err); + + h->s = NULL; + s->pending = true; + + mutex_lock(&c->ec_stripe_new_lock); + list_add(&s->list, &c->ec_stripe_new_list); + mutex_unlock(&c->ec_stripe_new_lock); + + ec_stripe_new_put(c, s); } /* have a full bucket - hand it off to be erasure coded: */ @@ -942,7 +951,7 @@ void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob) if (ob->sectors_free) s->err = -1; - ec_stripe_new_put(s); + ec_stripe_new_put(c, s); } void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) @@ -976,6 +985,8 @@ void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, if (!ob) return; + //pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset); + ec = ob->ec; mutex_lock(&ec->lock); @@ -1034,14 +1045,43 @@ static unsigned pick_blocksize(struct bch_fs *c, return best.size; } -int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h) +static bool may_create_new_stripe(struct bch_fs *c) +{ + return false; +} + +static void ec_stripe_key_init(struct bch_fs *c, + struct bkey_i_stripe *s, + unsigned nr_data, + unsigned nr_parity, + unsigned stripe_size) +{ + unsigned u64s; + + bkey_stripe_init(&s->k_i); + s->v.sectors = cpu_to_le16(stripe_size); + s->v.algorithm = 0; + s->v.nr_blocks = nr_data + nr_parity; + s->v.nr_redundant = nr_parity; + s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); + s->v.csum_type = BCH_CSUM_CRC32C; + s->v.pad = 0; + + while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { + BUG_ON(1 << s->v.csum_granularity_bits >= + le16_to_cpu(s->v.sectors) || + s->v.csum_granularity_bits == U8_MAX); + s->v.csum_granularity_bits++; + } + + set_bkey_val_u64s(&s->k, u64s); +} + +static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) { struct ec_stripe_new *s; unsigned i; - BUG_ON(h->parity.nr != h->redundancy); - BUG_ON(!h->blocks.nr); - BUG_ON(h->parity.nr + 
h->blocks.nr > EC_STRIPE_MAX); lockdep_assert_held(&h->lock); s = kzalloc(sizeof(*s), GFP_KERNEL); @@ -1052,11 +1092,9 @@ int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h) atomic_set(&s->pin, 1); s->c = c; s->h = h; - s->blocks = h->blocks; - s->parity = h->parity; - - memset(&h->blocks, 0, sizeof(h->blocks)); - memset(&h->parity, 0, sizeof(h->parity)); + s->nr_data = min_t(unsigned, h->nr_active_devs, + EC_STRIPE_MAX) - h->redundancy; + s->nr_parity = h->redundancy; bch2_keylist_init(&s->keys, s->inline_keys); @@ -1064,9 +1102,8 @@ int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h) s->stripe.size = h->blocksize; memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid)); - ec_stripe_key_init(c, &s->stripe.key, - &s->blocks, &s->parity, - h->blocksize); + ec_stripe_key_init(c, &s->stripe.key, s->nr_data, + s->nr_parity, h->blocksize); for (i = 0; i < s->stripe.key.v.nr_blocks; i++) { s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL); @@ -1098,14 +1135,13 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, mutex_init(&h->lock); mutex_lock(&h->lock); - INIT_LIST_HEAD(&h->stripes); h->target = target; h->algo = algo; h->redundancy = redundancy; rcu_read_lock(); - h->devs = target_rw_devs(c, BCH_DATA_USER, target); + h->devs = target_rw_devs(c, BCH_DATA_user, target); for_each_member_device_rcu(ca, c, i, &h->devs) if (!ca->mi.durability) @@ -1118,26 +1154,22 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, h->nr_active_devs++; rcu_read_unlock(); - list_add(&h->list, &c->ec_new_stripe_list); + list_add(&h->list, &c->ec_stripe_head_list); return h; } -void bch2_ec_stripe_head_put(struct ec_stripe_head *h) +void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) { - struct ec_stripe_new *s = NULL; - if (h->s && + h->s->allocated && bitmap_weight(h->s->blocks_allocated, h->s->blocks.nr) == h->s->blocks.nr) - s = ec_stripe_set_pending(h); + ec_stripe_set_pending(c, h); mutex_unlock(&h->lock); - - if (s) - ec_stripe_new_put(s); } -struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, +struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c, unsigned target, unsigned algo, unsigned redundancy) @@ -1147,8 +1179,8 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, if (!redundancy) return NULL; - mutex_lock(&c->ec_new_stripe_lock); - list_for_each_entry(h, &c->ec_new_stripe_list, list) + mutex_lock(&c->ec_stripe_head_lock); + list_for_each_entry(h, &c->ec_stripe_head_list, list) if (h->target == target && h->algo == algo && h->redundancy == redundancy) { @@ -1158,7 +1190,196 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, h = ec_new_stripe_head_alloc(c, target, algo, redundancy); found: - mutex_unlock(&c->ec_new_stripe_lock); + mutex_unlock(&c->ec_stripe_head_lock); + return h; +} + +/* + * XXX: use a higher watermark for allocating open buckets here: + */ +static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h) +{ + struct bch_devs_mask devs; + struct open_bucket *ob; + unsigned i, nr_have, nr_data = + min_t(unsigned, h->nr_active_devs, + EC_STRIPE_MAX) - h->redundancy; + bool have_cache = true; + int ret = 0; + + devs = h->devs; + + for_each_set_bit(i, h->s->blocks_allocated, EC_STRIPE_MAX) { + __clear_bit(h->s->stripe.key.v.ptrs[i].dev, devs.d); + --nr_data; + } + + BUG_ON(h->s->blocks.nr > nr_data); + BUG_ON(h->s->parity.nr > h->redundancy); + + open_bucket_for_each(c, &h->s->parity, ob, i) + __clear_bit(ob->ptr.dev, 
devs.d); + open_bucket_for_each(c, &h->s->blocks, ob, i) + __clear_bit(ob->ptr.dev, devs.d); + + percpu_down_read(&c->mark_lock); + rcu_read_lock(); + + if (h->s->parity.nr < h->redundancy) { + nr_have = h->s->parity.nr; + + ret = bch2_bucket_alloc_set(c, &h->s->parity, + &h->parity_stripe, + &devs, + h->redundancy, + &nr_have, + &have_cache, + RESERVE_NONE, + 0, + NULL); + if (ret) + goto err; + } + + if (h->s->blocks.nr < nr_data) { + nr_have = h->s->blocks.nr; + + ret = bch2_bucket_alloc_set(c, &h->s->blocks, + &h->block_stripe, + &devs, + nr_data, + &nr_have, + &have_cache, + RESERVE_NONE, + 0, + NULL); + if (ret) + goto err; + } +err: + rcu_read_unlock(); + percpu_up_read(&c->mark_lock); + return ret; +} + +/* XXX: doesn't obey target: */ +static s64 get_existing_stripe(struct bch_fs *c, + unsigned target, + unsigned algo, + unsigned redundancy) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + struct stripe *m; + size_t heap_idx; + u64 stripe_idx; + + if (may_create_new_stripe(c)) + return -1; + + spin_lock(&c->ec_stripes_heap_lock); + for (heap_idx = 0; heap_idx < h->used; heap_idx++) { + if (!h->data[heap_idx].blocks_nonempty) + continue; + + stripe_idx = h->data[heap_idx].idx; + m = genradix_ptr(&c->stripes[0], stripe_idx); + + if (m->algorithm == algo && + m->nr_redundant == redundancy && + m->blocks_nonempty < m->nr_blocks - m->nr_redundant) { + bch2_stripes_heap_del(c, m, stripe_idx); + spin_unlock(&c->ec_stripes_heap_lock); + return stripe_idx; + } + } + + spin_unlock(&c->ec_stripes_heap_lock); + return -1; +} + +static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) +{ + struct btree_trans trans; + struct btree_iter *iter; + struct bkey_s_c k; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (!ret) + bkey_reassemble(&stripe->key.k_i, k); + bch2_trans_exit(&trans); + + return ret; +} + +struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, + unsigned target, + unsigned algo, + unsigned redundancy) +{ + struct closure cl; + struct ec_stripe_head *h; + struct open_bucket *ob; + unsigned i, data_idx = 0; + s64 idx; + + closure_init_stack(&cl); + + h = __bch2_ec_stripe_head_get(c, target, algo, redundancy); + if (!h) + return NULL; + + if (!h->s && ec_new_stripe_alloc(c, h)) { + bch2_ec_stripe_head_put(c, h); + return NULL; + } + + if (!h->s->allocated) { + if (!h->s->existing_stripe && + (idx = get_existing_stripe(c, target, algo, redundancy)) >= 0) { + //pr_info("got existing stripe %llu", idx); + + h->s->existing_stripe = true; + h->s->existing_stripe_idx = idx; + if (get_stripe_key(c, idx, &h->s->stripe)) { + /* btree error */ + BUG(); + } + + for (i = 0; i < h->s->stripe.key.v.nr_blocks; i++) + if (stripe_blockcount_get(&h->s->stripe.key.v, i)) { + __set_bit(i, h->s->blocks_allocated); + ec_block_io(c, &h->s->stripe, READ, i, &cl); + } + } + + if (new_stripe_alloc_buckets(c, h)) { + bch2_ec_stripe_head_put(c, h); + h = NULL; + goto out; + } + + open_bucket_for_each(c, &h->s->blocks, ob, i) { + data_idx = find_next_zero_bit(h->s->blocks_allocated, + h->s->nr_data, data_idx); + BUG_ON(data_idx >= h->s->nr_data); + + h->s->stripe.key.v.ptrs[data_idx] = ob->ptr; + h->s->data_block_idx[i] = data_idx; + data_idx++; + } + + open_bucket_for_each(c, &h->s->parity, ob, i) + h->s->stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr; + + //pr_info("new stripe, blocks_allocated %lx", h->s->blocks_allocated[0]); 
+ h->s->allocated = true; + } +out: + closure_sync(&cl); return h; } @@ -1168,14 +1389,10 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) struct open_bucket *ob; unsigned i; - mutex_lock(&c->ec_new_stripe_lock); - list_for_each_entry(h, &c->ec_new_stripe_list, list) { - struct ec_stripe_new *s = NULL; + mutex_lock(&c->ec_stripe_head_lock); + list_for_each_entry(h, &c->ec_stripe_head_list, list) { mutex_lock(&h->lock); - bch2_open_buckets_stop_dev(c, ca, &h->blocks); - bch2_open_buckets_stop_dev(c, ca, &h->parity); - if (!h->s) goto unlock; @@ -1187,15 +1404,12 @@ void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) goto found; goto unlock; found: - h->s->err = -1; - s = ec_stripe_set_pending(h); + h->s->err = -EROFS; + ec_stripe_set_pending(c, h); unlock: mutex_unlock(&h->lock); - - if (s) - ec_stripe_new_put(s); } - mutex_unlock(&c->ec_new_stripe_lock); + mutex_unlock(&c->ec_stripe_head_lock); } static int __bch2_stripe_write_key(struct btree_trans *trans, @@ -1278,11 +1492,21 @@ static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, { int ret = 0; - if (k.k->type == KEY_TYPE_stripe) + if (k.k->type == KEY_TYPE_stripe) { + struct stripe *m; + ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: bch2_mark_key(c, k, 0, 0, NULL, 0, BTREE_TRIGGER_ALLOC_READ| BTREE_TRIGGER_NOATOMIC); + if (ret) + return ret; + + spin_lock(&c->ec_stripes_heap_lock); + m = genradix_ptr(&c->stripes[0], k.k->p.offset); + bch2_stripes_heap_insert(c, m, k.k->p.offset); + spin_unlock(&c->ec_stripes_heap_lock); + } return ret; } @@ -1333,25 +1557,73 @@ int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) return 0; } +void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + struct stripe *m; + size_t i; + + spin_lock(&c->ec_stripes_heap_lock); + for (i = 0; i < min(h->used, 20UL); i++) { + m = genradix_ptr(&c->stripes[0], h->data[i].idx); + + pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx, + h->data[i].blocks_nonempty, + m->nr_blocks - m->nr_redundant, + m->nr_redundant); + } + spin_unlock(&c->ec_stripes_heap_lock); +} + +void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct ec_stripe_head *h; + struct ec_stripe_new *s; + + mutex_lock(&c->ec_stripe_head_lock); + list_for_each_entry(h, &c->ec_stripe_head_list, list) { + pr_buf(out, "target %u algo %u redundancy %u:\n", + h->target, h->algo, h->redundancy); + + if (h->s) + pr_buf(out, "\tpending: blocks %u allocated %u\n", + h->s->blocks.nr, + bitmap_weight(h->s->blocks_allocated, + h->s->blocks.nr)); + } + mutex_unlock(&c->ec_stripe_head_lock); + + mutex_lock(&c->ec_stripe_new_lock); + list_for_each_entry(s, &c->ec_stripe_new_list, list) { + pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n", + s->blocks.nr, + bitmap_weight(s->blocks_allocated, + s->blocks.nr), + atomic_read(&s->pin)); + } + mutex_unlock(&c->ec_stripe_new_lock); +} + void bch2_fs_ec_exit(struct bch_fs *c) { struct ec_stripe_head *h; while (1) { - mutex_lock(&c->ec_new_stripe_lock); - h = list_first_entry_or_null(&c->ec_new_stripe_list, + mutex_lock(&c->ec_stripe_head_lock); + h = list_first_entry_or_null(&c->ec_stripe_head_list, struct ec_stripe_head, list); if (h) list_del(&h->list); - mutex_unlock(&c->ec_new_stripe_lock); + mutex_unlock(&c->ec_stripe_head_lock); if (!h) break; BUG_ON(h->s); - BUG_ON(!list_empty(&h->stripes)); kfree(h); } + BUG_ON(!list_empty(&c->ec_stripe_new_list)); + free_heap(&c->ec_stripes_heap); genradix_free(&c->stripes[0]); 
bioset_exit(&c->ec_bioset); @@ -1359,6 +1631,7 @@ void bch2_fs_ec_exit(struct bch_fs *c) int bch2_fs_ec_init(struct bch_fs *c) { + INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work); INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), diff --git a/libbcachefs/ec.h b/libbcachefs/ec.h index 4dfaac0..f8fc3d6 100644 --- a/libbcachefs/ec.h +++ b/libbcachefs/ec.h @@ -93,9 +93,17 @@ struct ec_stripe_new { int err; + u8 nr_data; + u8 nr_parity; + bool allocated; + bool pending; + bool existing_stripe; + u64 existing_stripe_idx; + unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)]; struct open_buckets blocks; + u8 data_block_idx[EC_STRIPE_MAX]; struct open_buckets parity; struct keylist keys; @@ -108,8 +116,6 @@ struct ec_stripe_head { struct list_head list; struct mutex lock; - struct list_head stripes; - unsigned target; unsigned algo; unsigned redundancy; @@ -122,9 +128,6 @@ struct ec_stripe_head { struct dev_stripe_state block_stripe; struct dev_stripe_state parity_stripe; - struct open_buckets blocks; - struct open_buckets parity; - struct ec_stripe_new *s; }; @@ -139,7 +142,7 @@ void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); -void bch2_ec_stripe_head_put(struct ec_stripe_head *); +void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned, unsigned, unsigned); @@ -157,6 +160,9 @@ int bch2_stripes_write(struct bch_fs *, unsigned, bool *); int bch2_ec_mem_alloc(struct bch_fs *, bool); +void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); +void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); + void bch2_fs_ec_exit(struct bch_fs *); int bch2_fs_ec_init(struct bch_fs *); diff --git a/libbcachefs/ec_types.h b/libbcachefs/ec_types.h index 5c3f77c..e4d633f 100644 --- a/libbcachefs/ec_types.h +++ b/libbcachefs/ec_types.h @@ -22,6 +22,7 @@ struct stripe { unsigned alive:1; unsigned dirty:1; + unsigned on_heap:1; u8 blocks_nonempty; u16 block_sectors[EC_STRIPE_MAX]; diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index 251d4af..568f039 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -179,11 +179,6 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) if (!percpu_down_read_trylock(&c->mark_lock)) return; - bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked_locked(c, k, false), c, - "btree key bad (replicas not marked in superblock):\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); - bkey_for_each_ptr(ptrs, ptr) { ca = bch_dev_bkey_exists(c, ptr->dev); @@ -194,7 +189,7 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) goto err; err = "inconsistent"; - if (mark.data_type != BCH_DATA_BTREE || + if (mark.data_type != BCH_DATA_btree || mark.dirty_sectors < c->opts.btree_node_size) goto err; } @@ -267,11 +262,6 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) if (!percpu_down_read_trylock(&c->mark_lock)) return; - bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, - "extent key bad (replicas not marked in superblock):\n%s", - (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); - extent_for_each_ptr_decode(e, p, entry) { struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); struct bucket_mark 
mark = ptr_bucket_mark(ca, &p.ptr); @@ -289,7 +279,7 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) "key too stale: %i", stale); bch2_fs_inconsistent_on(!stale && - (mark.data_type != BCH_DATA_USER || + (mark.data_type != BCH_DATA_user || mark_sectors < disk_sectors), c, "extent pointer not marked: %s:\n" "type %u sectors %u < %u", @@ -724,7 +714,7 @@ static unsigned bch2_extent_ptr_durability(struct bch_fs *c, if (WARN_ON(!s)) goto out; - durability = max_t(unsigned, durability, s->nr_redundant); + durability += s->nr_redundant; } out: return durability; diff --git a/libbcachefs/fs-io.c b/libbcachefs/fs-io.c index ec78e7b..5500499 100644 --- a/libbcachefs/fs-io.c +++ b/libbcachefs/fs-io.c @@ -1516,24 +1516,24 @@ retry_reservation: if (!pg_copied) break; + if (!PageUptodate(page) && + pg_copied != PAGE_SIZE && + pos + copied + pg_copied < inode->v.i_size) { + zero_user(page, 0, PAGE_SIZE); + break; + } + flush_dcache_page(page); iov_iter_advance(iter, pg_copied); copied += pg_copied; + + if (pg_copied != pg_len) + break; } if (!copied) goto out; - if (copied < len && - ((offset + copied) & (PAGE_SIZE - 1))) { - struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; - - if (!PageUptodate(page)) { - zero_user(page, 0, PAGE_SIZE); - copied -= (offset + copied) & (PAGE_SIZE - 1); - } - } - spin_lock(&inode->v.i_lock); if (pos + copied > inode->v.i_size) i_size_write(&inode->v, pos + copied); @@ -1630,6 +1630,7 @@ again: } pos += ret; written += ret; + ret = 0; balance_dirty_pages_ratelimited(mapping); } while (iov_iter_count(iter)); @@ -1818,7 +1819,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) while (1) { if (kthread) - use_mm(dio->mm); + kthread_use_mm(dio->mm); BUG_ON(current->faults_disabled_mapping); current->faults_disabled_mapping = mapping; @@ -1826,7 +1827,7 @@ static long bch2_dio_write_loop(struct dio_write *dio) current->faults_disabled_mapping = NULL; if (kthread) - unuse_mm(dio->mm); + kthread_unuse_mm(dio->mm); if (unlikely(ret < 0)) goto err; diff --git a/libbcachefs/fs-ioctl.c b/libbcachefs/fs-ioctl.c index 031e6d9..0873d2f 100644 --- a/libbcachefs/fs-ioctl.c +++ b/libbcachefs/fs-ioctl.c @@ -138,6 +138,10 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, if (fa.fsx_projid >= U32_MAX) return -EINVAL; + /* + * inode fields accessible via the xattr interface are stored with a +1 + * bias, so that 0 means unset: + */ s.projid = fa.fsx_projid + 1; ret = mnt_want_write_file(file); @@ -151,7 +155,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, } mutex_lock(&inode->ei_update_lock); - ret = bch2_set_projid(c, inode, s.projid); + ret = bch2_set_projid(c, inode, fa.fsx_projid); if (ret) goto err_unlock; diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index a47923d..121150b 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -25,6 +25,7 @@ #include <linux/aio.h> #include <linux/backing-dev.h> #include <linux/exportfs.h> +#include <linux/fiemap.h> #include <linux/module.h> #include <linux/posix_acl.h> #include <linux/random.h> @@ -860,6 +861,10 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bool have_extent = false; int ret = 0; + ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + if (start + len < start) return -EINVAL; @@ -1236,8 +1241,8 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = usage.capacity >> shift; buf->f_bfree = (usage.capacity - usage.used) >> shift; buf->f_bavail = buf->f_bfree; - buf->f_files = usage.nr_inodes; - buf->f_ffree = U64_MAX; + buf->f_files = 0; + buf->f_ffree = 0; fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ 
le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index c6ca596..5a6df3d 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -1265,6 +1265,8 @@ static int check_inode(struct btree_trans *trans, u.bi_inum))) { bch_verbose(c, "deleting inode %llu", u.bi_inum); + bch2_fs_lazy_rw(c); + ret = bch2_inode_rm(c, u.bi_inum); if (ret) bch_err(c, "error in fsck: error %i while deleting inode", ret); @@ -1277,6 +1279,8 @@ static int check_inode(struct btree_trans *trans, u.bi_inum))) { bch_verbose(c, "truncating inode %llu", u.bi_inum); + bch2_fs_lazy_rw(c); + /* * XXX: need to truncate partial blocks too here - or ideally * just switch units to bytes and that issue goes away diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 8c44105..5c9c3cf 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -31,9 +31,17 @@ #include #include +#include #include +const char *bch2_blk_status_to_str(blk_status_t status) +{ + if (status == BLK_STS_REMOVED) + return "device removed"; + return blk_status_to_str(status); +} + static bool bch2_target_congested(struct bch_fs *c, u16 target) { const struct bch_devs_mask *devs; @@ -46,7 +54,9 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target) return false; rcu_read_lock(); - devs = bch2_target_to_mask(c, target); + devs = bch2_target_to_mask(c, target) ?: + &c->rw_devs[BCH_DATA_user]; + for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { ca = rcu_dereference(c->devs[d]); if (!ca) @@ -463,7 +473,8 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, n->c = c; n->dev = ptr->dev; - n->have_ioref = bch2_dev_get_ioref(ca, WRITE); + n->have_ioref = bch2_dev_get_ioref(ca, + type == BCH_DATA_btree ? READ : WRITE); n->submit_time = local_clock(); n->bio.bi_iter.bi_sector = ptr->offset; @@ -611,7 +622,8 @@ static void bch2_write_endio(struct bio *bio) struct bch_fs *c = wbio->c; struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); - if (bch2_dev_io_err_on(bio->bi_status, ca, "data write")) + if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s", + bch2_blk_status_to_str(bio->bi_status))) set_bit(wbio->dev, op->failed.d); if (wbio->have_ioref) { @@ -1053,7 +1065,10 @@ static void __bch2_write(struct closure *cl) struct write_point *wp; struct bio *bio; bool skip_put = true; + unsigned nofs_flags; int ret; + + nofs_flags = memalloc_nofs_save(); again: memset(&op->failed, 0, sizeof(op->failed)); @@ -1079,6 +1094,11 @@ again: goto err; } + /* + * The copygc thread is now global, which means it's no longer + * freeing up space on specific disks, which means that + * allocations for specific disks may hang arbitrarily long: + */ wp = bch2_alloc_sectors_start(c, op->target, op->opts.erasure_code, @@ -1088,7 +1108,8 @@ again: op->nr_replicas_required, op->alloc_reserve, op->flags, - (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl); + (op->flags & (BCH_WRITE_ALLOC_NOWAIT| + BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? 
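	/*
	 * Why NULL here: per the comment above, copygc no longer frees up
	 * space on one specific device, so writes restricted to particular
	 * devices (or that asked for ALLOC_NOWAIT) must get an error back
	 * rather than sleep on the freelist indefinitely.
	 */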
NULL : cl);

	EBUG_ON(!wp);
	if (unlikely(IS_ERR(wp))) {
@@ -1100,6 +1121,16 @@ again:
 			goto flush_io;
 		}
 
+		/*
+		 * It's possible for the allocator to fail, put us on the
+		 * freelist waitlist, and then succeed in one of various retry
+		 * paths: if that happens, we need to disable the skip_put
+		 * optimization because otherwise there won't necessarily be a
+		 * barrier before we free the bch_write_op:
+		 */
+		if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
+			skip_put = false;
+
 		bch2_open_bucket_get(c, wp, &op->open_buckets);
 		ret = bch2_write_extent(op, wp, &bio);
 		bch2_alloc_sectors_done(c, wp);
@@ -1129,19 +1160,21 @@ again:
 		key_to_write = (void *) (op->insert_keys.keys_p +
 					 key_to_write_offset);
 
-		bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER,
+		bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user,
 					  key_to_write);
 	} while (ret);
 
 	if (!skip_put)
 		continue_at(cl, bch2_write_index, index_update_wq(op));
+out:
+	memalloc_nofs_restore(nofs_flags);
 	return;
 err:
 	op->error = ret;
 	op->flags |= BCH_WRITE_DONE;
 	continue_at(cl, bch2_write_index, index_update_wq(op));
-	return;
+	goto out;
 flush_io:
 	/*
 	 * If the write can't all be submitted at once, we generally want to
@@ -1152,7 +1185,7 @@ flush_io:
 	 */
 	if (current->flags & PF_WQ_WORKER) {
 		continue_at(cl, bch2_write_index, index_update_wq(op));
-		return;
+		goto out;
 	}
 
 	closure_sync(cl);
@@ -1163,7 +1196,7 @@ flush_io:
 	if (op->error) {
 		op->flags |= BCH_WRITE_DONE;
 		continue_at_nobarrier(cl, bch2_write_done, NULL);
-		return;
+		goto out;
 	}
 }
@@ -1921,7 +1954,8 @@ static void bch2_read_endio(struct bio *bio)
 	if (!rbio->split)
 		rbio->bio.bi_end_io = rbio->end_io;
 
-	if (bch2_dev_io_err_on(bio->bi_status, ca, "data read")) {
+	if (bch2_dev_io_err_on(bio->bi_status, ca, "data read: %s",
+			       bch2_blk_status_to_str(bio->bi_status))) {
 		bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status);
 		return;
 	}
@@ -2174,7 +2208,7 @@ get_bio:
 		goto out;
 	}
 
-	this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER],
+	this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user],
 		     bio_sectors(&rbio->bio));
 
 	bio_set_dev(&rbio->bio, ca->disk_sb.bdev);
diff --git a/libbcachefs/io.h b/libbcachefs/io.h
index 0ad293b..ded468d 100644
--- a/libbcachefs/io.h
+++ b/libbcachefs/io.h
@@ -22,6 +22,8 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *,
 
 #define BLK_STS_REMOVED		((__force blk_status_t)128)
 
+const char *bch2_blk_status_to_str(blk_status_t);
+
 enum bch_write_flags {
 	BCH_WRITE_ALLOC_NOWAIT		= (1 << 0),
 	BCH_WRITE_CACHED		= (1 << 1),
diff --git a/libbcachefs/io_types.h b/libbcachefs/io_types.h
index 684e4c9..b23727d 100644
--- a/libbcachefs/io_types.h
+++ b/libbcachefs/io_types.h
@@ -78,7 +78,6 @@ struct bch_write_bio {
 	u64			submit_time;
 
 	struct bch_devs_list	failed;
-	u8			order;
 	u8			dev;
 
 	unsigned		split:1,
diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c
index b4f7b61..210ad1b 100644
--- a/libbcachefs/journal.c
+++ b/libbcachefs/journal.c
@@ -847,7 +847,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
 		if (pos <= ja->cur_idx)
 			ja->cur_idx = (ja->cur_idx + 1) % ja->nr;
 
-		bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL,
+		bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal,
 					  ca->mi.bucket_size,
 					  gc_phase(GC_PHASE_SB), 0);
@@ -1135,9 +1135,8 @@ out:
 
 /* debug: */
 
-ssize_t bch2_journal_print_debug(struct journal *j, char *buf)
+void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
 {
-	struct printbuf out = _PBUF(buf, PAGE_SIZE);
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	union 
journal_res_state s; struct bch_dev *ca; @@ -1147,7 +1146,7 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) spin_lock(&j->lock); s = READ_ONCE(j->reservations); - pr_buf(&out, + pr_buf(out, "active journal entries:\t%llu\n" "seq:\t\t\t%llu\n" "last_seq:\t\t%llu\n" @@ -1165,44 +1164,44 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) switch (s.cur_entry_offset) { case JOURNAL_ENTRY_ERROR_VAL: - pr_buf(&out, "error\n"); + pr_buf(out, "error\n"); break; case JOURNAL_ENTRY_CLOSED_VAL: - pr_buf(&out, "closed\n"); + pr_buf(out, "closed\n"); break; default: - pr_buf(&out, "%u/%u\n", + pr_buf(out, "%u/%u\n", s.cur_entry_offset, j->cur_entry_u64s); break; } - pr_buf(&out, + pr_buf(out, "current entry refs:\t%u\n" "prev entry unwritten:\t", journal_state_count(s, s.idx)); if (s.prev_buf_unwritten) - pr_buf(&out, "yes, ref %u sectors %u\n", + pr_buf(out, "yes, ref %u sectors %u\n", journal_state_count(s, !s.idx), journal_prev_buf(j)->sectors); else - pr_buf(&out, "no\n"); + pr_buf(out, "no\n"); - pr_buf(&out, + pr_buf(out, "need write:\t\t%i\n" "replay done:\t\t%i\n", test_bit(JOURNAL_NEED_WRITE, &j->flags), test_bit(JOURNAL_REPLAY_DONE, &j->flags)); for_each_member_device_rcu(ca, c, iter, - &c->rw_devs[BCH_DATA_JOURNAL]) { + &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; if (!ja->nr) continue; - pr_buf(&out, + pr_buf(out, "dev %u:\n" "\tnr\t\t%u\n" "\tavailable\t%u:%u\n" @@ -1221,34 +1220,29 @@ ssize_t bch2_journal_print_debug(struct journal *j, char *buf) spin_unlock(&j->lock); rcu_read_unlock(); - - return out.pos - buf; } -ssize_t bch2_journal_print_pins(struct journal *j, char *buf) +void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); struct journal_entry_pin_list *pin_list; struct journal_entry_pin *pin; u64 i; spin_lock(&j->lock); fifo_for_each_entry_ptr(pin_list, &j->pin, i) { - pr_buf(&out, "%llu: count %u\n", + pr_buf(out, "%llu: count %u\n", i, atomic_read(&pin_list->count)); list_for_each_entry(pin, &pin_list->list, list) - pr_buf(&out, "\t%px %ps\n", + pr_buf(out, "\t%px %ps\n", pin, pin->flush); if (!list_empty(&pin_list->flushed)) - pr_buf(&out, "flushed:\n"); + pr_buf(out, "flushed:\n"); list_for_each_entry(pin, &pin_list->flushed, list) - pr_buf(&out, "\t%px %ps\n", + pr_buf(out, "\t%px %ps\n", pin, pin->flush); } spin_unlock(&j->lock); - - return out.pos - buf; } diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 30de6d9..5643884 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -499,8 +499,8 @@ static inline void bch2_journal_set_replay_done(struct journal *j) void bch2_journal_unblock(struct journal *); void bch2_journal_block(struct journal *); -ssize_t bch2_journal_print_debug(struct journal *, char *); -ssize_t bch2_journal_print_pins(struct journal *, char *); +void bch2_journal_debug_to_text(struct printbuf *, struct journal *); +void bch2_journal_pins_to_text(struct printbuf *, struct journal *); int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned nr); diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index b762528..bd0e6b3 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -6,6 +6,7 @@ #include "buckets.h" #include "checksum.h" #include "error.h" +#include "io.h" #include "journal.h" #include "journal_io.h" #include "journal_reclaim.h" @@ -28,9 +29,11 @@ struct journal_list { * be replayed: */ static int journal_entry_add(struct bch_fs *c, struct bch_dev 
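The hunks above are part of this patch's wider conversion from ssize_t-returning print helpers to *_to_text(struct printbuf *) functions. A minimal caller sketch, matching the pattern the sysfs.c hunks later in this patch actually use:

	struct printbuf out = _PBUF(buf, PAGE_SIZE);

	bch2_journal_debug_to_text(&out, &c->journal);
	return out.pos - buf;	/* bytes written into buf */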
*ca, - struct journal_list *jlist, struct jset *j) + struct journal_list *jlist, struct jset *j, + bool bad) { struct journal_replay *i, *pos; + struct bch_devs_list devs = { .nr = 0 }; struct list_head *where; size_t bytes = vstruct_bytes(j); __le64 last_seq; @@ -59,8 +62,31 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, } list_for_each_entry_reverse(i, jlist->head, list) { - /* Duplicate? */ - if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { + if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { + where = &i->list; + goto add; + } + } + + where = jlist->head; +add: + i = where->next != jlist->head + ? container_of(where->next, struct journal_replay, list) + : NULL; + + /* + * Duplicate journal entries? If so we want the one that didn't have a + * checksum error: + */ + if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { + if (i->bad) { + devs = i->devs; + list_del(&i->list); + kvpfree(i, offsetof(struct journal_replay, j) + + vstruct_bytes(&i->j)); + } else if (bad) { + goto found; + } else { fsck_err_on(bytes != vstruct_bytes(&i->j) || memcmp(j, &i->j, bytes), c, "found duplicate but non identical journal entries (seq %llu)", @@ -68,14 +94,8 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, goto found; } - if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { - where = &i->list; - goto add; - } } - where = jlist->head; -add: i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); if (!i) { ret = -ENOMEM; @@ -83,7 +103,8 @@ add: } list_add(&i->list, where); - i->devs.nr = 0; + i->devs = devs; + i->bad = bad; memcpy(&i->j, j, bytes); found: if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) @@ -390,6 +411,7 @@ fsck_err: } static int jset_validate(struct bch_fs *c, + struct bch_dev *ca, struct jset *jset, u64 sector, unsigned bucket_sectors_left, unsigned sectors_read, @@ -404,16 +426,19 @@ static int jset_validate(struct bch_fs *c, return JOURNAL_ENTRY_NONE; version = le32_to_cpu(jset->version); - if ((version != BCH_JSET_VERSION_OLD && - version < bcachefs_metadata_version_min) || - version >= bcachefs_metadata_version_max) { - bch_err(c, "unknown journal entry version %u", jset->version); - return BCH_FSCK_UNKNOWN_VERSION; + if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD && + version < bcachefs_metadata_version_min) || + version >= bcachefs_metadata_version_max, c, + "%s sector %llu seq %llu: unknown journal entry version %u", + ca->name, sector, le64_to_cpu(jset->seq), + version)) { + /* XXX: note we might have missing journal entries */ + return JOURNAL_ENTRY_BAD; } if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, - "journal entry too big (%zu bytes), sector %lluu", - bytes, sector)) { + "%s sector %llu seq %llu: journal entry too big (%zu bytes)", + ca->name, sector, le64_to_cpu(jset->seq), bytes)) { /* XXX: note we might have missing journal entries */ return JOURNAL_ENTRY_BAD; } @@ -422,13 +447,15 @@ static int jset_validate(struct bch_fs *c, return JOURNAL_ENTRY_REREAD; if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, - "journal entry with unknown csum type %llu sector %lluu", - JSET_CSUM_TYPE(jset), sector)) + "%s sector %llu seq %llu: journal entry with unknown csum type %llu", + ca->name, sector, le64_to_cpu(jset->seq), + JSET_CSUM_TYPE(jset))) return JOURNAL_ENTRY_BAD; csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, - "journal checksum bad, sector %llu", sector)) { + "%s sector %llu seq 
%llu: journal checksum bad", + ca->name, sector, le64_to_cpu(jset->seq))) { /* XXX: retry IO, when we start retrying checksum errors */ /* XXX: note we might have missing journal entries */ return JOURNAL_ENTRY_BAD; @@ -439,8 +466,10 @@ static int jset_validate(struct bch_fs *c, vstruct_end(jset) - (void *) jset->encrypted_start); if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, - "invalid journal entry: last_seq > seq")) + "invalid journal entry: last_seq > seq")) { jset->last_seq = jset->seq; + return JOURNAL_ENTRY_BAD; + } return 0; fsck_err: @@ -515,11 +544,12 @@ reread: j = buf->data; } - ret = jset_validate(c, j, offset, + ret = jset_validate(c, ca, j, offset, end - offset, sectors_read, READ); switch (ret) { case BCH_FSCK_OK: + sectors = vstruct_sectors(j, c->block_bits); break; case JOURNAL_ENTRY_REREAD: if (vstruct_bytes(j) > buf->size) { @@ -536,8 +566,13 @@ reread: goto next_block; case JOURNAL_ENTRY_BAD: saw_bad = true; + /* + * On checksum error we don't really trust the size + * field of the journal entry we read, so try reading + * again at next block boundary: + */ sectors = c->opts.block_size; - goto next_block; + break; default: return ret; } @@ -554,7 +589,7 @@ reread: ja->bucket_seq[bucket] = le64_to_cpu(j->seq); mutex_lock(&jlist->lock); - ret = journal_entry_add(c, ca, jlist, j); + ret = journal_entry_add(c, ca, jlist, j, ret != 0); mutex_unlock(&jlist->lock); switch (ret) { @@ -565,8 +600,6 @@ reread: default: return ret; } - - sectors = vstruct_sectors(j, c->block_bits); next_block: pr_debug("next"); offset += sectors; @@ -661,7 +694,7 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) for_each_member_device(ca, c, iter) { if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL))) + !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) continue; if ((ca->mi.state == BCH_MEMBER_STATE_RW || @@ -695,11 +728,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list) * the devices - this is wrong: */ - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs); + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, i->devs); if (!degraded && (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || - fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c, + fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, "superblock not marked as containing replicas %s", (bch2_replicas_entry_to_text(&PBUF(buf), &replicas.e), buf)))) { @@ -759,7 +792,7 @@ static void __journal_write_alloc(struct journal *j, sectors > ja->sectors_free) continue; - bch2_dev_stripe_increment(c, ca, &j->wp.stripe); + bch2_dev_stripe_increment(ca, &j->wp.stripe); bch2_bkey_append_ptr(&w->key, (struct bch_extent_ptr) { @@ -796,7 +829,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, rcu_read_lock(); devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, - &c->rw_devs[BCH_DATA_JOURNAL]); + &c->rw_devs[BCH_DATA_journal]); __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); @@ -914,7 +947,7 @@ static void journal_write_done(struct closure *cl) goto err; } - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, devs); + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs); if (bch2_mark_replicas(c, &replicas.e)) goto err; @@ -961,7 +994,8 @@ static void journal_write_endio(struct bio *bio) struct bch_dev *ca = bio->bi_private; struct journal *j = &ca->fs->journal; - if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write") || + if 
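The journal read changes above are the heart of this update. Two behaviors change: on duplicate sequence numbers, journal_entry_add() now prefers the copy without a checksum error (a bad existing copy is dropped but its devs list kept; when the new copy is the bad one, only the device is recorded), and a bad entry no longer aborts the bucket scan. A simplified sketch of the journal_read_bucket() loop after this patch (names abbreviated, surrounding code assumed):

	while (offset < end) {
		ret = jset_validate(c, ca, j, offset, ...);
		if (ret == BCH_FSCK_OK)
			sectors = vstruct_sectors(j, c->block_bits);
		else if (ret == JOURNAL_ENTRY_BAD)
			/* can't trust the entry's size field: */
			sectors = c->opts.block_size;

		/* bad entries are added too, flagged via ret != 0: */
		journal_entry_add(c, ca, jlist, j, ret != 0);
		offset += sectors;
	}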
(bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s", + bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("journal")) { struct journal_buf *w = journal_prev_buf(j); unsigned long flags; @@ -1105,7 +1139,7 @@ retry_alloc: continue; } - this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_JOURNAL], + this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], sectors); bio = ca->journal.bio; diff --git a/libbcachefs/journal_io.h b/libbcachefs/journal_io.h index 72e575f..6958ee0 100644 --- a/libbcachefs/journal_io.h +++ b/libbcachefs/journal_io.h @@ -9,6 +9,8 @@ struct journal_replay { struct list_head list; struct bch_devs_list devs; + /* checksum error, but we may want to try using it anyways: */ + bool bad; /* must be last: */ struct jset j; }; diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 4811ab9..5759198 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -70,7 +70,7 @@ static struct journal_space { rcu_read_lock(); for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_JOURNAL]) { + &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; unsigned buckets_this_device, sectors_this_device; @@ -139,7 +139,7 @@ void bch2_journal_space_available(struct journal *j) rcu_read_lock(); for_each_member_device_rcu(ca, c, i, - &c->rw_devs[BCH_DATA_JOURNAL]) { + &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; if (!ja->nr) @@ -618,7 +618,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) return ret; mutex_lock(&c->replicas_gc_lock); - bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL); + bch2_replicas_gc_start(c, 1 << BCH_DATA_journal); seq = 0; @@ -627,7 +627,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) struct bch_replicas_padded replicas; seq = max(seq, journal_last_seq(j)); - bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, journal_seq_pin(j, seq)->devs); seq++; diff --git a/libbcachefs/journal_seq_blacklist.c b/libbcachefs/journal_seq_blacklist.c index a21de00..d0f1bbf 100644 --- a/libbcachefs/journal_seq_blacklist.c +++ b/libbcachefs/journal_seq_blacklist.c @@ -36,15 +36,6 @@ * that bset, until that btree node is rewritten. */ -static unsigned -blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) -{ - return bl - ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) / - sizeof(struct journal_seq_blacklist_entry)) - : 0; -} - static unsigned sb_blacklist_u64s(unsigned nr) { struct bch_sb_field_journal_seq_blacklist *bl; diff --git a/libbcachefs/journal_seq_blacklist.h b/libbcachefs/journal_seq_blacklist.h index 03f4b97..afb886e 100644 --- a/libbcachefs/journal_seq_blacklist.h +++ b/libbcachefs/journal_seq_blacklist.h @@ -2,6 +2,15 @@ #ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H #define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H +static inline unsigned +blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) +{ + return bl + ? 
((vstruct_end(&bl->field) - (void *) &bl->start[0]) / + sizeof(struct journal_seq_blacklist_entry)) + : 0; +} + bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); int bch2_blacklist_table_initialize(struct bch_fs *); diff --git a/libbcachefs/move.c b/libbcachefs/move.c index b42350f..2f3be48 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -247,11 +247,15 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, m->op.target = data_opts.target, m->op.write_point = wp; - if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) + if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) { m->op.alloc_reserve = RESERVE_MOVINGGC; + m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; + } else { + /* XXX: this should probably be passed in */ + m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; + } - m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS| - BCH_WRITE_PAGES_STABLE| + m->op.flags |= BCH_WRITE_PAGES_STABLE| BCH_WRITE_PAGES_OWNED| BCH_WRITE_DATA_ENCODED| BCH_WRITE_FROM_INTERNAL; @@ -517,7 +521,7 @@ static int __bch2_move_data(struct bch_fs *c, bkey_on_stack_init(&sk); bch2_trans_init(&trans, c, 0, 0); - stats->data_type = BCH_DATA_USER; + stats->data_type = BCH_DATA_user; stats->btree_id = btree_id; stats->pos = POS_MIN; @@ -642,7 +646,7 @@ int bch2_move_data(struct bch_fs *c, INIT_LIST_HEAD(&ctxt.reads); init_waitqueue_head(&ctxt.wait); - stats->data_type = BCH_DATA_USER; + stats->data_type = BCH_DATA_user; ret = __bch2_move_data(c, &ctxt, rate, wp, start, end, pred, arg, stats, BTREE_ID_EXTENTS) ?: @@ -677,7 +681,7 @@ static int bch2_move_btree(struct bch_fs *c, bch2_trans_init(&trans, c, 0, 0); - stats->data_type = BCH_DATA_BTREE; + stats->data_type = BCH_DATA_btree; for (id = 0; id < BTREE_ID_NR; id++) { stats->btree_id = id; @@ -773,7 +777,7 @@ int bch2_data_job(struct bch_fs *c, switch (op.op) { case BCH_DATA_OP_REREPLICATE: - stats->data_type = BCH_DATA_JOURNAL; + stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, -1); ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; @@ -794,7 +798,7 @@ int bch2_data_job(struct bch_fs *c, if (op.migrate.dev >= c->sb.nr_devices) return -EINVAL; - stats->data_type = BCH_DATA_JOURNAL; + stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret; diff --git a/libbcachefs/movinggc.c b/libbcachefs/movinggc.c index 0a87cd7..de0a797 100644 --- a/libbcachefs/movinggc.c +++ b/libbcachefs/movinggc.c @@ -12,6 +12,7 @@ #include "buckets.h" #include "clock.h" #include "disk_groups.h" +#include "error.h" #include "extents.h" #include "eytzinger.h" #include "io.h" @@ -43,37 +44,27 @@ #define COPYGC_BUCKETS_PER_ITER(ca) \ ((ca)->free[RESERVE_MOVINGGC].size / 2) -/* - * Max sectors to move per iteration: Have to take into account internal - * fragmentation from the multiple write points for each generation: - */ -#define COPYGC_SECTORS_PER_ITER(ca) \ - ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca)) - -static inline int sectors_used_cmp(copygc_heap *heap, - struct copygc_heap_entry l, - struct copygc_heap_entry r) -{ - return cmp_int(l.sectors, r.sectors); -} - static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) { const struct copygc_heap_entry *l = _l; const struct copygc_heap_entry *r = _r; - return cmp_int(l->offset, r->offset); + return cmp_int(l->dev, r->dev) ?: + cmp_int(l->offset, 
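This comparator now sorts copygc heap entries by (dev, offset), since the heap spans all devices; the `cmp_int(...) ?: cmp_int(...)` chain relies on the GCC ?: extension, where a nonzero (decisive) first comparison short-circuits and a zero result falls through to the tiebreaker. Equivalent without the extension:

	int v = cmp_int(l->dev, r->dev);
	return v ? v : cmp_int(l->offset, r->offset);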
r->offset); } -static bool __copygc_pred(struct bch_dev *ca, - struct bkey_s_c k) +static int __copygc_pred(struct bch_fs *c, struct bkey_s_c k) { - copygc_heap *h = &ca->copygc_heap; - const struct bch_extent_ptr *ptr = - bch2_bkey_has_device(k, ca->dev_idx); - - if (ptr) { - struct copygc_heap_entry search = { .offset = ptr->offset }; + copygc_heap *h = &c->copygc_heap; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct copygc_heap_entry search = { + .dev = ptr->dev, + .offset = ptr->offset + }; ssize_t i = eytzinger0_find_le(h->data, h->used, sizeof(h->data[0]), @@ -89,12 +80,13 @@ static bool __copygc_pred(struct bch_dev *ca, BUG_ON(i != j); #endif - return (i >= 0 && - ptr->offset < h->data[i].offset + ca->mi.bucket_size && - ptr->gen == h->data[i].gen); + if (i >= 0 && + ptr->offset < h->data[i].offset + ca->mi.bucket_size && + ptr->gen == h->data[i].gen) + return ptr->dev; } - return false; + return -1; } static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, @@ -102,14 +94,13 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, struct bch_io_opts *io_opts, struct data_opts *data_opts) { - struct bch_dev *ca = arg; - - if (!__copygc_pred(ca, k)) + int dev_idx = __copygc_pred(c, k); + if (dev_idx < 0) return DATA_SKIP; - data_opts->target = dev_to_target(ca->dev_idx); + data_opts->target = io_opts->background_target; data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; - data_opts->rewrite_dev = ca->dev_idx; + data_opts->rewrite_dev = dev_idx; return DATA_REWRITE; } @@ -125,20 +116,28 @@ static bool have_copygc_reserve(struct bch_dev *ca) return ret; } -static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) +static inline int fragmentation_cmp(copygc_heap *heap, + struct copygc_heap_entry l, + struct copygc_heap_entry r) { - copygc_heap *h = &ca->copygc_heap; + return cmp_int(l.fragmentation, r.fragmentation); +} + +static int bch2_copygc(struct bch_fs *c) +{ + copygc_heap *h = &c->copygc_heap; struct copygc_heap_entry e, *i; struct bucket_array *buckets; struct bch_move_stats move_stats; u64 sectors_to_move = 0, sectors_not_moved = 0; + u64 sectors_reserved = 0; u64 buckets_to_move, buckets_not_moved = 0; - size_t b; + struct bch_dev *ca; + unsigned dev_idx; + size_t b, heap_size = 0; int ret; memset(&move_stats, 0, sizeof(move_stats)); - closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); - /* * Find buckets with lowest sector counts, skipping completely * empty buckets, by building a maxheap sorted by sector count, @@ -147,69 +146,99 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) */ h->used = 0; - /* - * We need bucket marks to be up to date - gc can't be recalculating - * them: - */ - down_read(&c->gc_lock); - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { - struct bucket_mark m = READ_ONCE(buckets->b[b].mark); - struct copygc_heap_entry e; - - if (m.owned_by_allocator || - m.data_type != BCH_DATA_USER || - !bucket_sectors_used(m) || - bucket_sectors_used(m) >= ca->mi.bucket_size) - continue; + for_each_rw_member(ca, c, dev_idx) + heap_size += ca->mi.nbuckets >> 7; - e = (struct copygc_heap_entry) { - .gen = m.gen, - .sectors = bucket_sectors_used(m), - .offset = bucket_to_sector(ca, b), - }; - heap_add_or_replace(h, e, -sectors_used_cmp, NULL); + if (h->size < heap_size) { + free_heap(&c->copygc_heap); + if 
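	/*
	 * Heap sizing: each rw device contributes one entry per 128
	 * buckets (nbuckets >> 7), e.g. 1 << 20 buckets -> 8192 entries;
	 * the heap is only reallocated when it has to grow:
	 */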
(!init_heap(&c->copygc_heap, heap_size, GFP_KERNEL)) { + bch_err(c, "error allocating copygc heap"); + return 0; + } + } + + for_each_rw_member(ca, c, dev_idx) { + closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); + + spin_lock(&ca->fs->freelist_lock); + sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size; + spin_unlock(&ca->fs->freelist_lock); + + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + + for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { + struct bucket_mark m = READ_ONCE(buckets->b[b].mark); + struct copygc_heap_entry e; + + if (m.owned_by_allocator || + m.data_type != BCH_DATA_user || + !bucket_sectors_used(m) || + bucket_sectors_used(m) >= ca->mi.bucket_size) + continue; + + e = (struct copygc_heap_entry) { + .dev = dev_idx, + .gen = m.gen, + .fragmentation = bucket_sectors_used(m) * (1U << 15) + / ca->mi.bucket_size, + .sectors = bucket_sectors_used(m), + .offset = bucket_to_sector(ca, b), + }; + heap_add_or_replace(h, e, -fragmentation_cmp, NULL); + } + up_read(&ca->bucket_lock); + } + + if (!sectors_reserved) { + bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!"); + return -1; } - up_read(&ca->bucket_lock); - up_read(&c->gc_lock); for (i = h->data; i < h->data + h->used; i++) sectors_to_move += i->sectors; - while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) { - BUG_ON(!heap_pop(h, e, -sectors_used_cmp, NULL)); + while (sectors_to_move > sectors_reserved) { + BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL)); sectors_to_move -= e.sectors; } buckets_to_move = h->used; if (!buckets_to_move) - return; + return 0; eytzinger0_sort(h->data, h->used, sizeof(h->data[0]), bucket_offset_cmp, NULL); - ret = bch2_move_data(c, &ca->copygc_pd.rate, - writepoint_ptr(&ca->copygc_write_point), + ret = bch2_move_data(c, &c->copygc_pd.rate, + writepoint_ptr(&c->copygc_write_point), POS_MIN, POS_MAX, - copygc_pred, ca, + copygc_pred, NULL, &move_stats); - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - for (i = h->data; i < h->data + h->used; i++) { - size_t b = sector_to_bucket(ca, i->offset); - struct bucket_mark m = READ_ONCE(buckets->b[b].mark); + for_each_rw_member(ca, c, dev_idx) { + down_read(&ca->bucket_lock); + buckets = bucket_array(ca); + for (i = h->data; i < h->data + h->used; i++) { + struct bucket_mark m; + size_t b; + + if (i->dev != dev_idx) + continue; - if (i->gen == m.gen && bucket_sectors_used(m)) { - sectors_not_moved += bucket_sectors_used(m); - buckets_not_moved++; + b = sector_to_bucket(ca, i->offset); + m = READ_ONCE(buckets->b[b].mark); + + if (i->gen == m.gen && + bucket_sectors_used(m)) { + sectors_not_moved += bucket_sectors_used(m); + buckets_not_moved++; + } } + up_read(&ca->bucket_lock); } - up_read(&ca->bucket_lock); if (sectors_not_moved && !ret) bch_warn_ratelimited(c, @@ -220,9 +249,10 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) atomic64_read(&move_stats.keys_raced), atomic64_read(&move_stats.sectors_raced)); - trace_copygc(ca, + trace_copygc(c, atomic64_read(&move_stats.sectors_moved), sectors_not_moved, buckets_to_move, buckets_not_moved); + return 0; } /* @@ -239,20 +269,27 @@ static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) * often and continually reduce the amount of fragmented space as the device * fills up. So, we increase the threshold by half the current free space. 
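 *
 * Illustrative numbers (not from this patch): with copygc_threshold at
 * 16GB and 200GB of free space summed over the rw devices, copygc is
 * held off until fragmented space reaches 16GB + 200GB/2 = 116GB.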
*/ -unsigned long bch2_copygc_wait_amount(struct bch_dev *ca) +unsigned long bch2_copygc_wait_amount(struct bch_fs *c) { - struct bch_fs *c = ca->fs; - struct bch_dev_usage usage = bch2_dev_usage_read(c, ca); - u64 fragmented_allowed = ca->copygc_threshold + - ((__dev_buckets_available(ca, usage) * ca->mi.bucket_size) >> 1); + struct bch_dev *ca; + unsigned dev_idx; + u64 fragmented_allowed = c->copygc_threshold; + u64 fragmented = 0; - return max_t(s64, 0, fragmented_allowed - usage.sectors_fragmented); + for_each_rw_member(ca, c, dev_idx) { + struct bch_dev_usage usage = bch2_dev_usage_read(ca); + + fragmented_allowed += ((__dev_buckets_available(ca, usage) * + ca->mi.bucket_size) >> 1); + fragmented += usage.sectors_fragmented; + } + + return max_t(s64, 0, fragmented_allowed - fragmented); } static int bch2_copygc_thread(void *arg) { - struct bch_dev *ca = arg; - struct bch_fs *c = ca->fs; + struct bch_fs *c = arg; struct io_clock *clock = &c->io_clock[WRITE]; unsigned long last, wait; @@ -263,7 +300,7 @@ static int bch2_copygc_thread(void *arg) break; last = atomic_long_read(&clock->now); - wait = bch2_copygc_wait_amount(ca); + wait = bch2_copygc_wait_amount(c); if (wait > clock->max_slop) { bch2_kthread_io_clock_wait(clock, last + wait, @@ -271,29 +308,30 @@ static int bch2_copygc_thread(void *arg) continue; } - bch2_copygc(c, ca); + if (bch2_copygc(c)) + break; } return 0; } -void bch2_copygc_stop(struct bch_dev *ca) +void bch2_copygc_stop(struct bch_fs *c) { - ca->copygc_pd.rate.rate = UINT_MAX; - bch2_ratelimit_reset(&ca->copygc_pd.rate); + c->copygc_pd.rate.rate = UINT_MAX; + bch2_ratelimit_reset(&c->copygc_pd.rate); - if (ca->copygc_thread) { - kthread_stop(ca->copygc_thread); - put_task_struct(ca->copygc_thread); + if (c->copygc_thread) { + kthread_stop(c->copygc_thread); + put_task_struct(c->copygc_thread); } - ca->copygc_thread = NULL; + c->copygc_thread = NULL; } -int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca) +int bch2_copygc_start(struct bch_fs *c) { struct task_struct *t; - if (ca->copygc_thread) + if (c->copygc_thread) return 0; if (c->opts.nochanges) @@ -302,21 +340,20 @@ int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca) if (bch2_fs_init_fault("copygc_start")) return -ENOMEM; - t = kthread_create(bch2_copygc_thread, ca, - "bch_copygc[%s]", ca->name); + t = kthread_create(bch2_copygc_thread, c, "bch_copygc"); if (IS_ERR(t)) return PTR_ERR(t); get_task_struct(t); - ca->copygc_thread = t; - wake_up_process(ca->copygc_thread); + c->copygc_thread = t; + wake_up_process(c->copygc_thread); return 0; } -void bch2_dev_copygc_init(struct bch_dev *ca) +void bch2_fs_copygc_init(struct bch_fs *c) { - bch2_pd_controller_init(&ca->copygc_pd); - ca->copygc_pd.d_term = 0; + bch2_pd_controller_init(&c->copygc_pd); + c->copygc_pd.d_term = 0; } diff --git a/libbcachefs/movinggc.h b/libbcachefs/movinggc.h index dcd4796..9227382 100644 --- a/libbcachefs/movinggc.h +++ b/libbcachefs/movinggc.h @@ -2,8 +2,8 @@ #ifndef _BCACHEFS_MOVINGGC_H #define _BCACHEFS_MOVINGGC_H -void bch2_copygc_stop(struct bch_dev *); -int bch2_copygc_start(struct bch_fs *, struct bch_dev *); -void bch2_dev_copygc_init(struct bch_dev *); +void bch2_copygc_stop(struct bch_fs *); +int bch2_copygc_start(struct bch_fs *); +void bch2_fs_copygc_init(struct bch_fs *); #endif /* _BCACHEFS_MOVINGGC_H */ diff --git a/libbcachefs/opts.c b/libbcachefs/opts.c index 94d6c04..afe25cd 100644 --- a/libbcachefs/opts.c +++ b/libbcachefs/opts.c @@ -45,12 +45,9 @@ const char * const bch2_str_hash_types[] = { }; const 
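The table below is now generated from the BCH_DATA_TYPES() x-macro rather than maintained by hand, which is also what makes the BCH_DATA_SB -> BCH_DATA_sb style renames throughout this patch mechanical. A sketch of the pattern; the real list lives in bcachefs_format.h, and the entries shown here are inferred from the renamed constants:

	#define BCH_DATA_TYPES()	\
		x(none,		0)	\
		x(sb,		1)	\
		x(journal,	2)	\
		x(btree,	3)	\
		x(user,		4)	\
		x(cached,	5)

	enum bch_data_type {
	#define x(t, n)	BCH_DATA_##t = n,
		BCH_DATA_TYPES()
	#undef x
		BCH_DATA_NR
	};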
char * const bch2_data_types[] = { - "none", - "sb", - "journal", - "btree", - "data", - "cached", +#define x(t, n) #t, + BCH_DATA_TYPES() +#undef x NULL }; diff --git a/libbcachefs/opts.h b/libbcachefs/opts.h index 3b051e7..014c608 100644 --- a/libbcachefs/opts.h +++ b/libbcachefs/opts.h @@ -83,7 +83,7 @@ enum opt_type { "size", NULL) \ x(btree_node_size, u16, \ OPT_FORMAT, \ - OPT_SECTORS(1, 128), \ + OPT_SECTORS(1, 512), \ BCH_SB_BTREE_NODE_SIZE, 512, \ "size", "Btree node size, default 256k") \ x(errors, u8, \ @@ -260,6 +260,11 @@ enum opt_type { OPT_BOOL(), \ NO_SB_OPT, false, \ NULL, "Don't replay the journal") \ + x(rebuild_replicas, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + NO_SB_OPT, false, \ + NULL, "Rebuild the superblock replicas section") \ x(keep_journal, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/libbcachefs/rebalance.c b/libbcachefs/rebalance.c index e15a2b1..56a1f76 100644 --- a/libbcachefs/rebalance.c +++ b/libbcachefs/rebalance.c @@ -249,45 +249,42 @@ static int bch2_rebalance_thread(void *arg) return 0; } -ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf) +void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); struct bch_fs_rebalance *r = &c->rebalance; struct rebalance_work w = rebalance_work(c); char h1[21], h2[21]; bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9); bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9); - pr_buf(&out, "fullest_dev (%i):\t%s/%s\n", + pr_buf(out, "fullest_dev (%i):\t%s/%s\n", w.dev_most_full_idx, h1, h2); bch2_hprint(&PBUF(h1), w.total_work << 9); bch2_hprint(&PBUF(h2), c->capacity << 9); - pr_buf(&out, "total work:\t\t%s/%s\n", h1, h2); + pr_buf(out, "total work:\t\t%s/%s\n", h1, h2); - pr_buf(&out, "rate:\t\t\t%u\n", r->pd.rate.rate); + pr_buf(out, "rate:\t\t\t%u\n", r->pd.rate.rate); switch (r->state) { case REBALANCE_WAITING: - pr_buf(&out, "waiting\n"); + pr_buf(out, "waiting\n"); break; case REBALANCE_THROTTLED: bch2_hprint(&PBUF(h1), (r->throttled_until_iotime - atomic_long_read(&c->io_clock[WRITE].now)) << 9); - pr_buf(&out, "throttled for %lu sec or %s io\n", + pr_buf(out, "throttled for %lu sec or %s io\n", (r->throttled_until_cputime - jiffies) / HZ, h1); break; case REBALANCE_RUNNING: - pr_buf(&out, "running\n"); - pr_buf(&out, "pos %llu:%llu\n", + pr_buf(out, "running\n"); + pr_buf(out, "pos %llu:%llu\n", r->move_stats.pos.inode, r->move_stats.pos.offset); break; } - - return out.pos - buf; } void bch2_rebalance_stop(struct bch_fs *c) diff --git a/libbcachefs/rebalance.h b/libbcachefs/rebalance.h index 99e2a1f..7ade0bb 100644 --- a/libbcachefs/rebalance.h +++ b/libbcachefs/rebalance.h @@ -19,7 +19,7 @@ void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c, struct bch_io_opts *); void bch2_rebalance_add_work(struct bch_fs *, u64); -ssize_t bch2_rebalance_work_show(struct bch_fs *, char *); +void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *); void bch2_rebalance_stop(struct bch_fs *); int bch2_rebalance_start(struct bch_fs *); diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index 41b864d..6e829bf 100644 --- a/libbcachefs/recovery.c +++ b/libbcachefs/recovery.c @@ -442,11 +442,18 @@ retry: * regular keys */ __bch2_btree_iter_set_pos(split_iter, split->k.p, false); - bch2_trans_update(&trans, split_iter, split, !remark - ? 
BTREE_TRIGGER_NORUN - : BTREE_TRIGGER_NOOVERWRITES); + bch2_trans_update(&trans, split_iter, split, + BTREE_TRIGGER_NORUN); bch2_btree_iter_set_pos(iter, split->k.p); + + if (remark) { + ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(split), + 0, split->k.size, + BTREE_TRIGGER_INSERT); + if (ret) + goto err; + } } while (bkey_cmp(iter->pos, k->k.p) < 0); if (remark) { @@ -967,7 +974,8 @@ int bch2_fs_recovery(struct bch_fs *c) bch_info(c, "recovering from clean shutdown, journal seq %llu", le64_to_cpu(clean->journal_seq)); - if (!c->replicas.entries) { + if (!c->replicas.entries || + c->opts.rebuild_replicas) { bch_info(c, "building replicas info"); set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); } @@ -1031,6 +1039,11 @@ int bch2_fs_recovery(struct bch_fs *c) } journal_seq += 4; + + /* + * The superblock needs to be written before we do any btree + * node writes: it will be in the read_write() path + */ } ret = bch2_blacklist_table_initialize(c); diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index 67a7128..6b6506c 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -113,16 +113,16 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e, switch (k.k->type) { case KEY_TYPE_btree_ptr: case KEY_TYPE_btree_ptr_v2: - e->data_type = BCH_DATA_BTREE; + e->data_type = BCH_DATA_btree; extent_to_replicas(k, e); break; case KEY_TYPE_extent: case KEY_TYPE_reflink_v: - e->data_type = BCH_DATA_USER; + e->data_type = BCH_DATA_user; extent_to_replicas(k, e); break; case KEY_TYPE_stripe: - e->data_type = BCH_DATA_USER; + e->data_type = BCH_DATA_user; stripe_to_replicas(k, e); break; } @@ -137,7 +137,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e, unsigned i; BUG_ON(!data_type || - data_type == BCH_DATA_SB || + data_type == BCH_DATA_sb || data_type >= BCH_DATA_NR); e->data_type = data_type; @@ -213,29 +213,20 @@ static bool __replicas_has_entry(struct bch_replicas_cpu *r, return __replicas_entry_idx(r, search) >= 0; } -static bool bch2_replicas_marked_locked(struct bch_fs *c, - struct bch_replicas_entry *search, - bool check_gc_replicas) +bool bch2_replicas_marked(struct bch_fs *c, + struct bch_replicas_entry *search) { + bool marked; + if (!search->nr_devs) return true; verify_replicas_entry(search); - return __replicas_has_entry(&c->replicas, search) && - (!check_gc_replicas || - likely((!c->replicas_gc.entries)) || - __replicas_has_entry(&c->replicas_gc, search)); -} - -bool bch2_replicas_marked(struct bch_fs *c, - struct bch_replicas_entry *search, - bool check_gc_replicas) -{ - bool marked; - percpu_down_read(&c->mark_lock); - marked = bch2_replicas_marked_locked(c, search, check_gc_replicas); + marked = __replicas_has_entry(&c->replicas, search) && + (likely((!c->replicas_gc.entries)) || + __replicas_has_entry(&c->replicas_gc, search)); percpu_up_read(&c->mark_lock); return marked; @@ -423,66 +414,50 @@ err: goto out; } -int bch2_mark_replicas(struct bch_fs *c, - struct bch_replicas_entry *r) +static int __bch2_mark_replicas(struct bch_fs *c, + struct bch_replicas_entry *r, + bool check) { - return likely(bch2_replicas_marked(c, r, true)) - ? 0 + return likely(bch2_replicas_marked(c, r)) ? 0 + : check ? 
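	/*
	 * check == true turns the mark slowpath into a -1 "not marked"
	 * result, so the marked-check and mark paths can share one helper;
	 * bch2_bkey_replicas_marked() below simply tests for a 0 return.
	 */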
-1 : bch2_mark_replicas_slowpath(c, r); } -bool bch2_bkey_replicas_marked_locked(struct bch_fs *c, - struct bkey_s_c k, - bool check_gc_replicas) +int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) +{ + return __bch2_mark_replicas(c, r, false); +} + +static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k, + bool check) { struct bch_replicas_padded search; struct bch_devs_list cached = bch2_bkey_cached_devs(k); unsigned i; + int ret; for (i = 0; i < cached.nr; i++) { bch2_replicas_entry_cached(&search.e, cached.devs[i]); - if (!bch2_replicas_marked_locked(c, &search.e, - check_gc_replicas)) - return false; + ret = __bch2_mark_replicas(c, &search.e, check); + if (ret) + return ret; } bch2_bkey_to_replicas(&search.e, k); - return bch2_replicas_marked_locked(c, &search.e, check_gc_replicas); + return __bch2_mark_replicas(c, &search.e, check); } bool bch2_bkey_replicas_marked(struct bch_fs *c, - struct bkey_s_c k, - bool check_gc_replicas) + struct bkey_s_c k) { - bool marked; - - percpu_down_read(&c->mark_lock); - marked = bch2_bkey_replicas_marked_locked(c, k, check_gc_replicas); - percpu_up_read(&c->mark_lock); - - return marked; + return __bch2_mark_bkey_replicas(c, k, true) == 0; } int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) { - struct bch_replicas_padded search; - struct bch_devs_list cached = bch2_bkey_cached_devs(k); - unsigned i; - int ret; - - for (i = 0; i < cached.nr; i++) { - bch2_replicas_entry_cached(&search.e, cached.devs[i]); - - ret = bch2_mark_replicas(c, &search.e); - if (ret) - return ret; - } - - bch2_bkey_to_replicas(&search.e, k); - - return bch2_mark_replicas(c, &search.e); + return __bch2_mark_bkey_replicas(c, k, false); } int bch2_replicas_gc_end(struct bch_fs *c, int ret) @@ -611,7 +586,7 @@ retry: struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); - if (e->data_type == BCH_DATA_JOURNAL || + if (e->data_type == BCH_DATA_journal || c->usage_base->replicas[i] || percpu_u64_get(&c->usage[0]->replicas[i]) || percpu_u64_get(&c->usage[1]->replicas[i])) @@ -1037,13 +1012,13 @@ static bool have_enough_devs(struct replicas_status s, bool bch2_have_enough_devs(struct replicas_status s, unsigned flags) { - return (have_enough_devs(s, BCH_DATA_JOURNAL, + return (have_enough_devs(s, BCH_DATA_journal, flags & BCH_FORCE_IF_METADATA_DEGRADED, flags & BCH_FORCE_IF_METADATA_LOST) && - have_enough_devs(s, BCH_DATA_BTREE, + have_enough_devs(s, BCH_DATA_btree, flags & BCH_FORCE_IF_METADATA_DEGRADED, flags & BCH_FORCE_IF_METADATA_LOST) && - have_enough_devs(s, BCH_DATA_USER, + have_enough_devs(s, BCH_DATA_user, flags & BCH_FORCE_IF_DATA_DEGRADED, flags & BCH_FORCE_IF_DATA_LOST)); } @@ -1053,9 +1028,9 @@ int bch2_replicas_online(struct bch_fs *c, bool meta) struct replicas_status s = bch2_replicas_status(c); return (meta - ? min(s.replicas[BCH_DATA_JOURNAL].redundancy, - s.replicas[BCH_DATA_BTREE].redundancy) - : s.replicas[BCH_DATA_USER].redundancy) + 1; + ? 
min(s.replicas[BCH_DATA_journal].redundancy, + s.replicas[BCH_DATA_btree].redundancy) + : s.replicas[BCH_DATA_user].redundancy) + 1; } unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) diff --git a/libbcachefs/replicas.h b/libbcachefs/replicas.h index 8527d82..8b95164 100644 --- a/libbcachefs/replicas.h +++ b/libbcachefs/replicas.h @@ -21,22 +21,18 @@ int bch2_replicas_entry_idx(struct bch_fs *, void bch2_devlist_to_replicas(struct bch_replicas_entry *, enum bch_data_type, struct bch_devs_list); -bool bch2_replicas_marked(struct bch_fs *, - struct bch_replicas_entry *, bool); +bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *); int bch2_mark_replicas(struct bch_fs *, struct bch_replicas_entry *); -bool bch2_bkey_replicas_marked_locked(struct bch_fs *, - struct bkey_s_c, bool); void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); -bool bch2_bkey_replicas_marked(struct bch_fs *, - struct bkey_s_c, bool); +bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c); int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, unsigned dev) { - e->data_type = BCH_DATA_CACHED; + e->data_type = BCH_DATA_cached; e->nr_devs = 1; e->nr_required = 1; e->devs[0] = dev; diff --git a/libbcachefs/super-io.c b/libbcachefs/super-io.c index f2be64c..cee6cc9 100644 --- a/libbcachefs/super-io.c +++ b/libbcachefs/super-io.c @@ -636,7 +636,8 @@ static void write_super_endio(struct bio *bio) /* XXX: return errors directly */ - if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write")) + if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write: %s", + bch2_blk_status_to_str(bio->bi_status))) ca->sb_write_error = 1; closure_put(&ca->fs->sb_write); @@ -656,7 +657,7 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca) bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC|REQ_META); bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); - this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_SB], + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio)); percpu_ref_get(&ca->io_ref); @@ -684,7 +685,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) roundup((size_t) vstruct_bytes(sb), bdev_logical_block_size(ca->disk_sb.bdev))); - this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_SB], + this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], bio_sectors(bio)); percpu_ref_get(&ca->io_ref); diff --git a/libbcachefs/super.c b/libbcachefs/super.c index 0cdf285..94288fc 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -169,10 +169,9 @@ int bch2_congested(void *data, int bdi_bits) } } } else { - unsigned target = READ_ONCE(c->opts.foreground_target); - const struct bch_devs_mask *devs = target - ? 
bch2_target_to_mask(c, target) - : &c->rw_devs[BCH_DATA_USER]; + const struct bch_devs_mask *devs = + bch2_target_to_mask(c, c->opts.foreground_target) ?: + &c->rw_devs[BCH_DATA_user]; for_each_member_device_rcu(ca, c, i, devs) { bdi = ca->disk_sb.bdev->bd_bdi; @@ -213,10 +212,7 @@ static void __bch2_fs_read_only(struct bch_fs *c) int ret; bch2_rebalance_stop(c); - - for_each_member_device(ca, c, i) - bch2_copygc_stop(ca); - + bch2_copygc_stop(c); bch2_gc_thread_stop(c); /* @@ -387,8 +383,8 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c) { bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); - bch2_fs_read_only_async(c); bch2_journal_halt(&c->journal); + bch2_fs_read_only_async(c); wake_up(&bch_read_only_wait); return ret; @@ -396,8 +392,6 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c) static int bch2_fs_read_write_late(struct bch_fs *c) { - struct bch_dev *ca; - unsigned i; int ret; ret = bch2_gc_thread_start(c); @@ -406,13 +400,10 @@ static int bch2_fs_read_write_late(struct bch_fs *c) return ret; } - for_each_rw_member(ca, c, i) { - ret = bch2_copygc_start(c, ca); - if (ret) { - bch_err(c, "error starting copygc threads"); - percpu_ref_put(&ca->io_ref); - return ret; - } + ret = bch2_copygc_start(c); + if (ret) { + bch_err(c, "error starting copygc thread"); + return ret; } ret = bch2_rebalance_start(c); @@ -450,6 +441,13 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) if (ret) goto err; + /* + * We need to write out a journal entry before we start doing btree + * updates, to ensure that on unclean shutdown new journal blacklist + * entries are created: + */ + bch2_journal_meta(&c->journal); + clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags); for_each_rw_member(ca, c, i) @@ -535,6 +533,7 @@ static void bch2_fs_free(struct bch_fs *c) kfree(c->replicas_gc.entries); kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); + free_heap(&c->copygc_heap); if (c->journal_reclaim_wq) destroy_workqueue(c->journal_reclaim_wq); @@ -684,6 +683,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); + bch2_fs_copygc_init(c); bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); bch2_fs_allocator_background_init(c); bch2_fs_allocator_foreground_init(c); @@ -708,9 +708,12 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) INIT_LIST_HEAD(&c->fsck_errors); mutex_init(&c->fsck_error_lock); - INIT_LIST_HEAD(&c->ec_new_stripe_list); - mutex_init(&c->ec_new_stripe_lock); - mutex_init(&c->ec_stripe_create_lock); + INIT_LIST_HEAD(&c->ec_stripe_head_list); + mutex_init(&c->ec_stripe_head_lock); + + INIT_LIST_HEAD(&c->ec_stripe_new_list); + mutex_init(&c->ec_stripe_new_lock); + spin_lock_init(&c->ec_stripes_heap_lock); seqcount_init(&c->gc_pos_lock); @@ -1108,10 +1111,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, init_rwsem(&ca->bucket_lock); - writepoint_init(&ca->copygc_write_point, BCH_DATA_USER); - - bch2_dev_copygc_init(ca); - INIT_WORK(&ca->io_error_work, bch2_io_error_work); bch2_time_stats_init(&ca->io_latency[READ]); @@ -1241,7 +1240,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) return ret; if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) && - !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_SB])) { + !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_sb])) { mutex_lock(&c->sb_lock); bch2_mark_dev_superblock(ca->fs, ca, 0); mutex_unlock(&c->sb_lock); @@ -1352,7 
+1351,11 @@ static bool bch2_fs_may_start(struct bch_fs *c)
 
 static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
 {
-	bch2_copygc_stop(ca);
+	/*
+	 * Device going read only means the copygc reserve gets smaller, so
+	 * we don't want that happening while copygc is in progress:
+	 */
+	bch2_copygc_stop(c);
 
 	/*
 	 * The allocator thread itself allocates btree nodes, so stop it first:
@@ -1360,6 +1363,8 @@ static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca)
 	bch2_dev_allocator_stop(ca);
 	bch2_dev_allocator_remove(c, ca);
 	bch2_dev_journal_stop(&c->journal, ca);
+
+	bch2_copygc_start(c);
 }
 
 static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
@@ -1374,9 +1379,6 @@ static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca)
 	if (bch2_dev_allocator_start(ca))
 		return "error starting allocator thread";
 
-	if (bch2_copygc_start(c, ca))
-		return "error starting copygc thread";
-
 	return NULL;
 }
diff --git a/libbcachefs/super.h b/libbcachefs/super.h
index 4aa5dd7..fffee96 100644
--- a/libbcachefs/super.h
+++ b/libbcachefs/super.h
@@ -222,6 +222,15 @@ void bch2_fs_read_only(struct bch_fs *);
 int bch2_fs_read_write(struct bch_fs *);
 int bch2_fs_read_write_early(struct bch_fs *);
 
+/*
+ * Only for use in the recovery/fsck path:
+ */
+static inline void bch2_fs_lazy_rw(struct bch_fs *c)
+{
+	if (percpu_ref_is_zero(&c->writes))
+		bch2_fs_read_write_early(c);
+}
+
 void bch2_fs_stop(struct bch_fs *);
 
 int bch2_fs_start(struct bch_fs *);
diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c
index c169d28..0cb29f4 100644
--- a/libbcachefs/sysfs.c
+++ b/libbcachefs/sysfs.c
@@ -75,7 +75,6 @@ do {									\
 #define sysfs_hprint(file, val)						\
 do {									\
 	if (attr == &sysfs_ ## file) {					\
-		struct printbuf out = _PBUF(buf, PAGE_SIZE);		\
 		bch2_hprint(&out, val);					\
 		pr_buf(&out, "\n");					\
 		return out.pos - buf;					\
@@ -168,6 +167,7 @@ read_attribute(btree_updates);
 read_attribute(dirty_btree_nodes);
 read_attribute(btree_key_cache);
 read_attribute(btree_transactions);
+read_attribute(stripes_heap);
 
 read_attribute(internal_uuid);
@@ -238,24 +238,22 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
 	return ret;
 }
 
-static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
+static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c)
 {
-	struct printbuf out = _PBUF(buf, PAGE_SIZE);
 	struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c);
 
 	if (!fs_usage)
 		return -ENOMEM;
 
-	bch2_fs_usage_to_text(&out, c, fs_usage);
+	bch2_fs_usage_to_text(out, c, fs_usage);
 
 	percpu_up_read(&c->mark_lock);
 
 	kfree(fs_usage);
-
-	return out.pos - buf;
+	return 0;
 }
 
-static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
+static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
 {
 	struct btree_trans trans;
 	struct btree_iter *iter;
@@ -298,59 +296,26 @@ static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf)
 	if (ret)
 		return ret;
 
-	return scnprintf(buf, PAGE_SIZE,
-			"uncompressed data:\n"
-			"	nr extents:			%llu\n"
-			"	size (bytes):			%llu\n"
-			"compressed data:\n"
-			"	nr extents:			%llu\n"
-			"	compressed size (bytes):	%llu\n"
-			"	uncompressed size (bytes):	%llu\n",
-			nr_uncompressed_extents,
-			uncompressed_sectors << 9,
-			nr_compressed_extents,
-			compressed_sectors_compressed << 9,
-			compressed_sectors_uncompressed << 9);
-}
-
-static ssize_t bch2_new_stripes(struct bch_fs *c, char *buf)
-{
-	char *out = buf, *end = buf + PAGE_SIZE;
-	struct ec_stripe_head *h;
-	struct ec_stripe_new *s;
-
-	mutex_lock(&c->ec_new_stripe_lock);
-
list_for_each_entry(h, &c->ec_new_stripe_list, list) { - out += scnprintf(out, end - out, - "target %u algo %u redundancy %u:\n", - h->target, h->algo, h->redundancy); - - if (h->s) - out += scnprintf(out, end - out, - "\tpending: blocks %u allocated %u\n", - h->s->blocks.nr, - bitmap_weight(h->s->blocks_allocated, - h->s->blocks.nr)); - - mutex_lock(&h->lock); - list_for_each_entry(s, &h->stripes, list) - out += scnprintf(out, end - out, - "\tin flight: blocks %u allocated %u pin %u\n", - s->blocks.nr, - bitmap_weight(s->blocks_allocated, - s->blocks.nr), - atomic_read(&s->pin)); - mutex_unlock(&h->lock); - - } - mutex_unlock(&c->ec_new_stripe_lock); - - return out - buf; + pr_buf(out, + "uncompressed data:\n" + " nr extents: %llu\n" + " size (bytes): %llu\n" + "compressed data:\n" + " nr extents: %llu\n" + " compressed size (bytes): %llu\n" + " uncompressed size (bytes): %llu\n", + nr_uncompressed_extents, + uncompressed_sectors << 9, + nr_compressed_extents, + compressed_sectors_compressed << 9, + compressed_sectors_uncompressed << 9); + return 0; } SHOW(bch2_fs) { struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); + struct printbuf out = _PBUF(buf, PAGE_SIZE); sysfs_print(minor, c->minor); sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); @@ -378,9 +343,12 @@ SHOW(bch2_fs) sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ + sysfs_pd_controller_show(copy_gc, &c->copygc_pd); - if (attr == &sysfs_rebalance_work) - return bch2_rebalance_work_show(c, buf); + if (attr == &sysfs_rebalance_work) { + bch2_rebalance_work_to_text(&out, c); + return out.pos - buf; + } sysfs_print(promote_whole_extents, c->promote_whole_extents); @@ -390,44 +358,61 @@ SHOW(bch2_fs) /* Debugging: */ if (attr == &sysfs_alloc_debug) - return show_fs_alloc_debug(c, buf); + return fs_alloc_debug_to_text(&out, c) ?: out.pos - buf; - if (attr == &sysfs_journal_debug) - return bch2_journal_print_debug(&c->journal, buf); + if (attr == &sysfs_journal_debug) { + bch2_journal_debug_to_text(&out, &c->journal); + return out.pos - buf; + } - if (attr == &sysfs_journal_pins) - return bch2_journal_print_pins(&c->journal, buf); + if (attr == &sysfs_journal_pins) { + bch2_journal_pins_to_text(&out, &c->journal); + return out.pos - buf; + } - if (attr == &sysfs_btree_updates) - return bch2_btree_updates_print(c, buf); + if (attr == &sysfs_btree_updates) { + bch2_btree_updates_to_text(&out, c); + return out.pos - buf; + } - if (attr == &sysfs_dirty_btree_nodes) - return bch2_dirty_btree_nodes_print(c, buf); + if (attr == &sysfs_dirty_btree_nodes) { + bch2_dirty_btree_nodes_to_text(&out, c); + return out.pos - buf; + } if (attr == &sysfs_btree_key_cache) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); - bch2_btree_key_cache_to_text(&out, &c->btree_key_cache); return out.pos - buf; } if (attr == &sysfs_btree_transactions) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); - bch2_btree_trans_to_text(&out, c); return out.pos - buf; } - if (attr == &sysfs_compression_stats) - return bch2_compression_stats(c, buf); + if (attr == &sysfs_stripes_heap) { + bch2_stripes_heap_to_text(&out, c); + return out.pos - buf; + } + + if (attr == &sysfs_compression_stats) { + bch2_compression_stats_to_text(&out, c); + return out.pos - buf; + } - if (attr == &sysfs_new_stripes) - return bch2_new_stripes(c, buf); + if (attr == &sysfs_new_stripes) { + bch2_new_stripes_to_text(&out, c); + return out.pos - buf; + } - if (attr == &sysfs_io_timers_read) - return 
@@ -452,14 +437,11 @@ STORE(bch2_fs)
 	}
 
 	if (attr == &sysfs_copy_gc_enabled) {
-		struct bch_dev *ca;
-		unsigned i;
 		ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled)
 			?: (ssize_t) size;
 
-		for_each_member_device(ca, c, i)
-			if (ca->copygc_thread)
-				wake_up_process(ca->copygc_thread);
+		if (c->copygc_thread)
+			wake_up_process(c->copygc_thread);
 
 		return ret;
 	}
@@ -474,6 +456,7 @@ STORE(bch2_fs)
 	sysfs_strtoul(pd_controllers_update_seconds,
 		      c->pd_controllers_update_seconds);
 	sysfs_pd_controller_store(rebalance,	&c->rebalance.pd);
+	sysfs_pd_controller_store(copy_gc,	&c->copygc_pd);
 
 	sysfs_strtoul(promote_whole_extents,	c->promote_whole_extents);
 
@@ -583,6 +566,7 @@ struct attribute *bch2_fs_internal_files[] = {
 	&sysfs_dirty_btree_nodes,
 	&sysfs_btree_key_cache,
 	&sysfs_btree_transactions,
+	&sysfs_stripes_heap,
 
 	&sysfs_read_realloc_races,
 	&sysfs_extent_migrate_done,
@@ -598,6 +582,7 @@ struct attribute *bch2_fs_internal_files[] = {
 	&sysfs_rebalance_enabled,
 	&sysfs_rebalance_work,
 	sysfs_pd_controller_files(rebalance),
+	sysfs_pd_controller_files(copy_gc),
 
 	&sysfs_new_stripes,
 
@@ -696,11 +681,13 @@ int bch2_opts_create_sysfs_files(struct kobject *kobj)
 SHOW(bch2_fs_time_stats)
 {
 	struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats);
+	struct printbuf out = _PBUF(buf, PAGE_SIZE);
 
-#define x(name)								\
-	if (attr == &sysfs_time_stat_##name)				\
-		return bch2_time_stats_print(&c->times[BCH_TIME_##name],\
-					     buf, PAGE_SIZE);
+#define x(name)								\
+	if (attr == &sysfs_time_stat_##name) {				\
+		bch2_time_stats_to_text(&out, &c->times[BCH_TIME_##name]);\
+		return out.pos - buf;					\
+	}
 	BCH_TIME_STATS()
 #undef x
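
For readers unfamiliar with the x-macro used in SHOW(bch2_fs_time_stats) above: BCH_TIME_STATS() expands x(name) once per tracked statistic, so a single #define stamps out one if-block per sysfs attribute while the list of statistics lives in exactly one place. A standalone sketch of the technique, with an invented two-entry list:

#include <stdio.h>
#include <string.h>

/* the list lives in one place; x() is expanded once per entry: */
#define EXAMPLE_TIME_STATS()	\
	x(btree_node_read)	\
	x(journal_write)

static int show_stat(const char *want)
{
	/* stamp out one comparison per statistic: */
#define x(name)					\
	if (!strcmp(want, #name)) {		\
		printf("stat: %s\n", #name);	\
		return 0;			\
	}
	EXAMPLE_TIME_STATS()
#undef x
	return -1;	/* unknown statistic */
}
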
@@ -753,13 +740,13 @@ static int unsigned_cmp(const void *_l, const void *_r)
 	return cmp_int(*l, *r);
 }
 
-static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca,
-			      char *buf, bucket_map_fn *fn, void *private)
+static int quantiles_to_text(struct printbuf *out,
+			     struct bch_fs *c, struct bch_dev *ca,
+			     bucket_map_fn *fn, void *private)
 {
 	size_t i, n;
 	/* Compute 31 quantiles */
 	unsigned q[31], *p;
-	ssize_t ret = 0;
 
 	down_read(&ca->bucket_lock);
 	n = ca->mi.nbuckets;
@@ -786,38 +773,33 @@ static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca,
 	vfree(p);
 
 	for (i = 0; i < ARRAY_SIZE(q); i++)
-		ret += scnprintf(buf + ret, PAGE_SIZE - ret,
-				 "%u ", q[i]);
-	buf[ret - 1] = '\n';
-
-	return ret;
+		pr_buf(out, "%u ", q[i]);
+	pr_buf(out, "\n");
+	return 0;
 }
 
-static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf)
+static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca)
 {
-	struct printbuf out = _PBUF(buf, PAGE_SIZE);
 	enum alloc_reserve i;
 
 	spin_lock(&ca->fs->freelist_lock);
 
-	pr_buf(&out, "free_inc:\t%zu\t%zu\n",
+	pr_buf(out, "free_inc:\t%zu\t%zu\n",
 	       fifo_used(&ca->free_inc),
 	       ca->free_inc.size);
 
 	for (i = 0; i < RESERVE_NR; i++)
-		pr_buf(&out, "free[%u]:\t%zu\t%zu\n", i,
+		pr_buf(out, "free[%u]:\t%zu\t%zu\n", i,
 		       fifo_used(&ca->free[i]),
 		       ca->free[i].size);
 
 	spin_unlock(&ca->fs->freelist_lock);
-
-	return out.pos - buf;
 }
 
-static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
+static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
 {
 	struct bch_fs *c = ca->fs;
-	struct bch_dev_usage stats = bch2_dev_usage_read(c, ca);
+	struct bch_dev_usage stats = bch2_dev_usage_read(ca);
 	unsigned i, nr[BCH_DATA_NR];
 
 	memset(nr, 0, sizeof(nr));
@@ -825,7 +807,7 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
 	for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++)
 		nr[c->open_buckets[i].type]++;
 
-	return scnprintf(buf, PAGE_SIZE,
+	pr_buf(out,
 	       "free_inc:               %zu/%zu\n"
 	       "free[RESERVE_BTREE]:    %zu/%zu\n"
 	       "free[RESERVE_MOVINGGC]: %zu/%zu\n"
@@ -861,27 +843,27 @@ static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf)
 	       fifo_used(&ca->free[RESERVE_NONE]),	ca->free[RESERVE_NONE].size,
 	       ca->mi.nbuckets - ca->mi.first_bucket,
 	       stats.buckets_alloc,
-	       stats.buckets[BCH_DATA_SB],
-	       stats.buckets[BCH_DATA_JOURNAL],
-	       stats.buckets[BCH_DATA_BTREE],
-	       stats.buckets[BCH_DATA_USER],
-	       stats.buckets[BCH_DATA_CACHED],
+	       stats.buckets[BCH_DATA_sb],
+	       stats.buckets[BCH_DATA_journal],
+	       stats.buckets[BCH_DATA_btree],
+	       stats.buckets[BCH_DATA_user],
+	       stats.buckets[BCH_DATA_cached],
 	       stats.buckets_ec,
-	       ca->mi.nbuckets - ca->mi.first_bucket - stats.buckets_unavailable,
-	       stats.sectors[BCH_DATA_SB],
-	       stats.sectors[BCH_DATA_JOURNAL],
-	       stats.sectors[BCH_DATA_BTREE],
-	       stats.sectors[BCH_DATA_USER],
-	       stats.sectors[BCH_DATA_CACHED],
+	       __dev_buckets_available(ca, stats),
+	       stats.sectors[BCH_DATA_sb],
+	       stats.sectors[BCH_DATA_journal],
+	       stats.sectors[BCH_DATA_btree],
+	       stats.sectors[BCH_DATA_user],
+	       stats.sectors[BCH_DATA_cached],
 	       stats.sectors_ec,
 	       stats.sectors_fragmented,
-	       ca->copygc_threshold,
+	       c->copygc_threshold,
 	       c->freelist_wait.list.first		? "waiting" : "empty",
 	       c->open_buckets_nr_free, OPEN_BUCKETS_COUNT,
 	       BTREE_NODE_OPEN_BUCKET_RESERVE,
 	       c->open_buckets_wait.list.first		? "waiting" : "empty",
-	       nr[BCH_DATA_BTREE],
-	       nr[BCH_DATA_USER],
+	       nr[BCH_DATA_btree],
+	       nr[BCH_DATA_user],
 	       c->btree_reserve_cache_nr);
 }
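
quantiles_to_text() above summarizes a per-bucket metric as 31 order statistics. Stripped of the bcachefs plumbing (the in-tree version snapshots the bucket array under ca->bucket_lock and uses the kernel's sort()), the technique is: copy the values, sort them, then sample at evenly spaced ranks. An illustrative userspace sketch:

#include <stdlib.h>

static int unsigned_cmp(const void *_l, const void *_r)
{
	const unsigned *l = _l, *r = _r;

	return (*l > *r) - (*l < *r);
}

/* q[i] approximates the (i + 1)/32 quantile of data[0..n): */
static void compute_quantiles(unsigned *data, size_t n, unsigned q[31])
{
	size_t i;

	if (!n)
		return;

	qsort(data, n, sizeof(*data), unsigned_cmp);

	for (i = 0; i < 31; i++)
		q[i] = data[n * (i + 1) / 32];
}
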
"waiting" : "empty", - nr[BCH_DATA_BTREE], - nr[BCH_DATA_USER], + nr[BCH_DATA_btree], + nr[BCH_DATA_user], c->btree_reserve_cache_nr); } @@ -891,21 +873,18 @@ static const char * const bch2_rw[] = { NULL }; -static ssize_t show_dev_iodone(struct bch_dev *ca, char *buf) +static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca) { - struct printbuf out = _PBUF(buf, PAGE_SIZE); int rw, i; for (rw = 0; rw < 2; rw++) { - pr_buf(&out, "%s:\n", bch2_rw[rw]); + pr_buf(out, "%s:\n", bch2_rw[rw]); for (i = 1; i < BCH_DATA_NR; i++) - pr_buf(&out, "%-12s:%12llu\n", + pr_buf(out, "%-12s:%12llu\n", bch2_data_types[i], percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); } - - return out.pos - buf; } SHOW(bch2_dev) @@ -942,8 +921,6 @@ SHOW(bch2_dev) return out.pos - buf; } - sysfs_pd_controller_show(copy_gc, &ca->copygc_pd); - if (attr == &sysfs_cache_replacement_policy) { bch2_string_opt_to_text(&out, bch2_cache_replacement_policies, @@ -959,34 +936,44 @@ SHOW(bch2_dev) return out.pos - buf; } - if (attr == &sysfs_iodone) - return show_dev_iodone(ca, buf); + if (attr == &sysfs_iodone) { + dev_iodone_to_text(&out, ca); + return out.pos - buf; + } sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); - if (attr == &sysfs_io_latency_stats_read) - return bch2_time_stats_print(&ca->io_latency[READ], buf, PAGE_SIZE); - if (attr == &sysfs_io_latency_stats_write) - return bch2_time_stats_print(&ca->io_latency[WRITE], buf, PAGE_SIZE); + if (attr == &sysfs_io_latency_stats_read) { + bch2_time_stats_to_text(&out, &ca->io_latency[READ]); + return out.pos - buf; + } + if (attr == &sysfs_io_latency_stats_write) { + bch2_time_stats_to_text(&out, &ca->io_latency[WRITE]); + return out.pos - buf; + } sysfs_printf(congested, "%u%%", clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) * 100 / CONGESTED_MAX); if (attr == &sysfs_bucket_quantiles_last_read) - return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 0); + return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 0) ?: out.pos - buf; if (attr == &sysfs_bucket_quantiles_last_write) - return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 1); + return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 1) ?: out.pos - buf; if (attr == &sysfs_bucket_quantiles_fragmentation) - return show_quantiles(c, ca, buf, bucket_sectors_used_fn, NULL); + return quantiles_to_text(&out, c, ca, bucket_sectors_used_fn, NULL) ?: out.pos - buf; if (attr == &sysfs_bucket_quantiles_oldest_gen) - return show_quantiles(c, ca, buf, bucket_oldest_gen_fn, NULL); + return quantiles_to_text(&out, c, ca, bucket_oldest_gen_fn, NULL) ?: out.pos - buf; - if (attr == &sysfs_reserve_stats) - return show_reserve_stats(ca, buf); - if (attr == &sysfs_alloc_debug) - return show_dev_alloc_debug(ca, buf); + if (attr == &sysfs_reserve_stats) { + reserve_stats_to_text(&out, ca); + return out.pos - buf; + } + if (attr == &sysfs_alloc_debug) { + dev_alloc_debug_to_text(&out, ca); + return out.pos - buf; + } return 0; } @@ -997,8 +984,6 @@ STORE(bch2_dev) struct bch_fs *c = ca->fs; struct bch_member *mi; - sysfs_pd_controller_store(copy_gc, &ca->copygc_pd); - if (attr == &sysfs_discard) { bool v = strtoul_or_return(buf); @@ -1083,8 +1068,6 @@ struct attribute *bch2_dev_files[] = { /* debug: */ &sysfs_alloc_debug, &sysfs_wake_allocator, - - sysfs_pd_controller_files(copy_gc), NULL }; diff --git a/libbcachefs/util.c b/libbcachefs/util.c index e69d03d..fd4044a 100644 --- 
diff --git a/libbcachefs/util.c b/libbcachefs/util.c
index e69d03d..fd4044a 100644
--- a/libbcachefs/util.c
+++ b/libbcachefs/util.c
@@ -318,43 +318,40 @@ static void pr_time_units(struct printbuf *out, u64 ns)
 	pr_buf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
 }
 
-size_t bch2_time_stats_print(struct time_stats *stats, char *buf, size_t len)
+void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats)
 {
-	struct printbuf out = _PBUF(buf, len);
 	const struct time_unit *u;
 	u64 freq = READ_ONCE(stats->average_frequency);
 	u64 q, last_q = 0;
 	int i;
 
-	pr_buf(&out, "count:\t\t%llu\n",
+	pr_buf(out, "count:\t\t%llu\n",
 	       stats->count);
-	pr_buf(&out, "rate:\t\t%llu/sec\n",
+	pr_buf(out, "rate:\t\t%llu/sec\n",
 	       freq ? div64_u64(NSEC_PER_SEC, freq) : 0);
 
-	pr_buf(&out, "frequency:\t");
-	pr_time_units(&out, freq);
+	pr_buf(out, "frequency:\t");
+	pr_time_units(out, freq);
 
-	pr_buf(&out, "\navg duration:\t");
-	pr_time_units(&out, stats->average_duration);
+	pr_buf(out, "\navg duration:\t");
+	pr_time_units(out, stats->average_duration);
 
-	pr_buf(&out, "\nmax duration:\t");
-	pr_time_units(&out, stats->max_duration);
+	pr_buf(out, "\nmax duration:\t");
+	pr_time_units(out, stats->max_duration);
 
 	i = eytzinger0_first(NR_QUANTILES);
 	u = pick_time_units(stats->quantiles.entries[i].m);
 
-	pr_buf(&out, "\nquantiles (%s):\t", u->name);
+	pr_buf(out, "\nquantiles (%s):\t", u->name);
 	eytzinger0_for_each(i, NR_QUANTILES) {
 		bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
 
 		q = max(stats->quantiles.entries[i].m, last_q);
-		pr_buf(&out, "%llu%s",
+		pr_buf(out, "%llu%s",
 		       div_u64(q, u->nsecs),
 		       is_last ? "\n" : " ");
 		last_q = q;
 	}
-
-	return out.pos - buf;
 }
 
 void bch2_time_stats_exit(struct time_stats *stats)
diff --git a/libbcachefs/util.h b/libbcachefs/util.h
index 0128dab..f48c638 100644
--- a/libbcachefs/util.h
+++ b/libbcachefs/util.h
@@ -99,7 +99,7 @@ static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
 {
 	return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
 					 get_order(size)) ?:
-		__vmalloc(size, gfp_mask, PAGE_KERNEL);
+		__vmalloc(size, gfp_mask);
 }
 
 static inline void kvpfree(void *p, size_t size)
@@ -398,7 +398,7 @@ static inline void bch2_time_stats_update(struct time_stats *stats, u64 start)
 	__bch2_time_stats_update(stats, start, local_clock());
 }
 
-size_t bch2_time_stats_print(struct time_stats *, char *, size_t);
+void bch2_time_stats_to_text(struct printbuf *, struct time_stats *);
 
 void bch2_time_stats_exit(struct time_stats *);
 void bch2_time_stats_init(struct time_stats *);
diff --git a/libbcachefs/xattr.c b/libbcachefs/xattr.c
index 725a6f3..21f64cb 100644
--- a/libbcachefs/xattr.c
+++ b/libbcachefs/xattr.c
@@ -511,7 +511,11 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
 	mutex_lock(&inode->ei_update_lock);
 
 	if (inode_opt_id == Inode_opt_project) {
-		ret = bch2_set_projid(c, inode, s.v);
+		/*
+		 * inode fields accessible via the xattr interface are stored
+		 * with a +1 bias, so that 0 means unset:
+		 */
+		ret = bch2_set_projid(c, inode, s.v ? s.v - 1 : 0);
 		if (ret)
 			goto err;
 	}
diff --git a/linux/bio.c b/linux/bio.c
index 797204f..8422c26 100644
--- a/linux/bio.c
+++ b/linux/bio.c
@@ -52,6 +52,15 @@ int blk_status_to_errno(blk_status_t status)
 	return blk_errors[idx].err;
 }
 
+const char *blk_status_to_str(blk_status_t status)
+{
+	int idx = (__force int)status;
+
+	if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
+		return "(invalid error)";
+	return blk_errors[idx].name;
+}
+
 void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
 			struct bio *src, struct bvec_iter *src_iter)
 {
-- 
2.39.2
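
A closing note on the blk_status_to_str() shim added to linux/bio.c above: paired with blk_status_to_errno(), it lets read error paths (the journal read improvements this update pulls in) report a symbolic block status instead of a raw number. A hypothetical caller, for illustration only; example_endio() is not part of this patch:

static void example_endio(struct bio *bio)
{
	if (bio->bi_status)
		fprintf(stderr, "journal read error: %s\n",
			blk_status_to_str(bio->bi_status));

	bio_put(bio);
}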