From: Kent Overstreet Date: Wed, 12 Dec 2018 11:21:55 +0000 (-0500) Subject: Update bcachefs sources to f7670cba39 bcachefs: Fix for building in userspace X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=a10a41fa2b1a917b0f3b34d20175867f968b2d12;p=bcachefs-tools-debian Update bcachefs sources to f7670cba39 bcachefs: Fix for building in userspace --- diff --git a/.bcachefs_revision b/.bcachefs_revision index 34a8011..0779c53 100644 --- a/.bcachefs_revision +++ b/.bcachefs_revision @@ -1 +1 @@ -62de7539dc2586b4bd7058b138de89f334d0c6bd +f7670cba39ead5fcc99da93b46024bd6355c0663 diff --git a/cmd_migrate.c b/cmd_migrate.c index 7863dec..8f46485 100644 --- a/cmd_migrate.c +++ b/cmd_migrate.c @@ -100,7 +100,6 @@ static void mark_unreserved_space(struct bch_fs *c, ranges extents) struct range i; for_each_hole(iter, extents, bucket_to_sector(ca, ca->mi.nbuckets) << 9, i) { - struct bucket_mark new; u64 b; if (i.start == i.end) @@ -108,8 +107,7 @@ static void mark_unreserved_space(struct bch_fs *c, ranges extents) b = sector_to_bucket(ca, i.start >> 9); do { - struct bucket *g = bucket(ca, b); - bucket_cmpxchg(g, new, new.nouse = 1); + set_bit(b, ca->buckets_nouse); b++; } while (bucket_to_sector(ca, b) << 9 < i.end); } @@ -339,7 +337,7 @@ static void link_data(struct bch_fs *c, struct bch_inode_unpacked *dst, .gen = bucket(ca, b)->mark.gen, }); - set_bit(b, ca->buckets_dirty); + bucket_set_dirty(ca, b); ret = bch2_disk_reservation_get(c, &res, sectors, 1, BCH_DISK_RESERVATION_NOFAIL); diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index 2e2fb99..955caa2 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -22,6 +22,13 @@ #include #include +static const char * const bch2_alloc_field_names[] = { +#define x(name, bytes) #name, + BCH_ALLOC_FIELDS() +#undef x + NULL +}; + static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); /* Ratelimiting/PD controllers */ @@ -61,14 +68,73 @@ static void pd_controllers_update(struct work_struct *work) /* Persistent alloc info: */ +static inline u64 get_alloc_field(const struct bch_alloc *a, + const void **p, unsigned field) +{ + unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; + u64 v; + + if (!(a->fields & (1 << field))) + return 0; + + switch (bytes) { + case 1: + v = *((const u8 *) *p); + break; + case 2: + v = le16_to_cpup(*p); + break; + case 4: + v = le32_to_cpup(*p); + break; + case 8: + v = le64_to_cpup(*p); + break; + default: + BUG(); + } + + *p += bytes; + return v; +} + +static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, + unsigned field, u64 v) +{ + unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; + + if (!v) + return; + + a->v.fields |= 1 << field; + + switch (bytes) { + case 1: + *((u8 *) *p) = v; + break; + case 2: + *((__le16 *) *p) = cpu_to_le16(v); + break; + case 4: + *((__le32 *) *p) = cpu_to_le32(v); + break; + case 8: + *((__le64 *) *p) = cpu_to_le64(v); + break; + default: + BUG(); + } + + *p += bytes; +} + static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) { - unsigned bytes = offsetof(struct bch_alloc, data); + unsigned i, bytes = offsetof(struct bch_alloc, data); - if (a->fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) - bytes += 2; - if (a->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) - bytes += 2; + for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++) + if (a->fields & (1 << i)) + bytes += BCH_ALLOC_FIELD_BYTES[i]; return DIV_ROUND_UP(bytes, sizeof(u64)); } @@ -92,58 +158,55 @@ void bch2_alloc_to_text(struct printbuf *out, struct 
bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); + const void *d = a.v->data; + unsigned i; pr_buf(out, "gen %u", a.v->gen); + + for (i = 0; i < BCH_ALLOC_FIELD_NR; i++) + if (a.v->fields & (1 << i)) + pr_buf(out, " %s %llu", + bch2_alloc_field_names[i], + get_alloc_field(a.v, &d, i)); } -static inline unsigned get_alloc_field(const u8 **p, unsigned bytes) +static void __alloc_read_key(struct bucket *g, const struct bch_alloc *a) { - unsigned v; - - switch (bytes) { - case 1: - v = **p; - break; - case 2: - v = le16_to_cpup((void *) *p); - break; - case 4: - v = le32_to_cpup((void *) *p); - break; - default: - BUG(); - } - - *p += bytes; - return v; + const void *d = a->data; + unsigned idx = 0; + + g->_mark.gen = a->gen; + g->gen_valid = 1; + g->io_time[READ] = get_alloc_field(a, &d, idx++); + g->io_time[WRITE] = get_alloc_field(a, &d, idx++); + g->_mark.data_type = get_alloc_field(a, &d, idx++); + g->_mark.dirty_sectors = get_alloc_field(a, &d, idx++); + g->_mark.cached_sectors = get_alloc_field(a, &d, idx++); } -static inline void put_alloc_field(u8 **p, unsigned bytes, unsigned v) +static void __alloc_write_key(struct bkey_i_alloc *a, struct bucket *g, + struct bucket_mark m) { - switch (bytes) { - case 1: - **p = v; - break; - case 2: - *((__le16 *) *p) = cpu_to_le16(v); - break; - case 4: - *((__le32 *) *p) = cpu_to_le32(v); - break; - default: - BUG(); - } + unsigned idx = 0; + void *d = a->v.data; - *p += bytes; + a->v.fields = 0; + a->v.gen = m.gen; + + d = a->v.data; + put_alloc_field(a, &d, idx++, g->io_time[READ]); + put_alloc_field(a, &d, idx++, g->io_time[WRITE]); + put_alloc_field(a, &d, idx++, m.data_type); + put_alloc_field(a, &d, idx++, m.dirty_sectors); + put_alloc_field(a, &d, idx++, m.cached_sectors); + + set_bkey_val_bytes(&a->k, (void *) d - (void *) &a->v); } static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k) { struct bch_dev *ca; struct bkey_s_c_alloc a; - struct bucket_mark new; - struct bucket *g; - const u8 *d; if (k.k->type != KEY_TYPE_alloc) return; @@ -154,21 +217,9 @@ static void bch2_alloc_read_key(struct bch_fs *c, struct bkey_s_c k) if (a.k->p.offset >= ca->mi.nbuckets) return; - percpu_down_read_preempt_disable(&c->usage_lock); - - g = bucket(ca, a.k->p.offset); - bucket_cmpxchg(g, new, ({ - new.gen = a.v->gen; - new.gen_valid = 1; - })); - - d = a.v->data; - if (a.v->fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) - g->io_time[READ] = get_alloc_field(&d, 2); - if (a.v->fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) - g->io_time[WRITE] = get_alloc_field(&d, 2); - - percpu_up_read_preempt_enable(&c->usage_lock); + percpu_down_read_preempt_disable(&c->mark_lock); + __alloc_read_key(bucket(ca, a.k->p.offset), a.v); + percpu_up_read_preempt_enable(&c->mark_lock); } int bch2_alloc_read(struct bch_fs *c, struct list_head *journal_replay_list) @@ -221,29 +272,20 @@ static int __bch2_alloc_write_key(struct bch_fs *c, struct bch_dev *ca, size_t b, struct btree_iter *iter, u64 *journal_seq, unsigned flags) { - struct bucket_mark m; - __BKEY_PADDED(k, DIV_ROUND_UP(sizeof(struct bch_alloc), 8)) alloc_key; + __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key; + struct bkey_i_alloc *a = bkey_alloc_init(&alloc_key.k); struct bucket *g; - struct bkey_i_alloc *a; + struct bucket_mark m; int ret; - u8 *d; - percpu_down_read_preempt_disable(&c->usage_lock); - g = bucket(ca, b); + a->k.p = POS(ca->dev_idx, b); - m = READ_ONCE(g->mark); - a = bkey_alloc_init(&alloc_key.k); - a->k.p = POS(ca->dev_idx, b); - a->v.fields = 0; - a->v.gen 
= m.gen; - set_bkey_val_u64s(&a->k, bch_alloc_val_u64s(&a->v)); + percpu_down_read_preempt_disable(&c->mark_lock); + g = bucket(ca, b); + m = bucket_cmpxchg(g, m, m.dirty = false); - d = a->v.data; - if (a->v.fields & (1 << BCH_ALLOC_FIELD_READ_TIME)) - put_alloc_field(&d, 2, g->io_time[READ]); - if (a->v.fields & (1 << BCH_ALLOC_FIELD_WRITE_TIME)) - put_alloc_field(&d, 2, g->io_time[WRITE]); - percpu_up_read_preempt_enable(&c->usage_lock); + __alloc_write_key(a, g, m); + percpu_up_read_preempt_enable(&c->mark_lock); bch2_btree_iter_cond_resched(iter); @@ -305,19 +347,24 @@ int bch2_alloc_write(struct bch_fs *c) for_each_rw_member(ca, c, i) { struct btree_iter iter; - unsigned long bucket; + struct bucket_array *buckets; + size_t b; bch2_btree_iter_init(&iter, c, BTREE_ID_ALLOC, POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); down_read(&ca->bucket_lock); - for_each_set_bit(bucket, ca->buckets_dirty, ca->mi.nbuckets) { - ret = __bch2_alloc_write_key(c, ca, bucket, - &iter, NULL, 0); + buckets = bucket_array(ca); + + for (b = buckets->first_bucket; + b < buckets->nbuckets; + b++) { + if (!buckets->b[b].mark.dirty) + continue; + + ret = __bch2_alloc_write_key(c, ca, b, &iter, NULL, 0); if (ret) break; - - clear_bit(bucket, ca->buckets_dirty); } up_read(&ca->bucket_lock); bch2_btree_iter_unlock(&iter); @@ -496,6 +543,10 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, if (!is_available_bucket(mark)) return false; + if (ca->buckets_nouse && + test_bit(bucket, ca->buckets_nouse)) + return false; + gc_gen = bucket_gc_gen(ca, bucket); if (gc_gen >= BUCKET_GC_GEN_MAX / 2) @@ -745,7 +796,7 @@ static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, { struct bucket_mark m; - percpu_down_read_preempt_disable(&c->usage_lock); + percpu_down_read_preempt_disable(&c->mark_lock); spin_lock(&c->freelist_lock); bch2_invalidate_bucket(c, ca, bucket, &m); @@ -758,7 +809,7 @@ static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, bucket_io_clock_reset(c, ca, bucket, READ); bucket_io_clock_reset(c, ca, bucket, WRITE); - percpu_up_read_preempt_enable(&c->usage_lock); + percpu_up_read_preempt_enable(&c->mark_lock); if (m.journal_seq_valid) { u64 journal_seq = atomic64_read(&c->journal.seq); @@ -1286,7 +1337,7 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) struct bucket_mark m; down_read(&ca->bucket_lock); - percpu_down_read_preempt_disable(&c->usage_lock); + percpu_down_read_preempt_disable(&c->mark_lock); buckets = bucket_array(ca); @@ -1294,7 +1345,8 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) bu < buckets->nbuckets; bu++) { m = READ_ONCE(buckets->b[bu].mark); - if (!m.gen_valid || + if (!buckets->b[bu].gen_valid || + !test_bit(bu, ca->buckets_nouse) || !is_available_bucket(m) || m.cached_sectors) continue; @@ -1309,7 +1361,7 @@ static int __bch2_fs_allocator_start(struct bch_fs *c) if (fifo_full(&ca->free[RESERVE_BTREE])) break; } - percpu_up_read_preempt_enable(&c->usage_lock); + percpu_up_read_preempt_enable(&c->mark_lock); up_read(&ca->bucket_lock); } @@ -1333,7 +1385,7 @@ not_enough: bch2_invalidate_one_bucket(c, ca, bu, &journal_seq); fifo_push(&ca->free[RESERVE_BTREE], bu); - set_bit(bu, ca->buckets_dirty); + bucket_set_dirty(ca, bu); } } diff --git a/libbcachefs/alloc_foreground.c b/libbcachefs/alloc_foreground.c index 5024e56..596d3bc 100644 --- a/libbcachefs/alloc_foreground.c +++ b/libbcachefs/alloc_foreground.c @@ -100,7 +100,7 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) return; } - 
percpu_down_read_preempt_disable(&c->usage_lock); + percpu_down_read_preempt_disable(&c->mark_lock); spin_lock(&ob->lock); bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), @@ -108,7 +108,7 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) ob->valid = false; spin_unlock(&ob->lock); - percpu_up_read_preempt_enable(&c->usage_lock); + percpu_up_read_preempt_enable(&c->mark_lock); spin_lock(&c->freelist_lock); ob->freelist = c->open_buckets_freelist; @@ -440,7 +440,7 @@ static int ec_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) open_bucket_for_each(c, &h->blocks, ob, i) __clear_bit(ob->ptr.dev, devs.d); - percpu_down_read_preempt_disable(&c->usage_lock); + percpu_down_read_preempt_disable(&c->mark_lock); rcu_read_lock(); if (h->parity.nr < h->redundancy) { @@ -476,12 +476,12 @@ static int ec_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) } rcu_read_unlock(); - percpu_up_read_preempt_enable(&c->usage_lock); + percpu_up_read_preempt_enable(&c->mark_lock); return bch2_ec_stripe_new_alloc(c, h); err: rcu_read_unlock(); - percpu_up_read_preempt_enable(&c->usage_lock); + percpu_up_read_preempt_enable(&c->mark_lock); return -1; } @@ -637,7 +637,7 @@ static int open_bucket_add_buckets(struct bch_fs *c, if (*nr_effective >= nr_replicas) return 0; - percpu_down_read_preempt_disable(&c->usage_lock); + percpu_down_read_preempt_disable(&c->mark_lock); rcu_read_lock(); retry_blocking: @@ -654,7 +654,7 @@ retry_blocking: } rcu_read_unlock(); - percpu_up_read_preempt_enable(&c->usage_lock); + percpu_up_read_preempt_enable(&c->mark_lock); return ret; } @@ -720,7 +720,7 @@ static struct write_point *__writepoint_find(struct hlist_head *head, static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) { u64 stranded = c->write_points_nr * c->bucket_size_max; - u64 free = bch2_fs_sectors_free(c, bch2_fs_usage_read(c)); + u64 free = bch2_fs_sectors_free(c); return stranded * factor > free; } diff --git a/libbcachefs/bcachefs.h b/libbcachefs/bcachefs.h index d69da3e..5149e6e 100644 --- a/libbcachefs/bcachefs.h +++ b/libbcachefs/bcachefs.h @@ -387,12 +387,12 @@ struct bch_dev { /* * Buckets: - * Per-bucket arrays are protected by c->usage_lock, bucket_lock and + * Per-bucket arrays are protected by c->mark_lock, bucket_lock and * gc_lock, for device resize - holding any is sufficient for access: * Or rcu_read_lock(), but only for ptr_stale(): */ struct bucket_array __rcu *buckets[2]; - unsigned long *buckets_dirty; + unsigned long *buckets_nouse; unsigned long *buckets_written; /* most out of date gen in the btree */ u8 *oldest_gens; @@ -500,6 +500,10 @@ enum bch_fs_state { BCH_FS_RW, }; +struct bch_fs_pcpu { + u64 sectors_available; +}; + struct bch_fs { struct closure cl; @@ -525,8 +529,8 @@ struct bch_fs { struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; - struct bch_replicas_cpu __rcu *replicas; - struct bch_replicas_cpu __rcu *replicas_gc; + struct bch_replicas_cpu replicas; + struct bch_replicas_cpu replicas_gc; struct mutex replicas_gc_lock; struct bch_disk_groups_cpu __rcu *disk_groups; @@ -612,11 +616,11 @@ struct bch_fs { atomic64_t sectors_available; - struct bch_fs_usage __percpu *usage[2]; + struct bch_fs_pcpu __percpu *pcpu; - struct percpu_rw_semaphore usage_lock; + struct bch_fs_usage __percpu *usage[2]; - struct closure_waitlist freelist_wait; + struct percpu_rw_semaphore mark_lock; /* * When we invalidate buckets, we use both the priority and the amount @@ -630,6 +634,7 @@ struct bch_fs { /* ALLOCATOR */ spinlock_t freelist_lock; + 
struct closure_waitlist freelist_wait; u8 open_buckets_freelist; u8 open_buckets_nr_free; struct closure_waitlist open_buckets_wait; @@ -718,9 +723,6 @@ struct bch_fs { struct mutex fsck_error_lock; bool fsck_alloc_err; - /* FILESYSTEM */ - atomic_long_t nr_inodes; - /* QUOTAS */ struct bch_memquota_type quotas[QTYP_NR]; diff --git a/libbcachefs/bcachefs_format.h b/libbcachefs/bcachefs_format.h index 6d8397b..efda901 100644 --- a/libbcachefs/bcachefs_format.h +++ b/libbcachefs/bcachefs_format.h @@ -73,6 +73,7 @@ #include #include +#include #include #define LE_BITMASK(_bits, name, type, field, offset, end) \ @@ -802,11 +803,6 @@ struct bch_xattr { /* Bucket/allocation information: */ -enum { - BCH_ALLOC_FIELD_READ_TIME = 0, - BCH_ALLOC_FIELD_WRITE_TIME = 1, -}; - struct bch_alloc { struct bch_val v; __u8 fields; @@ -814,6 +810,32 @@ struct bch_alloc { __u8 data[]; } __attribute__((packed, aligned(8))); +#define BCH_ALLOC_FIELDS() \ + x(read_time, 2) \ + x(write_time, 2) \ + x(data_type, 1) \ + x(dirty_sectors, 2) \ + x(cached_sectors, 2) + +enum { +#define x(name, bytes) BCH_ALLOC_FIELD_##name, + BCH_ALLOC_FIELDS() +#undef x + BCH_ALLOC_FIELD_NR +}; + +static const unsigned BCH_ALLOC_FIELD_BYTES[] = { +#define x(name, bytes) [BCH_ALLOC_FIELD_##name] = bytes, + BCH_ALLOC_FIELDS() +#undef x +}; + +#define x(name, bytes) + bytes +static const unsigned BKEY_ALLOC_VAL_U64s_MAX = + DIV_ROUND_UP(offsetof(struct bch_alloc, data) + BCH_ALLOC_FIELDS(), sizeof(u64)); +#undef x + /* Quotas: */ enum quota_types { diff --git a/libbcachefs/bkey_methods.c b/libbcachefs/bkey_methods.c index 6b04bef..48c86e5 100644 --- a/libbcachefs/bkey_methods.c +++ b/libbcachefs/bkey_methods.c @@ -24,13 +24,13 @@ static const char *deleted_key_invalid(const struct bch_fs *c, return NULL; } -const struct bkey_ops bch2_bkey_ops_deleted = { - .key_invalid = deleted_key_invalid, -}; +#define bch2_bkey_ops_deleted (struct bkey_ops) { \ + .key_invalid = deleted_key_invalid, \ +} -const struct bkey_ops bch2_bkey_ops_discard = { - .key_invalid = deleted_key_invalid, -}; +#define bch2_bkey_ops_discard (struct bkey_ops) { \ + .key_invalid = deleted_key_invalid, \ +} static const char *empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k) { @@ -40,9 +40,9 @@ static const char *empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c return NULL; } -const struct bkey_ops bch2_bkey_ops_error = { - .key_invalid = empty_val_key_invalid, -}; +#define bch2_bkey_ops_error (struct bkey_ops) { \ + .key_invalid = empty_val_key_invalid, \ +} static const char *key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k) @@ -53,13 +53,13 @@ static const char *key_type_cookie_invalid(const struct bch_fs *c, return NULL; } -const struct bkey_ops bch2_bkey_ops_cookie = { - .key_invalid = key_type_cookie_invalid, -}; +#define bch2_bkey_ops_cookie (struct bkey_ops) { \ + .key_invalid = key_type_cookie_invalid, \ +} -const struct bkey_ops bch2_bkey_ops_whiteout = { - .key_invalid = empty_val_key_invalid, -}; +#define bch2_bkey_ops_whiteout (struct bkey_ops) { \ + .key_invalid = empty_val_key_invalid, \ +} static const struct bkey_ops bch2_bkey_ops[] = { #define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, diff --git a/libbcachefs/btree_gc.c b/libbcachefs/btree_gc.c index c30d1f7..9f5a79a 100644 --- a/libbcachefs/btree_gc.c +++ b/libbcachefs/btree_gc.c @@ -141,21 +141,21 @@ static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, size_t b = PTR_BUCKET_NR(ca, ptr); struct bucket *g = PTR_BUCKET(ca, ptr); - if 
(mustfix_fsck_err_on(!g->mark.gen_valid, c, + if (mustfix_fsck_err_on(!g->gen_valid, c, "found ptr with missing gen in alloc btree,\n" "type %u gen %u", k.k->type, ptr->gen)) { g->_mark.gen = ptr->gen; - g->_mark.gen_valid = 1; - set_bit(b, ca->buckets_dirty); + g->gen_valid = 1; + bucket_set_dirty(ca, b); } if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, "%u ptr gen in the future: %u > %u", k.k->type, ptr->gen, g->mark.gen)) { g->_mark.gen = ptr->gen; - g->_mark.gen_valid = 1; - set_bit(b, ca->buckets_dirty); + g->gen_valid = 1; + bucket_set_dirty(ca, b); set_bit(BCH_FS_FIXED_GENS, &c->flags); } } @@ -348,7 +348,7 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, */ if (c) { lockdep_assert_held(&c->sb_lock); - percpu_down_read_preempt_disable(&c->usage_lock); + percpu_down_read_preempt_disable(&c->mark_lock); } else { preempt_disable(); } @@ -373,7 +373,7 @@ void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, } if (c) { - percpu_up_read_preempt_enable(&c->usage_lock); + percpu_up_read_preempt_enable(&c->mark_lock); } else { preempt_enable(); } @@ -419,7 +419,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) size_t i, j, iter; unsigned ci; - percpu_down_read_preempt_disable(&c->usage_lock); + percpu_down_read_preempt_disable(&c->mark_lock); spin_lock(&c->freelist_lock); gc_pos_set(c, gc_pos_alloc(c, NULL)); @@ -455,7 +455,7 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) spin_unlock(&ob->lock); } - percpu_up_read_preempt_enable(&c->usage_lock); + percpu_up_read_preempt_enable(&c->mark_lock); } static void bch2_gc_free(struct bch_fs *c) @@ -479,6 +479,20 @@ static void bch2_gc_free(struct bch_fs *c) c->usage[1] = NULL; } +static void fs_usage_reset(struct bch_fs_usage *fs_usage) +{ + memset(&fs_usage->s.gc_start[0], 0, + sizeof(*fs_usage) - offsetof(typeof(*fs_usage), s.gc_start)); +} + +static void fs_usage_cpy(struct bch_fs_usage *dst, + struct bch_fs_usage *src) +{ + memcpy(&dst->s.gc_start[0], + &src->s.gc_start[0], + sizeof(*dst) - offsetof(typeof(*dst), s.gc_start)); +} + static void bch2_gc_done_nocheck(struct bch_fs *c) { struct bch_dev *ca; @@ -527,17 +541,12 @@ static void bch2_gc_done_nocheck(struct bch_fs *c) { struct bch_fs_usage src = __bch2_fs_usage_read(c, 1); - struct bch_fs_usage *p; - for_each_possible_cpu(cpu) { - p = per_cpu_ptr(c->usage[0], cpu); - memset(p, 0, offsetof(typeof(*p), online_reserved)); - } + for_each_possible_cpu(cpu) + fs_usage_reset(per_cpu_ptr(c->usage[0], cpu)); preempt_disable(); - memcpy(this_cpu_ptr(c->usage[0]), - &src, - offsetof(typeof(*p), online_reserved)); + fs_usage_cpy(this_cpu_ptr(c->usage[0]), &src); preempt_enable(); } @@ -575,7 +584,7 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) #define copy_fs_field(_f, _msg, ...) 
\ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) - percpu_down_write(&c->usage_lock); + percpu_down_write(&c->mark_lock); if (initial) { bch2_gc_done_nocheck(c); @@ -665,9 +674,14 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) { struct bch_fs_usage dst = __bch2_fs_usage_read(c, 0); struct bch_fs_usage src = __bch2_fs_usage_read(c, 1); - struct bch_fs_usage *p; unsigned r, b; + copy_fs_field(s.hidden, "hidden"); + copy_fs_field(s.data, "data"); + copy_fs_field(s.cached, "cached"); + copy_fs_field(s.reserved, "reserved"); + copy_fs_field(s.nr_inodes, "nr_inodes"); + for (r = 0; r < BCH_REPLICAS_MAX; r++) { for (b = 0; b < BCH_DATA_NR; b++) copy_fs_field(replicas[r].data[b], @@ -683,18 +697,15 @@ static void bch2_gc_done(struct bch_fs *c, bool initial) copy_fs_field(buckets[b], "buckets[%s]", bch2_data_types[b]); - for_each_possible_cpu(cpu) { - p = per_cpu_ptr(c->usage[0], cpu); - memset(p, 0, offsetof(typeof(*p), online_reserved)); - } + for_each_possible_cpu(cpu) + fs_usage_reset(per_cpu_ptr(c->usage[0], cpu)); preempt_disable(); - p = this_cpu_ptr(c->usage[0]); - memcpy(p, &dst, offsetof(typeof(*p), online_reserved)); + fs_usage_cpy(this_cpu_ptr(c->usage[0]), &dst); preempt_enable(); } out: - percpu_up_write(&c->usage_lock); + percpu_up_write(&c->mark_lock); #undef copy_fs_field #undef copy_dev_field @@ -739,7 +750,7 @@ static int bch2_gc_start(struct bch_fs *c) } } - percpu_down_write(&c->usage_lock); + percpu_down_write(&c->mark_lock); for_each_member_device(ca, c, i) { struct bucket_array *dst = __bucket_array(ca, 1); @@ -753,7 +764,7 @@ static int bch2_gc_start(struct bch_fs *c) dst->b[b]._mark.gen = src->b[b].mark.gen; }; - percpu_up_write(&c->usage_lock); + percpu_up_write(&c->mark_lock); return bch2_ec_mem_alloc(c, true); } diff --git a/libbcachefs/btree_iter.c b/libbcachefs/btree_iter.c index f4922bc..73e2c5e 100644 --- a/libbcachefs/btree_iter.c +++ b/libbcachefs/btree_iter.c @@ -530,8 +530,24 @@ found: btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); bch2_btree_node_iter_sort(node_iter, b); - if (!b->level && node_iter == &iter->l[0].iter) + if (!b->level && node_iter == &iter->l[0].iter) { + /* + * not legal to call bkey_debugcheck() here, because we're + * called midway through the update path after update has been + * marked but before deletes have actually happened: + */ +#if 0 __btree_iter_peek_all(iter, &iter->l[0], &iter->k); +#endif + struct btree_iter_level *l = &iter->l[0]; + struct bkey_packed *k = + bch2_btree_node_iter_peek_all(&l->iter, l->b); + + if (unlikely(!k)) + iter->k.type = KEY_TYPE_deleted; + else + bkey_disassemble(l->b, k, &iter->k); + } iter_current_key_not_modified: /* diff --git a/libbcachefs/btree_types.h b/libbcachefs/btree_types.h index a91a37e..0af2a7d 100644 --- a/libbcachefs/btree_types.h +++ b/libbcachefs/btree_types.h @@ -244,9 +244,28 @@ struct btree_iter { #define BTREE_ITER_MAX 8 +struct deferred_update { + struct journal_entry_pin journal; + + spinlock_t lock; + unsigned gen; + + u8 allocated_u64s; + enum btree_id btree_id; + + /* must be last: */ + struct bkey_i k; +}; + struct btree_insert_entry { - struct btree_iter *iter; - struct bkey_i *k; + struct bkey_i *k; + + union { + struct btree_iter *iter; + struct deferred_update *d; + }; + + bool deferred; }; struct btree_trans { @@ -438,6 +457,7 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type) switch (type) { case BKEY_TYPE_BTREE: case BKEY_TYPE_EXTENTS: + case BKEY_TYPE_INODES: case BKEY_TYPE_EC: return true; default: diff --git 
a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h index 7683636..dd9d255 100644 --- a/libbcachefs/btree_update.h +++ b/libbcachefs/btree_update.h @@ -15,6 +15,11 @@ bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, void bch2_btree_journal_key(struct btree_insert *trans, struct btree_iter *, struct bkey_i *); +void bch2_deferred_update_free(struct bch_fs *, + struct deferred_update *); +struct deferred_update * +bch2_deferred_update_alloc(struct bch_fs *, enum btree_id, unsigned); + /* Normal update interface: */ struct btree_insert { @@ -37,6 +42,13 @@ int __bch2_btree_insert_at(struct btree_insert *); .k = (_k), \ }) +#define BTREE_INSERT_DEFERRED(_d, _k) \ + ((struct btree_insert_entry) { \ + .k = (_k), \ + .d = (_d), \ + .deferred = true, \ + }) + /** * bch_btree_insert_at - insert one or more keys at iterator positions * @iter: btree iterator diff --git a/libbcachefs/btree_update_interior.c b/libbcachefs/btree_update_interior.c index ee19b13..e18655e 100644 --- a/libbcachefs/btree_update_interior.c +++ b/libbcachefs/btree_update_interior.c @@ -561,7 +561,6 @@ static void bch2_btree_update_free(struct btree_update *as) closure_debug_destroy(&as->cl); mempool_free(as, &c->btree_interior_update_pool); - percpu_ref_put(&c->writes); closure_wake_up(&c->btree_interior_update_wait); mutex_unlock(&c->btree_interior_update_lock); @@ -1011,14 +1010,9 @@ bch2_btree_update_start(struct bch_fs *c, enum btree_id id, struct btree_reserve *reserve; struct btree_update *as; - if (unlikely(!percpu_ref_tryget(&c->writes))) - return ERR_PTR(-EROFS); - reserve = bch2_btree_reserve_get(c, nr_nodes, flags, cl); - if (IS_ERR(reserve)) { - percpu_ref_put(&c->writes); + if (IS_ERR(reserve)) return ERR_CAST(reserve); - } as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); memset(as, 0, sizeof(*as)); @@ -1067,7 +1061,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) __bch2_btree_set_root_inmem(c, b); mutex_lock(&c->btree_interior_update_lock); - percpu_down_read_preempt_disable(&c->usage_lock); + percpu_down_read_preempt_disable(&c->mark_lock); bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key), true, 0, @@ -1081,7 +1075,7 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b) bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res, gc_pos_btree_root(b->btree_id)); - percpu_up_read_preempt_enable(&c->usage_lock); + percpu_up_read_preempt_enable(&c->mark_lock); mutex_unlock(&c->btree_interior_update_lock); } @@ -1160,7 +1154,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b BUG_ON(insert->k.u64s > bch_btree_keys_u64s_remaining(c, b)); mutex_lock(&c->btree_interior_update_lock); - percpu_down_read_preempt_disable(&c->usage_lock); + percpu_down_read_preempt_disable(&c->mark_lock); bch2_mark_key_locked(c, bkey_i_to_s_c(insert), true, 0, @@ -1182,7 +1176,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res, gc_pos_btree_node(b)); - percpu_up_read_preempt_enable(&c->usage_lock); + percpu_up_read_preempt_enable(&c->mark_lock); mutex_unlock(&c->btree_interior_update_lock); bch2_btree_bset_insert_key(iter, b, node_iter, insert); @@ -1918,6 +1912,25 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, btree_interior_update_add_node_reference(as, b); + /* + * XXX: the rest of the update path treats this like we're actually + * inserting a new node and deleting the existing node, so the + * reservation 
needs to include enough space for @b + * + * that is actually sketch as fuck though and I am surprised the code + * seems to work like that, definitely need to go back and rework it + * into something saner. + * + * (I think @b is just getting double counted until the btree update + * finishes and "deletes" @b on disk) + */ + ret = bch2_disk_reservation_add(c, &as->reserve->disk_res, + c->opts.btree_node_size * + bch2_bkey_nr_ptrs(bkey_i_to_s_c(&new_key->k_i)), + BCH_DISK_RESERVATION_NOFAIL| + BCH_DISK_RESERVATION_GC_LOCK_HELD); + BUG_ON(ret); + parent = btree_node_parent(iter, b); if (parent) { if (new_hash) { @@ -1951,7 +1964,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bch2_btree_node_lock_write(b, iter); mutex_lock(&c->btree_interior_update_lock); - percpu_down_read_preempt_disable(&c->usage_lock); + percpu_down_read_preempt_disable(&c->mark_lock); bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i), true, 0, @@ -1963,7 +1976,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c, bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res, gc_pos_btree_root(b->btree_id)); - percpu_up_read_preempt_enable(&c->usage_lock); + percpu_up_read_preempt_enable(&c->mark_lock); mutex_unlock(&c->btree_interior_update_lock); if (PTR_HASH(&new_key->k_i) != PTR_HASH(&b->key)) { diff --git a/libbcachefs/btree_update_leaf.c b/libbcachefs/btree_update_leaf.c index 57c5c7a..7eca920 100644 --- a/libbcachefs/btree_update_leaf.c +++ b/libbcachefs/btree_update_leaf.c @@ -7,6 +7,7 @@ #include "btree_locking.h" #include "buckets.h" #include "debug.h" +#include "error.h" #include "extents.h" #include "journal.h" #include "journal_reclaim.h" @@ -125,6 +126,27 @@ static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, return __btree_node_flush(j, pin, 1, seq); } +static inline void __btree_journal_key(struct btree_insert *trans, + enum btree_id btree_id, + struct bkey_i *insert) +{ + struct journal *j = &trans->c->journal; + u64 seq = trans->journal_res.seq; + bool needs_whiteout = insert->k.needs_whiteout; + + /* ick */ + insert->k.needs_whiteout = false; + bch2_journal_add_keys(j, &trans->journal_res, + btree_id, insert); + insert->k.needs_whiteout = needs_whiteout; + + bch2_journal_set_has_inode(j, &trans->journal_res, + insert->k.p.inode); + + if (trans->journal_seq) + *trans->journal_seq = seq; +} + void bch2_btree_journal_key(struct btree_insert *trans, struct btree_iter *iter, struct bkey_i *insert) @@ -139,21 +161,9 @@ void bch2_btree_journal_key(struct btree_insert *trans, !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { - u64 seq = trans->journal_res.seq; - bool needs_whiteout = insert->k.needs_whiteout; - - /* ick */ - insert->k.needs_whiteout = false; - bch2_journal_add_keys(j, &trans->journal_res, - iter->btree_id, insert); - insert->k.needs_whiteout = needs_whiteout; - - bch2_journal_set_has_inode(j, &trans->journal_res, - insert->k.p.inode); - - if (trans->journal_seq) - *trans->journal_seq = seq; - btree_bset_last(b)->journal_seq = cpu_to_le64(seq); + __btree_journal_key(trans, iter->btree_id, insert); + btree_bset_last(b)->journal_seq = + cpu_to_le64(trans->journal_res.seq); } if (unlikely(!journal_pin_active(&w->journal))) { @@ -226,8 +236,109 @@ btree_insert_key_leaf(struct btree_insert *trans, return ret; } -#define trans_for_each_entry(trans, i) \ - for ((i) = (trans)->entries; (i) < (trans)->entries + (trans)->nr; (i)++) +/* Deferred btree updates: */ + +static void 
deferred_update_flush(struct journal *j, + struct journal_entry_pin *pin, + u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct deferred_update *d = + container_of(pin, struct deferred_update, journal); + u64 tmp[32]; + struct bkey_i *k = (void *) tmp; + unsigned gen; + int ret; + + if (d->allocated_u64s > ARRAY_SIZE(tmp)) { + k = kmalloc(d->allocated_u64s * sizeof(u64), GFP_NOFS); + + BUG_ON(!k); /* XXX */ + } + + spin_lock(&d->lock); + gen = d->gen; + + if (journal_pin_active(&d->journal)) { + BUG_ON(d->k.k.u64s > d->allocated_u64s); + bkey_copy(k, &d->k); + + spin_unlock(&d->lock); + + ret = bch2_btree_insert(c, d->btree_id, k, NULL, NULL, + BTREE_INSERT_NOFAIL); + bch2_fs_fatal_err_on(ret && !bch2_journal_error(j), + c, "error flushing deferred btree update: %i", ret); + + spin_lock(&d->lock); + } + + if (gen == d->gen) + bch2_journal_pin_drop(j, &d->journal); + spin_unlock(&d->lock); + + if (k != (void *) tmp) + kfree(k); +} + +static enum btree_insert_ret +btree_insert_key_deferred(struct btree_insert *trans, + struct btree_insert_entry *insert) +{ + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; + struct deferred_update *d = insert->d; + + BUG_ON(trans->flags & BTREE_INSERT_JOURNAL_REPLAY); + BUG_ON(insert->k->u64s > d->allocated_u64s); + + __btree_journal_key(trans, d->btree_id, insert->k); + + spin_lock(&d->lock); + d->gen++; + bkey_copy(&d->k, insert->k); + spin_unlock(&d->lock); + + bch2_journal_pin_update(j, trans->journal_res.seq, &d->journal, + deferred_update_flush); + + return BTREE_INSERT_OK; +} + +void bch2_deferred_update_free(struct bch_fs *c, + struct deferred_update *d) +{ + deferred_update_flush(&c->journal, &d->journal, 0); + + BUG_ON(journal_pin_active(&d->journal)); + + bch2_journal_pin_flush(&c->journal, &d->journal); + kfree(d); +} + +struct deferred_update * +bch2_deferred_update_alloc(struct bch_fs *c, + enum btree_id btree_id, + unsigned u64s) +{ + struct deferred_update *d; + + BUG_ON(u64s > U8_MAX); + + d = kmalloc(offsetof(struct deferred_update, k) + + u64s * sizeof(u64), GFP_NOFS); + BUG_ON(!d); + + memset(d, 0, offsetof(struct deferred_update, k)); + + spin_lock_init(&d->lock); + d->allocated_u64s = u64s; + d->btree_id = btree_id; + + return d; +} + +/* struct btree_insert operations: */ /* * We sort transaction entries so that if multiple iterators point to the same @@ -237,25 +348,32 @@ static bool same_leaf_as_prev(struct btree_insert *trans, struct btree_insert_entry *i) { return i != trans->entries && + !i->deferred && i[0].iter->l[0].b == i[-1].iter->l[0].b; } -static inline struct btree_insert_entry *trans_next_leaf(struct btree_insert *trans, - struct btree_insert_entry *i) -{ - struct btree *b = i->iter->l[0].b; +#define __trans_next_entry(_trans, _i, _filter) \ +({ \ + while ((_i) < (_trans)->entries + (_trans->nr) && !(_filter)) \ + (_i)++; \ + \ + (_i) < (_trans)->entries + (_trans->nr); \ +}) - do { - i++; - } while (i < trans->entries + trans->nr && b == i->iter->l[0].b); +#define __trans_for_each_entry(_trans, _i, _filter) \ + for ((_i) = (_trans)->entries; \ + __trans_next_entry(_trans, _i, _filter); \ + (_i)++) - return i; -} +#define trans_for_each_entry(trans, i) \ + __trans_for_each_entry(trans, i, true) + +#define trans_for_each_iter(trans, i) \ + __trans_for_each_entry(trans, i, !(i)->deferred) #define trans_for_each_leaf(trans, i) \ - for ((i) = (trans)->entries; \ - (i) < (trans)->entries + (trans)->nr; \ - (i) = trans_next_leaf(trans, i)) + __trans_for_each_entry(trans, i, 
!(i)->deferred && \ + !same_leaf_as_prev(trans, i)) inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, struct btree_iter *iter) @@ -293,7 +411,8 @@ static void multi_unlock_write(struct btree_insert *trans) static inline int btree_trans_cmp(struct btree_insert_entry l, struct btree_insert_entry r) { - return btree_iter_cmp(l.iter, r.iter); + return (l.deferred > r.deferred) - (l.deferred < r.deferred) ?: + btree_iter_cmp(l.iter, r.iter); } /* Normal update interface: */ @@ -327,6 +446,15 @@ btree_key_can_insert(struct btree_insert *trans, return BTREE_INSERT_OK; } +static inline enum btree_insert_ret +do_btree_insert_one(struct btree_insert *trans, + struct btree_insert_entry *insert) +{ + return likely(!insert->deferred) + ? btree_insert_key_leaf(trans, insert) + : btree_insert_key_deferred(trans, insert); +} + /* * Get journal reservation, take write locks, and attempt to do btree update(s): */ @@ -339,9 +467,14 @@ static inline int do_btree_insert_at(struct btree_insert *trans, unsigned u64s; int ret; - trans_for_each_entry(trans, i) + trans_for_each_iter(trans, i) BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); + /* reserve space for deferred updates */ + __trans_for_each_entry(trans, i, i->deferred) { + + } + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { @@ -352,22 +485,21 @@ static inline int do_btree_insert_at(struct btree_insert *trans, while ((ret = bch2_journal_res_get(&c->journal, &trans->journal_res, u64s, JOURNAL_RES_GET_NONBLOCK)) == -EAGAIN) { - struct btree_iter *iter = trans->entries[0].iter; - struct closure cl; - - bch2_btree_iter_unlock(iter); + struct btree_iter *iter = NULL; - closure_init_stack(&cl); + trans_for_each_iter(trans, i) + iter = i->iter; - while ((ret = bch2_journal_open_seq_async(&c->journal, - trans->journal_res.seq, - &cl)) == -EAGAIN) - closure_sync(&cl); + if (iter) + bch2_btree_iter_unlock(iter); + ret = bch2_journal_res_get(&c->journal, + &trans->journal_res, u64s, + JOURNAL_RES_GET_CHECK); if (ret) return ret; - if (!bch2_btree_iter_relock(iter)) { + if (iter && !bch2_btree_iter_relock(iter)) { trans_restart(" (iter relock after journal res get blocked)"); return -EINTR; } @@ -391,7 +523,7 @@ static inline int do_btree_insert_at(struct btree_insert *trans, * amount of space available: */ u64s = 0; - trans_for_each_entry(trans, i) { + trans_for_each_iter(trans, i) { /* Multiple inserts might go to same leaf: */ if (!same_leaf_as_prev(trans, i)) u64s = 0; @@ -419,14 +551,17 @@ static inline int do_btree_insert_at(struct btree_insert *trans, * have been traversed/locked, depending on what the caller was * doing: */ - for_each_btree_iter(trans->entries[0].iter, linked) - if (linked->uptodate < BTREE_ITER_NEED_RELOCK) - linked->flags |= BTREE_ITER_NOUNLOCK; + trans_for_each_iter(trans, i) { + for_each_btree_iter(i->iter, linked) + if (linked->uptodate < BTREE_ITER_NEED_RELOCK) + linked->flags |= BTREE_ITER_NOUNLOCK; + break; + } } trans->did_work = true; trans_for_each_entry(trans, i) { - switch (btree_insert_key_leaf(trans, i)) { + switch (do_btree_insert_one(trans, i)) { case BTREE_INSERT_OK: break; case BTREE_INSERT_NEED_TRAVERSE: @@ -448,12 +583,20 @@ out: static inline void btree_insert_entry_checks(struct bch_fs *c, struct btree_insert_entry *i) { - BUG_ON(i->iter->level); - BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); + enum btree_id btree_id = !i->deferred + ? 
i->iter->btree_id + : i->d->btree_id; + + if (!i->deferred) { + BUG_ON(i->iter->level); + BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); + + bch2_btree_iter_verify_locks(i->iter); + } + BUG_ON(debug_check_bkeys(c) && !bkey_deleted(&i->k->k) && - bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), - i->iter->btree_id)); + bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), btree_id)); } /** @@ -477,20 +620,18 @@ int __bch2_btree_insert_at(struct btree_insert *trans) BUG_ON(!trans->nr); - bch2_btree_iter_verify_locks(trans->entries[0].iter); - /* for the sake of sanity: */ BUG_ON(trans->nr > 1 && !(trans->flags & BTREE_INSERT_ATOMIC)); + bubble_sort(trans->entries, trans->nr, btree_trans_cmp); + trans_for_each_entry(trans, i) btree_insert_entry_checks(c, i); - bubble_sort(trans->entries, trans->nr, btree_trans_cmp); - if (unlikely(!percpu_ref_tryget(&c->writes))) return -EROFS; retry: - trans_for_each_entry(trans, i) { + trans_for_each_iter(trans, i) { unsigned old_locks_want = i->iter->locks_want; unsigned old_uptodate = i->iter->uptodate; @@ -514,16 +655,22 @@ retry: trans_for_each_leaf(trans, i) bch2_foreground_maybe_merge(c, i->iter, 0, trans->flags); - trans_for_each_entry(trans, i) + trans_for_each_iter(trans, i) bch2_btree_iter_downgrade(i->iter); out: percpu_ref_put(&c->writes); /* make sure we didn't drop or screw up locks: */ - bch2_btree_iter_verify_locks(trans->entries[0].iter); + trans_for_each_iter(trans, i) { + bch2_btree_iter_verify_locks(i->iter); + break; + } - for_each_btree_iter(trans->entries[0].iter, linked) - linked->flags &= ~BTREE_ITER_NOUNLOCK; + trans_for_each_iter(trans, i) { + for_each_btree_iter(i->iter, linked) + linked->flags &= ~BTREE_ITER_NOUNLOCK; + break; + } BUG_ON(!(trans->flags & BTREE_INSERT_ATOMIC) && ret == -EINTR); @@ -602,7 +749,7 @@ err: goto out; } - trans_for_each_entry(trans, i) { + trans_for_each_iter(trans, i) { int ret2 = bch2_btree_iter_traverse(i->iter); if (ret2) { ret = ret2; diff --git a/libbcachefs/buckets.c b/libbcachefs/buckets.c index 401ff82..d72e595 100644 --- a/libbcachefs/buckets.c +++ b/libbcachefs/buckets.c @@ -105,9 +105,9 @@ static void bch2_fs_stats_verify(struct bch_fs *c) bch_data_types[j], stats.buckets[j]); - if ((s64) stats.online_reserved < 0) + if ((s64) stats.s.online_reserved < 0) panic("sectors_online_reserved underflow: %lli\n", - stats.online_reserved); + stats.s.online_reserved); } static void bch2_dev_stats_verify(struct bch_dev *ca) @@ -227,38 +227,6 @@ struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *c) return bch2_usage_read_raw(c->usage[0]); } -struct fs_usage_sum { - u64 hidden; - u64 data; - u64 cached; - u64 reserved; -}; - -static inline struct fs_usage_sum __fs_usage_sum(struct bch_fs_usage stats) -{ - struct fs_usage_sum sum = { 0 }; - unsigned i; - - /* - * For superblock and journal we count bucket usage, not sector usage, - * because any internal fragmentation should _not_ be counted as - * free space: - */ - sum.hidden += stats.buckets[BCH_DATA_SB]; - sum.hidden += stats.buckets[BCH_DATA_JOURNAL]; - - for (i = 0; i < ARRAY_SIZE(stats.replicas); i++) { - sum.data += stats.replicas[i].data[BCH_DATA_BTREE]; - sum.data += stats.replicas[i].data[BCH_DATA_USER]; - sum.data += stats.replicas[i].ec_data; - sum.cached += stats.replicas[i].data[BCH_DATA_CACHED]; - sum.reserved += stats.replicas[i].persistent_reserved; - } - - sum.reserved += stats.online_reserved; - return sum; -} - #define RESERVE_FACTOR 6 static u64 reserve_factor(u64 r) @@ -271,16 +239,33 @@ static u64 avail_factor(u64 r) return (r << 
RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1); } -static inline u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats) +static inline u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage) { - struct fs_usage_sum sum = __fs_usage_sum(stats); + return fs_usage.s.hidden + + fs_usage.s.data + + reserve_factor(fs_usage.s.reserved + + fs_usage.s.online_reserved); +} - return sum.hidden + sum.data + reserve_factor(sum.reserved); +u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage) +{ + return min(c->capacity, __bch2_fs_sectors_used(c, fs_usage)); } -u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage stats) +struct bch_fs_usage_short +bch2_fs_usage_read_short(struct bch_fs *c) { - return min(c->capacity, __bch2_fs_sectors_used(c, stats)); + struct bch_fs_usage_summarized usage = + bch2_usage_read_raw(&c->usage[0]->s); + struct bch_fs_usage_short ret; + + ret.capacity = READ_ONCE(c->capacity) - usage.hidden; + ret.used = min(ret.capacity, usage.data + + reserve_factor(usage.reserved + + usage.online_reserved)); + ret.nr_inodes = usage.nr_inodes; + + return ret; } static inline int is_unavailable_bucket(struct bucket_mark m) @@ -314,15 +299,14 @@ static bool bucket_became_unavailable(struct bucket_mark old, } void bch2_fs_usage_apply(struct bch_fs *c, - struct bch_fs_usage *stats, + struct bch_fs_usage *fs_usage, struct disk_reservation *disk_res, struct gc_pos gc_pos) { - struct fs_usage_sum sum = __fs_usage_sum(*stats); - s64 added = sum.data + sum.reserved; + s64 added = fs_usage->s.data + fs_usage->s.reserved; s64 should_not_have_added; - percpu_rwsem_assert_held(&c->usage_lock); + percpu_rwsem_assert_held(&c->mark_lock); /* * Not allowed to reduce sectors_available except by getting a @@ -336,23 +320,30 @@ void bch2_fs_usage_apply(struct bch_fs *c, } if (added > 0) { - disk_res->sectors -= added; - stats->online_reserved -= added; + disk_res->sectors -= added; + fs_usage->s.online_reserved -= added; } - /* online_reserved not subject to gc: */ - this_cpu_ptr(c->usage[0])->online_reserved += - stats->online_reserved; - stats->online_reserved = 0; - - bch2_usage_add(this_cpu_ptr(c->usage[0]), stats); + bch2_usage_add(this_cpu_ptr(c->usage[0]), fs_usage); if (gc_visited(c, gc_pos)) - bch2_usage_add(this_cpu_ptr(c->usage[1]), stats); + bch2_usage_add(this_cpu_ptr(c->usage[1]), fs_usage); bch2_fs_stats_verify(c); - memset(stats, 0, sizeof(*stats)); + memset(fs_usage, 0, sizeof(*fs_usage)); +} + +static inline void account_bucket(struct bch_fs_usage *fs_usage, + struct bch_dev_usage *dev_usage, + enum bch_data_type type, + int nr, s64 size) +{ + if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL) + fs_usage->s.hidden += size; + + fs_usage->buckets[type] += size; + dev_usage->buckets[type] += nr; } static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, @@ -362,7 +353,7 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, { struct bch_dev_usage *dev_usage; - percpu_rwsem_assert_held(&c->usage_lock); + percpu_rwsem_assert_held(&c->mark_lock); bch2_fs_inconsistent_on(old.data_type && new.data_type && old.data_type != new.data_type, c, @@ -372,15 +363,13 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, dev_usage = this_cpu_ptr(ca->usage[gc]); - if (bucket_type(old)) { - fs_usage->buckets[bucket_type(old)] -= ca->mi.bucket_size; - dev_usage->buckets[bucket_type(old)]--; - } + if (bucket_type(old)) + account_bucket(fs_usage, dev_usage, bucket_type(old), + -1, 
-ca->mi.bucket_size); - if (bucket_type(new)) { - fs_usage->buckets[bucket_type(new)] += ca->mi.bucket_size; - dev_usage->buckets[bucket_type(new)]++; - } + if (bucket_type(new)) + account_bucket(fs_usage, dev_usage, bucket_type(new), + 1, ca->mi.bucket_size); dev_usage->buckets_alloc += (int) new.owned_by_allocator - (int) old.owned_by_allocator; @@ -409,14 +398,14 @@ void bch2_dev_usage_from_buckets(struct bch_fs *c, struct bch_dev *ca) struct bucket_array *buckets; struct bucket *g; - percpu_down_read_preempt_disable(&c->usage_lock); + percpu_down_read_preempt_disable(&c->mark_lock); fs_usage = this_cpu_ptr(c->usage[0]); buckets = bucket_array(ca); for_each_bucket(g, buckets) if (g->mark.data_type) bch2_dev_usage_update(c, ca, fs_usage, old, g->mark, false); - percpu_up_read_preempt_enable(&c->usage_lock); + percpu_up_read_preempt_enable(&c->mark_lock); } #define bucket_data_cmpxchg(c, ca, fs_usage, g, new, expr) \ @@ -431,11 +420,11 @@ static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, struct bucket_mark *old, bool gc) { - struct bch_fs_usage *stats = this_cpu_ptr(c->usage[gc]); + struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]); struct bucket *g = __bucket(ca, b, gc); struct bucket_mark new; - *old = bucket_data_cmpxchg(c, ca, stats, g, new, ({ + *old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ BUG_ON(!is_available_bucket(new)); new.owned_by_allocator = 1; @@ -445,13 +434,14 @@ static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, new.gen++; })); - stats->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors; + fs_usage->replicas[0].data[BCH_DATA_CACHED] -= old->cached_sectors; + fs_usage->s.cached -= old->cached_sectors; } void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, struct bucket_mark *old) { - percpu_rwsem_assert_held(&c->usage_lock); + percpu_rwsem_assert_held(&c->mark_lock); __bch2_invalidate_bucket(c, ca, b, old, false); @@ -464,11 +454,11 @@ static void __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, bool owned_by_allocator, bool gc) { - struct bch_fs_usage *stats = this_cpu_ptr(c->usage[gc]); + struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]); struct bucket *g = __bucket(ca, b, gc); struct bucket_mark old, new; - old = bucket_data_cmpxchg(c, ca, stats, g, new, ({ + old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({ new.owned_by_allocator = owned_by_allocator; })); @@ -480,7 +470,7 @@ void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, size_t b, bool owned_by_allocator, struct gc_pos pos, unsigned flags) { - percpu_rwsem_assert_held(&c->usage_lock); + percpu_rwsem_assert_held(&c->mark_lock); if (!(flags & BCH_BUCKET_MARK_GC)) __bch2_mark_alloc_bucket(c, ca, b, owned_by_allocator, false); @@ -513,7 +503,10 @@ static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, checked_add(new.dirty_sectors, sectors); })); - fs_usage->replicas[0].data[type] += sectors; + if (type == BCH_DATA_BTREE || + type == BCH_DATA_USER) + fs_usage->s.data += sectors; + fs_usage->replicas[0].data[type] += sectors; } void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, @@ -525,7 +518,7 @@ void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, type != BCH_DATA_JOURNAL); if (likely(c)) { - percpu_rwsem_assert_held(&c->usage_lock); + percpu_rwsem_assert_held(&c->mark_lock); if (!(flags & BCH_BUCKET_MARK_GC)) __bch2_mark_metadata_bucket(c, ca, b, type, sectors, @@ -550,36 +543,25 @@ void 
bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, } } -static int __disk_sectors(struct bch_extent_crc_unpacked crc, unsigned sectors) -{ - if (!sectors) - return 0; - - return max(1U, DIV_ROUND_UP(sectors * crc.compressed_size, - crc.uncompressed_size)); -} - -static s64 ptr_disk_sectors(const struct bkey *k, - struct extent_ptr_decoded p, - s64 sectors) +static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, + s64 delta) { + if (delta > 0) { + /* + * marking a new extent, which _will have size_ @delta + * + * in the bch2_mark_update -> BCH_EXTENT_OVERLAP_MIDDLE + * case, we haven't actually created the key we'll be inserting + * yet (for the split) - so we don't want to be using + * k->size/crc.live_size here: + */ + return __ptr_disk_sectors(p, delta); + } else { + BUG_ON(-delta > p.crc.live_size); - if (p.crc.compression_type) { - unsigned old_sectors, new_sectors; - - if (sectors > 0) { - old_sectors = 0; - new_sectors = sectors; - } else { - old_sectors = k->size; - new_sectors = k->size + sectors; - } - - sectors = -__disk_sectors(p.crc, old_sectors) - +__disk_sectors(p.crc, new_sectors); + return (s64) __ptr_disk_sectors(p, p.crc.live_size + delta) - + (s64) ptr_disk_sectors(p); } - - return sectors; } /* @@ -591,7 +573,7 @@ static void bch2_mark_pointer(struct bch_fs *c, struct extent_ptr_decoded p, s64 sectors, enum bch_data_type data_type, struct bch_fs_usage *fs_usage, - u64 journal_seq, unsigned flags, + unsigned journal_seq, unsigned flags, bool gc) { struct bucket_mark old, new; @@ -696,8 +678,8 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, s64 sectors, enum bch_data_type data_type, - struct bch_fs_usage *stats, - u64 journal_seq, unsigned flags, + struct bch_fs_usage *fs_usage, + unsigned journal_seq, unsigned flags, bool gc) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -714,11 +696,13 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, BUG_ON(!sectors); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - s64 disk_sectors = ptr_disk_sectors(k.k, p, sectors); + s64 disk_sectors = data_type == BCH_DATA_BTREE + ? 
sectors + : ptr_disk_sectors_delta(p, sectors); s64 adjusted_disk_sectors = disk_sectors; bch2_mark_pointer(c, p, disk_sectors, data_type, - stats, journal_seq, flags, gc); + fs_usage, journal_seq, flags, gc); if (!p.ptr.cached) for (i = 0; i < p.ec_nr; i++) { @@ -741,13 +725,18 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, } replicas = clamp_t(unsigned, replicas, - 1, ARRAY_SIZE(stats->replicas)); + 1, ARRAY_SIZE(fs_usage->replicas)); ec_redundancy = clamp_t(unsigned, ec_redundancy, - 1, ARRAY_SIZE(stats->replicas)); + 1, ARRAY_SIZE(fs_usage->replicas)); + + fs_usage->s.cached += cached_sectors; + fs_usage->replicas[0].data[BCH_DATA_CACHED] += cached_sectors; - stats->replicas[0].data[BCH_DATA_CACHED] += cached_sectors; - stats->replicas[replicas - 1].data[data_type] += dirty_sectors; - stats->replicas[ec_redundancy - 1].ec_data += ec_sectors; + fs_usage->s.data += dirty_sectors; + fs_usage->replicas[replicas - 1].data[data_type] += dirty_sectors; + + fs_usage->s.data += ec_sectors; + fs_usage->replicas[ec_redundancy - 1].ec_data += ec_sectors; return 0; } @@ -832,8 +821,8 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, bool inserting, s64 sectors, - struct bch_fs_usage *stats, - u64 journal_seq, unsigned flags, + struct bch_fs_usage *fs_usage, + unsigned journal_seq, unsigned flags, bool gc) { int ret = 0; @@ -844,24 +833,31 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, ? c->opts.btree_node_size : -c->opts.btree_node_size, BCH_DATA_BTREE, - stats, journal_seq, flags, gc); + fs_usage, journal_seq, flags, gc); break; case KEY_TYPE_extent: ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER, - stats, journal_seq, flags, gc); + fs_usage, journal_seq, flags, gc); break; case KEY_TYPE_stripe: ret = bch2_mark_stripe(c, k, inserting, - stats, journal_seq, flags, gc); + fs_usage, journal_seq, flags, gc); + break; + case KEY_TYPE_alloc: + if (inserting) + fs_usage->s.nr_inodes++; + else + fs_usage->s.nr_inodes--; break; case KEY_TYPE_reservation: { unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; sectors *= replicas; replicas = clamp_t(unsigned, replicas, - 1, ARRAY_SIZE(stats->replicas)); + 1, ARRAY_SIZE(fs_usage->replicas)); - stats->replicas[replicas - 1].persistent_reserved += sectors; + fs_usage->s.reserved += sectors; + fs_usage->replicas[replicas - 1].persistent_reserved += sectors; break; } default: @@ -875,17 +871,15 @@ int bch2_mark_key_locked(struct bch_fs *c, struct bkey_s_c k, bool inserting, s64 sectors, struct gc_pos pos, - struct bch_fs_usage *stats, + struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { int ret; if (!(flags & BCH_BUCKET_MARK_GC)) { - if (!stats) - stats = this_cpu_ptr(c->usage[0]); - ret = __bch2_mark_key(c, k, inserting, sectors, - stats, journal_seq, flags, false); + fs_usage ?: this_cpu_ptr(c->usage[0]), + journal_seq, flags, false); if (ret) return ret; } @@ -905,15 +899,15 @@ int bch2_mark_key_locked(struct bch_fs *c, int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, bool inserting, s64 sectors, struct gc_pos pos, - struct bch_fs_usage *stats, + struct bch_fs_usage *fs_usage, u64 journal_seq, unsigned flags) { int ret; - percpu_down_read_preempt_disable(&c->usage_lock); + percpu_down_read_preempt_disable(&c->mark_lock); ret = bch2_mark_key_locked(c, k, inserting, sectors, - pos, stats, journal_seq, flags); - percpu_up_read_preempt_enable(&c->usage_lock); + pos, fs_usage, journal_seq, flags); + 
percpu_up_read_preempt_enable(&c->mark_lock); return ret; } @@ -925,20 +919,20 @@ void bch2_mark_update(struct btree_insert *trans, struct btree_iter *iter = insert->iter; struct btree *b = iter->l[0].b; struct btree_node_iter node_iter = iter->l[0].iter; - struct bch_fs_usage stats = { 0 }; + struct bch_fs_usage fs_usage = { 0 }; struct gc_pos pos = gc_pos_btree_node(b); struct bkey_packed *_k; if (!btree_node_type_needs_gc(iter->btree_id)) return; - percpu_down_read_preempt_disable(&c->usage_lock); + percpu_down_read_preempt_disable(&c->mark_lock); if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true, bpos_min(insert->k->k.p, b->key.k.p).offset - bkey_start_offset(&insert->k->k), - pos, &stats, trans->journal_res.seq, 0); + pos, &fs_usage, trans->journal_res.seq, 0); while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, KEY_TYPE_discard))) { @@ -971,7 +965,7 @@ void bch2_mark_update(struct btree_insert *trans, BUG_ON(sectors <= 0); bch2_mark_key_locked(c, k, true, sectors, - pos, &stats, trans->journal_res.seq, 0); + pos, &fs_usage, trans->journal_res.seq, 0); sectors = bkey_start_offset(&insert->k->k) - k.k->p.offset; @@ -982,14 +976,14 @@ void bch2_mark_update(struct btree_insert *trans, } bch2_mark_key_locked(c, k, false, sectors, - pos, &stats, trans->journal_res.seq, 0); + pos, &fs_usage, trans->journal_res.seq, 0); bch2_btree_node_iter_advance(&node_iter, b); } - bch2_fs_usage_apply(c, &stats, trans->disk_res, pos); + bch2_fs_usage_apply(c, &fs_usage, trans->disk_res, pos); - percpu_up_read_preempt_enable(&c->usage_lock); + percpu_up_read_preempt_enable(&c->mark_lock); } /* Disk reservations: */ @@ -999,19 +993,19 @@ static u64 bch2_recalc_sectors_available(struct bch_fs *c) int cpu; for_each_possible_cpu(cpu) - per_cpu_ptr(c->usage[0], cpu)->available_cache = 0; + per_cpu_ptr(c->pcpu, cpu)->sectors_available = 0; - return avail_factor(bch2_fs_sectors_free(c, bch2_fs_usage_read(c))); + return avail_factor(bch2_fs_sectors_free(c)); } void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) { - percpu_down_read_preempt_disable(&c->usage_lock); - this_cpu_sub(c->usage[0]->online_reserved, + percpu_down_read_preempt_disable(&c->mark_lock); + this_cpu_sub(c->usage[0]->s.online_reserved, res->sectors); bch2_fs_stats_verify(c); - percpu_up_read_preempt_enable(&c->usage_lock); + percpu_up_read_preempt_enable(&c->mark_lock); res->sectors = 0; } @@ -1021,15 +1015,15 @@ void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, unsigned sectors, int flags) { - struct bch_fs_usage *stats; + struct bch_fs_pcpu *pcpu; u64 old, v, get; s64 sectors_available; int ret; - percpu_down_read_preempt_disable(&c->usage_lock); - stats = this_cpu_ptr(c->usage[0]); + percpu_down_read_preempt_disable(&c->mark_lock); + pcpu = this_cpu_ptr(c->pcpu); - if (sectors <= stats->available_cache) + if (sectors <= pcpu->sectors_available) goto out; v = atomic64_read(&c->sectors_available); @@ -1038,22 +1032,22 @@ int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, get = min((u64) sectors + SECTORS_CACHE, old); if (get < sectors) { - percpu_up_read_preempt_enable(&c->usage_lock); + percpu_up_read_preempt_enable(&c->mark_lock); goto recalculate; } } while ((v = atomic64_cmpxchg(&c->sectors_available, old, old - get)) != old); - stats->available_cache += get; + pcpu->sectors_available += get; out: - 
stats->available_cache -= sectors; - stats->online_reserved += sectors; - res->sectors += sectors; + pcpu->sectors_available -= sectors; + this_cpu_add(c->usage[0]->s.online_reserved, sectors); + res->sectors += sectors; bch2_disk_reservations_verify(c, flags); bch2_fs_stats_verify(c); - percpu_up_read_preempt_enable(&c->usage_lock); + percpu_up_read_preempt_enable(&c->mark_lock); return 0; recalculate: @@ -1074,15 +1068,15 @@ recalculate: return -EINTR; } - percpu_down_write(&c->usage_lock); + percpu_down_write(&c->mark_lock); sectors_available = bch2_recalc_sectors_available(c); if (sectors <= sectors_available || (flags & BCH_DISK_RESERVATION_NOFAIL)) { atomic64_set(&c->sectors_available, max_t(s64, 0, sectors_available - sectors)); - stats->online_reserved += sectors; - res->sectors += sectors; + this_cpu_add(c->usage[0]->s.online_reserved, sectors); + res->sectors += sectors; ret = 0; bch2_disk_reservations_verify(c, flags); @@ -1092,7 +1086,7 @@ recalculate: } bch2_fs_stats_verify(c); - percpu_up_write(&c->usage_lock); + percpu_up_write(&c->mark_lock); if (!(flags & BCH_DISK_RESERVATION_GC_LOCK_HELD)) up_read(&c->gc_lock); @@ -1115,7 +1109,7 @@ static void buckets_free_rcu(struct rcu_head *rcu) int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { struct bucket_array *buckets = NULL, *old_buckets = NULL; - unsigned long *buckets_dirty = NULL; + unsigned long *buckets_nouse = NULL; unsigned long *buckets_written = NULL; u8 *oldest_gens = NULL; alloc_fifo free[RESERVE_NR]; @@ -1145,7 +1139,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) GFP_KERNEL|__GFP_ZERO)) || !(oldest_gens = kvpmalloc(nbuckets * sizeof(u8), GFP_KERNEL|__GFP_ZERO)) || - !(buckets_dirty = kvpmalloc(BITS_TO_LONGS(nbuckets) * + !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * sizeof(unsigned long), GFP_KERNEL|__GFP_ZERO)) || !(buckets_written = kvpmalloc(BITS_TO_LONGS(nbuckets) * @@ -1168,7 +1162,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) if (resize) { down_write(&c->gc_lock); down_write(&ca->bucket_lock); - percpu_down_write(&c->usage_lock); + percpu_down_write(&c->mark_lock); } old_buckets = bucket_array(ca); @@ -1182,8 +1176,8 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) memcpy(oldest_gens, ca->oldest_gens, n * sizeof(u8)); - memcpy(buckets_dirty, - ca->buckets_dirty, + memcpy(buckets_nouse, + ca->buckets_nouse, BITS_TO_LONGS(n) * sizeof(unsigned long)); memcpy(buckets_written, ca->buckets_written, @@ -1194,11 +1188,11 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) buckets = old_buckets; swap(ca->oldest_gens, oldest_gens); - swap(ca->buckets_dirty, buckets_dirty); + swap(ca->buckets_nouse, buckets_nouse); swap(ca->buckets_written, buckets_written); if (resize) - percpu_up_write(&c->usage_lock); + percpu_up_write(&c->mark_lock); spin_lock(&c->freelist_lock); for (i = 0; i < RESERVE_NR; i++) { @@ -1233,7 +1227,7 @@ err: free_fifo(&free_inc); for (i = 0; i < RESERVE_NR; i++) free_fifo(&free[i]); - kvpfree(buckets_dirty, + kvpfree(buckets_nouse, BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); kvpfree(buckets_written, BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); @@ -1256,7 +1250,7 @@ void bch2_dev_buckets_free(struct bch_dev *ca) free_fifo(&ca->free[i]); kvpfree(ca->buckets_written, BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); - kvpfree(ca->buckets_dirty, + kvpfree(ca->buckets_nouse, BITS_TO_LONGS(ca->mi.nbuckets) * 
sizeof(unsigned long)); kvpfree(ca->oldest_gens, ca->mi.nbuckets * sizeof(u8)); kvpfree(rcu_dereference_protected(ca->buckets[0], 1), diff --git a/libbcachefs/buckets.h b/libbcachefs/buckets.h index 17a9b44..8405911 100644 --- a/libbcachefs/buckets.h +++ b/libbcachefs/buckets.h @@ -33,7 +33,7 @@ static inline struct bucket_array *__bucket_array(struct bch_dev *ca, { return rcu_dereference_check(ca->buckets[gc], !ca->fs || - percpu_rwsem_is_held(&ca->fs->usage_lock) || + percpu_rwsem_is_held(&ca->fs->mark_lock) || lockdep_is_held(&ca->fs->gc_lock) || lockdep_is_held(&ca->bucket_lock)); } @@ -56,6 +56,18 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b) return __bucket(ca, b, false); } +static inline void bucket_set_dirty(struct bch_dev *ca, size_t b) +{ + struct bucket *g; + struct bucket_mark m; + + rcu_read_lock(); + g = bucket(ca, b); + bucket_cmpxchg(g, m, m.dirty = true); + rcu_read_unlock(); + +} + static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca, size_t b, int rw) { @@ -123,6 +135,20 @@ static inline u8 ptr_stale(struct bch_dev *ca, return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen); } +static inline unsigned __ptr_disk_sectors(struct extent_ptr_decoded p, + unsigned live_size) +{ + return live_size && p.crc.compression_type + ? max(1U, DIV_ROUND_UP(live_size * p.crc.compressed_size, + p.crc.uncompressed_size)) + : live_size; +} + +static inline unsigned ptr_disk_sectors(struct extent_ptr_decoded p) +{ + return __ptr_disk_sectors(p, p.crc.live_size); +} + /* bucket gc marks */ static inline unsigned bucket_sectors_used(struct bucket_mark mark) @@ -137,6 +163,20 @@ static inline bool bucket_unused(struct bucket_mark mark) !bucket_sectors_used(mark); } +static inline bool is_available_bucket(struct bucket_mark mark) +{ + return (!mark.owned_by_allocator && + !mark.dirty_sectors && + !mark.stripe); +} + +static inline bool bucket_needs_journal_commit(struct bucket_mark m, + u16 last_seq_ondisk) +{ + return m.journal_seq_valid && + ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); +} + /* Device usage: */ struct bch_dev_usage __bch2_dev_usage_read(struct bch_dev *, bool); @@ -180,32 +220,21 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca) struct bch_fs_usage __bch2_fs_usage_read(struct bch_fs *, bool); struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *); -void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, - struct disk_reservation *, struct gc_pos); u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage); -static inline u64 bch2_fs_sectors_free(struct bch_fs *c, - struct bch_fs_usage stats) -{ - return c->capacity - bch2_fs_sectors_used(c, stats); -} +struct bch_fs_usage_short +bch2_fs_usage_read_short(struct bch_fs *); -static inline bool is_available_bucket(struct bucket_mark mark) +static inline u64 bch2_fs_sectors_free(struct bch_fs *c) { - return (!mark.owned_by_allocator && - !mark.dirty_sectors && - !mark.stripe && - !mark.nouse); -} + struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); -static inline bool bucket_needs_journal_commit(struct bucket_mark m, - u16 last_seq_ondisk) -{ - return m.journal_seq_valid && - ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); + return usage.capacity - usage.used; } +/* key/bucket marking: */ + void bch2_bucket_seq_cleanup(struct bch_fs *); void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, @@ -226,6 +255,10 @@ int bch2_mark_key(struct bch_fs *, struct bkey_s_c, bool, s64, struct gc_pos, struct bch_fs_usage *, u64, 
unsigned); void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *); +void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, + struct disk_reservation *, struct gc_pos); + +/* disk reservations: */ void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); diff --git a/libbcachefs/buckets_types.h b/libbcachefs/buckets_types.h index 0b1bd95..c5537a2 100644 --- a/libbcachefs/buckets_types.h +++ b/libbcachefs/buckets_types.h @@ -8,28 +8,25 @@ struct bucket_mark { union { - struct { - atomic64_t v; - }; + atomic64_t v; struct { - u8 gen; - u8 data_type:3, - gen_valid:1, - owned_by_allocator:1, - nouse:1, - journal_seq_valid:1, - stripe:1; - u16 dirty_sectors; - u16 cached_sectors; - - /* - * low bits of journal sequence number when this bucket was most - * recently modified: if journal_seq_valid is set, this bucket - * can't be reused until the journal sequence number written to - * disk is >= the bucket's journal sequence number: - */ - u16 journal_seq; + u8 gen; + u8 data_type:3, + owned_by_allocator:1, + dirty:1, + journal_seq_valid:1, + stripe:1; + u16 dirty_sectors; + u16 cached_sectors; + + /* + * low bits of journal sequence number when this bucket was most + * recently modified: if journal_seq_valid is set, this bucket can't be + * reused until the journal sequence number written to disk is >= the + * bucket's journal sequence number: + */ + u16 journal_seq; }; }; }; @@ -41,6 +38,7 @@ struct bucket { }; u16 io_time[2]; + unsigned gen_valid:1; }; struct bucket_array { @@ -64,6 +62,21 @@ struct bch_dev_usage { struct bch_fs_usage { /* all fields are in units of 512 byte sectors: */ + /* summarized: */ + struct bch_fs_usage_summarized { + u64 online_reserved; + + /* fields after online_reserved are cleared/recalculated by gc: */ + u64 gc_start[0]; + + u64 hidden; + u64 data; + u64 cached; + u64 reserved; + u64 nr_inodes; + } s; + + /* broken out: */ struct { u64 data[BCH_DATA_NR]; u64 ec_data; @@ -71,19 +84,21 @@ struct bch_fs_usage { } replicas[BCH_REPLICAS_MAX]; u64 buckets[BCH_DATA_NR]; +}; - /* fields starting here aren't touched by gc: */ - u64 online_reserved; - u64 available_cache; +struct bch_fs_usage_short { + u64 capacity; + u64 used; + u64 nr_inodes; }; /* * A reservation for space on disk: */ struct disk_reservation { - u64 sectors; - u32 gen; - unsigned nr_replicas; + u64 sectors; + u32 gen; + unsigned nr_replicas; }; struct copygc_heap_entry { diff --git a/libbcachefs/chardev.c b/libbcachefs/chardev.c index 808167d..ac1ec5f 100644 --- a/libbcachefs/chardev.c +++ b/libbcachefs/chardev.c @@ -305,7 +305,7 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, .p.btree_id = ctx->stats.iter.btree_id, .p.pos = ctx->stats.iter.pos, .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), - .p.sectors_total = bch2_fs_sectors_used(c, bch2_fs_usage_read(c)), + .p.sectors_total = bch2_fs_usage_read_short(c).used, }; if (len < sizeof(e)) @@ -397,7 +397,7 @@ static long bch2_ioctl_usage(struct bch_fs *c, struct bch_ioctl_fs_usage dst = { .capacity = c->capacity, .used = bch2_fs_sectors_used(c, src), - .online_reserved = src.online_reserved, + .online_reserved = src.s.online_reserved, }; for (i = 0; i < BCH_REPLICAS_MAX; i++) { diff --git a/libbcachefs/ec.c b/libbcachefs/ec.c index c8115f6..755a260 100644 --- a/libbcachefs/ec.c +++ b/libbcachefs/ec.c @@ -104,22 +104,32 @@ static unsigned stripe_csums_per_device(const struct bch_stripe *s) 1 << s->csum_granularity_bits); } -static unsigned stripe_val_u64s(const struct 
bch_stripe *s) +static unsigned stripe_csum_offset(const struct bch_stripe *s, + unsigned dev, unsigned csum_idx) { - unsigned bytes = sizeof(struct bch_stripe) + + unsigned csum_bytes = bch_crc_bytes[s->csum_type]; + + return sizeof(struct bch_stripe) + sizeof(struct bch_extent_ptr) * s->nr_blocks + - bch_crc_bytes[s->csum_type] * s->nr_blocks * stripe_csums_per_device(s); - return DIV_ROUND_UP(bytes, sizeof(u64)); + (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; } -static void *stripe_csum(struct bch_stripe *s, unsigned dev, unsigned csum_idx) +static unsigned stripe_blockcount_offset(const struct bch_stripe *s, + unsigned idx) { - unsigned csum_bytes = bch_crc_bytes[s->csum_type]; - void *csums = s->ptrs + s->nr_blocks; + return stripe_csum_offset(s, s->nr_blocks, 0) + + sizeof(16) * idx; +} - BUG_ON(!csum_bytes); +static unsigned stripe_val_u64s(const struct bch_stripe *s) +{ + return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks), + sizeof(u64)); +} - return csums + (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; +static void *stripe_csum(struct bch_stripe *s, unsigned dev, unsigned csum_idx) +{ + return (void *) s + stripe_csum_offset(s, dev, csum_idx); } const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) @@ -132,7 +142,8 @@ const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) if (bkey_val_bytes(k.k) < sizeof(*s)) return "incorrect value size"; - if (bkey_val_u64s(k.k) != stripe_val_u64s(s)) + if (bkey_val_bytes(k.k) < sizeof(*s) || + bkey_val_u64s(k.k) < stripe_val_u64s(s)) return "incorrect value size"; return NULL; diff --git a/libbcachefs/extents.c b/libbcachefs/extents.c index dc3fbfb..2980416 100644 --- a/libbcachefs/extents.c +++ b/libbcachefs/extents.c @@ -305,8 +305,7 @@ unsigned bch2_extent_is_compressed(struct bkey_s_c k) extent_for_each_ptr_decode(e, p, entry) if (!p.ptr.cached && - p.crc.compression_type != BCH_COMPRESSION_NONE && - p.crc.compressed_size < p.crc.live_size) + p.crc.compression_type != BCH_COMPRESSION_NONE) ret += p.crc.compressed_size; } } @@ -627,48 +626,34 @@ void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct btree *b, { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const struct bch_extent_ptr *ptr; - unsigned seq; const char *err; char buf[160]; struct bucket_mark mark; struct bch_dev *ca; - unsigned replicas = 0; - bool bad; + + bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !bch2_bkey_replicas_marked(c, k, false), c, + "btree key bad (replicas not marked in superblock):\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); + + if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) + return; bkey_for_each_ptr(ptrs, ptr) { ca = bch_dev_bkey_exists(c, ptr->dev); - replicas++; - if (!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags)) - continue; + mark = ptr_bucket_mark(ca, ptr); err = "stale"; - if (ptr_stale(ca, ptr)) + if (gen_after(mark.gen, ptr->gen)) goto err; - do { - seq = read_seqcount_begin(&c->gc_pos_lock); - mark = ptr_bucket_mark(ca, ptr); - - bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && - (mark.data_type != BCH_DATA_BTREE || - mark.dirty_sectors < c->opts.btree_node_size); - } while (read_seqcount_retry(&c->gc_pos_lock, seq)); - err = "inconsistent"; - if (bad) + if (mark.data_type != BCH_DATA_BTREE || + mark.dirty_sectors < c->opts.btree_node_size) goto err; } - if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked(c, k, false)) { - bch2_bkey_val_to_text(&PBUF(buf), c, k); - bch2_fs_bug(c, - "btree key bad 
(replicas not marked in superblock):\n%s", - buf); - return; - } - return; err: bch2_bkey_val_to_text(&PBUF(buf), c, k); @@ -1341,13 +1326,9 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) { struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr; - struct bch_dev *ca; - struct bucket_mark mark; - unsigned seq, stale; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; char buf[160]; - bool bad; - unsigned replicas = 0; /* * XXX: we should be doing most/all of these checks at startup time, @@ -1358,73 +1339,42 @@ void bch2_extent_debugcheck(struct bch_fs *c, struct btree *b, * going to get overwritten during replay) */ - extent_for_each_ptr(e, ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); - replicas++; - - /* - * If journal replay hasn't finished, we might be seeing keys - * that will be overwritten by the time journal replay is done: - */ - if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) - continue; - - stale = 0; - - do { - seq = read_seqcount_begin(&c->gc_pos_lock); - mark = ptr_bucket_mark(ca, ptr); + bch2_fs_bug_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && + !bch2_bkey_replicas_marked(c, e.s_c, false), c, + "extent key bad (replicas not marked in superblock):\n%s", + (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); - /* between mark and bucket gen */ - smp_rmb(); - - stale = ptr_stale(ca, ptr); - - bch2_fs_bug_on(stale && !ptr->cached, c, - "stale dirty pointer"); - - bch2_fs_bug_on(stale > 96, c, - "key too stale: %i", - stale); - - if (stale) - break; - - bad = gc_pos_cmp(c->gc_pos, gc_pos_btree_node(b)) > 0 && - (mark.data_type != BCH_DATA_USER || - !(ptr->cached - ? mark.cached_sectors - : mark.dirty_sectors)); - } while (read_seqcount_retry(&c->gc_pos_lock, seq)); - - if (bad) - goto bad_ptr; - } - - if (replicas > BCH_REPLICAS_MAX) { - bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c); - bch2_fs_bug(c, - "extent key bad (too many replicas: %u): %s", - replicas, buf); + /* + * If journal replay hasn't finished, we might be seeing keys + * that will be overwritten by the time journal replay is done: + */ + if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) return; - } - if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && - !bch2_bkey_replicas_marked(c, e.s_c, false)) { - bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c); - bch2_fs_bug(c, - "extent key bad (replicas not marked in superblock):\n%s", - buf); - return; + extent_for_each_ptr_decode(e, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); + unsigned stale = gen_after(mark.gen, p.ptr.gen); + unsigned disk_sectors = ptr_disk_sectors(p); + unsigned mark_sectors = p.ptr.cached + ? 
mark.cached_sectors + : mark.dirty_sectors; + + bch2_fs_bug_on(stale && !p.ptr.cached, c, + "stale dirty pointer (ptr gen %u bucket %u", + p.ptr.gen, mark.gen); + + bch2_fs_bug_on(stale > 96, c, "key too stale: %i", stale); + + bch2_fs_bug_on(!stale && + (mark.data_type != BCH_DATA_USER || + mark_sectors < disk_sectors), c, + "extent pointer not marked: %s:\n" + "type %u sectors %u < %u", + (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), + mark.data_type, + mark_sectors, disk_sectors); } - - return; - -bad_ptr: - bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c); - bch2_fs_bug(c, "extent pointer bad gc mark: %s:\nbucket %zu " - "gen %i type %u", buf, - PTR_BUCKET_NR(ca, ptr), mark.gen, mark.data_type); } void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, @@ -1495,6 +1445,18 @@ void bch2_extent_crc_append(struct bkey_i_extent *e, __extent_entry_push(e); } +static inline void __extent_entry_insert(struct bkey_i_extent *e, + union bch_extent_entry *dst, + union bch_extent_entry *new) +{ + union bch_extent_entry *end = extent_entry_last(extent_i_to_s(e)); + + memmove_u64s_up((u64 *) dst + extent_entry_u64s(new), + dst, (u64 *) end - (u64 *) dst); + e->k.u64s += extent_entry_u64s(new); + memcpy(dst, new, extent_entry_bytes(new)); +} + void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e, struct extent_ptr_decoded *p) { @@ -1503,8 +1465,10 @@ void bch2_extent_ptr_decoded_append(struct bkey_i_extent *e, unsigned i; extent_for_each_crc(extent_i_to_s(e), crc, pos) - if (!bch2_crc_unpacked_cmp(crc, p->crc)) + if (!bch2_crc_unpacked_cmp(crc, p->crc)) { + pos = extent_entry_next(pos); goto found; + } bch2_extent_crc_append(e, p->crc); pos = extent_entry_last(extent_i_to_s(e)); diff --git a/libbcachefs/extents.h b/libbcachefs/extents.h index e6e9c30..0e6f4a0 100644 --- a/libbcachefs/extents.h +++ b/libbcachefs/extents.h @@ -509,18 +509,6 @@ void bch2_extent_crc_append(struct bkey_i_extent *, void bch2_extent_ptr_decoded_append(struct bkey_i_extent *, struct extent_ptr_decoded *); -static inline void __extent_entry_insert(struct bkey_i_extent *e, - union bch_extent_entry *dst, - union bch_extent_entry *new) -{ - union bch_extent_entry *end = extent_entry_last(extent_i_to_s(e)); - - memmove_u64s_up((u64 *) dst + extent_entry_u64s(new), - dst, (u64 *) end - (u64 *) dst); - e->k.u64s += extent_entry_u64s(new); - memcpy(dst, new, extent_entry_bytes(new)); -} - static inline void __extent_entry_push(struct bkey_i_extent *e) { union bch_extent_entry *entry = extent_entry_last(extent_i_to_s(e)); diff --git a/libbcachefs/extents_types.h b/libbcachefs/extents_types.h index efd72e2..6d42841 100644 --- a/libbcachefs/extents_types.h +++ b/libbcachefs/extents_types.h @@ -4,14 +4,14 @@ #include "bcachefs_format.h" struct bch_extent_crc_unpacked { + u32 compressed_size; + u32 uncompressed_size; + u32 live_size; + u8 csum_type; u8 compression_type; - u16 compressed_size; - u16 uncompressed_size; - u16 offset; - u16 live_size; u16 nonce; diff --git a/libbcachefs/fs.c b/libbcachefs/fs.c index 67b0dd3..13670a6 100644 --- a/libbcachefs/fs.c +++ b/libbcachefs/fs.c @@ -341,8 +341,6 @@ retry: if (unlikely(ret)) goto err_trans; - atomic_long_inc(&c->nr_inodes); - if (!tmpfile) { bch2_inode_update_after_write(c, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); @@ -1333,9 +1331,6 @@ static void bch2_evict_inode(struct inode *vinode) bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, KEY_TYPE_QUOTA_WARN); bch2_inode_rm(c, inode->v.i_ino); - - WARN_ONCE(atomic_long_dec_return(&c->nr_inodes) < 0, - "nr_inodes < 0"); } } @@ 
-1343,18 +1338,16 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct bch_fs *c = sb->s_fs_info; - struct bch_fs_usage usage = bch2_fs_usage_read(c); - u64 hidden_metadata = usage.buckets[BCH_DATA_SB] + - usage.buckets[BCH_DATA_JOURNAL]; + struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); unsigned shift = sb->s_blocksize_bits - 9; u64 fsid; buf->f_type = BCACHEFS_STATFS_MAGIC; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = (c->capacity - hidden_metadata) >> shift; - buf->f_bfree = (c->capacity - bch2_fs_sectors_used(c, usage)) >> shift; + buf->f_blocks = usage.capacity >> shift; + buf->f_bfree = (usage.capacity - usage.used) >> shift; buf->f_bavail = buf->f_bfree; - buf->f_files = atomic_long_read(&c->nr_inodes); + buf->f_files = usage.nr_inodes; buf->f_ffree = U64_MAX; fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ diff --git a/libbcachefs/fsck.c b/libbcachefs/fsck.c index 5525af8..955ab8b 100644 --- a/libbcachefs/fsck.c +++ b/libbcachefs/fsck.c @@ -1313,9 +1313,6 @@ peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); BUG_ON(ret == -EINTR); if (ret) break; - - if (link->count) - atomic_long_inc(&c->nr_inodes); } else { /* Should have been caught by dirents pass: */ need_fsck_err_on(link->count, c, @@ -1379,7 +1376,6 @@ static int check_inodes_fast(struct bch_fs *c) struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_inode inode; - unsigned long nr_inodes = 0; int ret = 0; for_each_btree_key(&iter, c, BTREE_ID_INODES, POS_MIN, 0, k) { @@ -1388,9 +1384,6 @@ static int check_inodes_fast(struct bch_fs *c) inode = bkey_s_c_to_inode(k); - if (!(inode.v->bi_flags & BCH_INODE_UNLINKED)) - nr_inodes++; - if (inode.v->bi_flags & (BCH_INODE_I_SIZE_DIRTY| BCH_INODE_I_SECTORS_DIRTY| @@ -1404,7 +1397,6 @@ static int check_inodes_fast(struct bch_fs *c) break; } } - atomic_long_set(&c->nr_inodes, nr_inodes); fsck_err: return bch2_btree_iter_unlock(&iter) ?: ret; } diff --git a/libbcachefs/io.c b/libbcachefs/io.c index 98eca9a..ede1ac1 100644 --- a/libbcachefs/io.c +++ b/libbcachefs/io.c @@ -408,6 +408,7 @@ static void init_append_extent(struct bch_write_op *op, struct bch_extent_crc_unpacked crc) { struct bkey_i_extent *e = bkey_extent_init(op->insert_keys.top); + struct bch_extent_ptr *ptr; op->pos.offset += crc.uncompressed_size; e->k.p = op->pos; @@ -418,6 +419,10 @@ static void init_append_extent(struct bch_write_op *op, bch2_alloc_sectors_append_ptrs(op->c, wp, &e->k_i, crc.compressed_size); + if (op->flags & BCH_WRITE_CACHED) + extent_for_each_ptr(extent_i_to_s(e), ptr) + ptr->cached = true; + bch2_keylist_push(&op->insert_keys); } @@ -1720,9 +1725,9 @@ noclone: bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); - percpu_down_read_preempt_disable(&c->usage_lock); + percpu_down_read_preempt_disable(&c->mark_lock); bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); - percpu_up_read_preempt_enable(&c->usage_lock); + percpu_up_read_preempt_enable(&c->mark_lock); if (likely(!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT)))) { bio_inc_remaining(&orig->bio); diff --git a/libbcachefs/journal.c b/libbcachefs/journal.c index 47cfd50..261149a 100644 --- a/libbcachefs/journal.c +++ b/libbcachefs/journal.c @@ -342,7 +342,7 @@ static int __journal_res_get(struct journal *j, struct journal_res *res, struct journal_buf *buf; int ret; retry: - if (journal_res_get_fast(j, res)) + if (journal_res_get_fast(j, res, flags)) return 0; spin_lock(&j->lock); @@ -351,7 +351,7 @@ retry: * that just did 
journal_entry_open() and call journal_entry_close() * unnecessarily */ - if (journal_res_get_fast(j, res)) { + if (journal_res_get_fast(j, res, flags)) { spin_unlock(&j->lock); return 0; } @@ -377,11 +377,11 @@ retry: return -EROFS; case JOURNAL_ENTRY_INUSE: /* - * haven't finished writing out the previous entry, can't start - * another yet: - * signal to caller which sequence number we're trying to open: + * The current journal entry is still open, but we failed to get + * a journal reservation because there's not enough space in it, + * and we can't close it and start another because we haven't + * finished writing out the previous entry: */ - res->seq = journal_cur_seq(j) + 1; spin_unlock(&j->lock); trace_journal_entry_full(c); goto blocked; @@ -393,8 +393,6 @@ retry: /* We now have a new, closed journal buf - see if we can open it: */ ret = journal_entry_open(j); - if (!ret) - res->seq = journal_cur_seq(j); spin_unlock(&j->lock); if (ret < 0) @@ -755,7 +753,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, } if (c) { - percpu_down_read_preempt_disable(&c->usage_lock); + percpu_down_read_preempt_disable(&c->mark_lock); spin_lock(&c->journal.lock); } else { preempt_disable(); @@ -783,7 +781,7 @@ static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, if (c) { spin_unlock(&c->journal.lock); - percpu_up_read_preempt_enable(&c->usage_lock); + percpu_up_read_preempt_enable(&c->mark_lock); } else { preempt_enable(); } diff --git a/libbcachefs/journal.h b/libbcachefs/journal.h index 0595597..3a08374 100644 --- a/libbcachefs/journal.h +++ b/libbcachefs/journal.h @@ -271,8 +271,12 @@ static inline void bch2_journal_res_put(struct journal *j, int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, unsigned); +#define JOURNAL_RES_GET_NONBLOCK (1 << 0) +#define JOURNAL_RES_GET_CHECK (1 << 1) + static inline int journal_res_get_fast(struct journal *j, - struct journal_res *res) + struct journal_res *res, + unsigned flags) { union journal_res_state old, new; u64 v = atomic64_read(&j->reservations.counter); @@ -287,6 +291,9 @@ static inline int journal_res_get_fast(struct journal *j, if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) return 0; + if (flags & JOURNAL_RES_GET_CHECK) + return 1; + new.cur_entry_offset += res->u64s; journal_state_inc(&new); } while ((v = atomic64_cmpxchg(&j->reservations.counter, @@ -299,8 +306,6 @@ static inline int journal_res_get_fast(struct journal *j, return 1; } -#define JOURNAL_RES_GET_NONBLOCK (1 << 0) - static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, unsigned u64s, unsigned flags) { @@ -311,15 +316,17 @@ static inline int bch2_journal_res_get(struct journal *j, struct journal_res *re res->u64s = u64s; - if (journal_res_get_fast(j, res)) + if (journal_res_get_fast(j, res, flags)) goto out; ret = bch2_journal_res_get_slowpath(j, res, flags); if (ret) return ret; out: - lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_); - EBUG_ON(!res->ref); + if (!(flags & JOURNAL_RES_GET_CHECK)) { + lock_acquire_shared(&j->res_map, 0, 0, NULL, _THIS_IP_); + EBUG_ON(!res->ref); + } return 0; } diff --git a/libbcachefs/journal_io.c b/libbcachefs/journal_io.c index 05500bf..6c16aab 100644 --- a/libbcachefs/journal_io.c +++ b/libbcachefs/journal_io.c @@ -1036,7 +1036,7 @@ done: spin_unlock(&j->lock); rcu_read_unlock(); - return replicas >= replicas_want ? 0 : -EROFS; + return replicas >= c->opts.metadata_replicas_required ? 
0 : -EROFS; } static void journal_write_compact(struct jset *jset) diff --git a/libbcachefs/journal_reclaim.c b/libbcachefs/journal_reclaim.c index 9ac65d0..d8d1b6b 100644 --- a/libbcachefs/journal_reclaim.c +++ b/libbcachefs/journal_reclaim.c @@ -74,6 +74,25 @@ void bch2_journal_pin_drop(struct journal *j, spin_unlock(&j->lock); } +void bch2_journal_pin_update(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + spin_lock(&j->lock); + + if (pin->seq != seq) { + __journal_pin_drop(j, pin); + __journal_pin_add(j, seq, pin, flush_fn); + } else { + struct journal_entry_pin_list *pin_list = + journal_seq_pin(j, seq); + + list_move(&pin->list, &pin_list->list); + } + + spin_unlock(&j->lock); +} + void bch2_journal_pin_add_if_older(struct journal *j, struct journal_entry_pin *src_pin, struct journal_entry_pin *pin, diff --git a/libbcachefs/journal_reclaim.h b/libbcachefs/journal_reclaim.h index f7dcbfd..287590c 100644 --- a/libbcachefs/journal_reclaim.h +++ b/libbcachefs/journal_reclaim.h @@ -18,6 +18,8 @@ journal_seq_pin(struct journal *j, u64 seq) void bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, journal_pin_flush_fn); +void bch2_journal_pin_update(struct journal *, u64, struct journal_entry_pin *, + journal_pin_flush_fn); void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); void bch2_journal_pin_add_if_older(struct journal *, struct journal_entry_pin *, diff --git a/libbcachefs/migrate.c b/libbcachefs/migrate.c index 3f26f45..b219865 100644 --- a/libbcachefs/migrate.c +++ b/libbcachefs/migrate.c @@ -118,8 +118,8 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; struct bkey_i_btree_ptr *new_key; retry: - if (!bch2_extent_has_device(bkey_i_to_s_c_extent(&b->key), - dev_idx)) { + if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), + dev_idx)) { /* * we might have found a btree node key we * needed to update, and then tried to update it diff --git a/libbcachefs/move.c b/libbcachefs/move.c index 8c95aa9..80909ae 100644 --- a/libbcachefs/move.c +++ b/libbcachefs/move.c @@ -128,13 +128,15 @@ static int bch2_migrate_index_update(struct bch_write_op *op) op->opts.data_replicas); /* - * It's possible we race, and for whatever reason the extent now - * has fewer replicas than when we last looked at it - meaning - * we need to get a disk reservation here: + * If we're not fully overwriting @k, and it's compressed, we + * need a reservation for all the pointers in @insert */ nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)) - - (bch2_bkey_nr_dirty_ptrs(k) + m->nr_ptrs_reserved); - if (nr > 0) { + m->nr_ptrs_reserved; + + if (insert->k.size < k.k->size && + bch2_extent_is_compressed(k) && + nr > 0) { /* * can't call bch2_disk_reservation_add() with btree * locks held, at least not without a song and dance @@ -242,8 +244,16 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, switch (data_cmd) { case DATA_ADD_REPLICAS: { + /* + * DATA_ADD_REPLICAS is used for moving data to a different + * device in the background, and due to compression the new copy + * might take up more space than the old copy: + */ +#if 0 int nr = (int) io_opts.data_replicas - bch2_bkey_nr_dirty_ptrs(k); +#endif + int nr = (int) io_opts.data_replicas; if (nr > 0) { m->op.nr_replicas = m->nr_ptrs_reserved = nr; diff --git a/libbcachefs/recovery.c b/libbcachefs/recovery.c index e9e4a1a..eae38ea 100644 --- a/libbcachefs/recovery.c +++ 
b/libbcachefs/recovery.c @@ -130,8 +130,7 @@ int bch2_fs_recovery(struct bch_fs *c) int ret; mutex_lock(&c->sb_lock); - if (!rcu_dereference_protected(c->replicas, - lockdep_is_held(&c->sb_lock))->nr) { + if (!c->replicas.entries) { bch_info(c, "building replicas info"); set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); } @@ -374,8 +373,6 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - atomic_long_set(&c->nr_inodes, 2); - if (enabled_qtypes(c)) { ret = bch2_fs_quota_read(c); if (ret) diff --git a/libbcachefs/replicas.c b/libbcachefs/replicas.c index 6ab4e36..1d3161e 100644 --- a/libbcachefs/replicas.c +++ b/libbcachefs/replicas.c @@ -148,35 +148,31 @@ static inline void devlist_to_replicas(struct bch_devs_list devs, replicas_entry_sort(e); } -static struct bch_replicas_cpu * +static struct bch_replicas_cpu cpu_replicas_add_entry(struct bch_replicas_cpu *old, struct bch_replicas_entry *new_entry) { - struct bch_replicas_cpu *new; - unsigned i, nr, entry_size; - - entry_size = max_t(unsigned, old->entry_size, - replicas_entry_bytes(new_entry)); - nr = old->nr + 1; - - new = kzalloc(sizeof(struct bch_replicas_cpu) + - nr * entry_size, GFP_NOIO); - if (!new) - return NULL; + unsigned i; + struct bch_replicas_cpu new = { + .nr = old->nr + 1, + .entry_size = max_t(unsigned, old->entry_size, + replicas_entry_bytes(new_entry)), + }; - new->nr = nr; - new->entry_size = entry_size; + new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO); + if (!new.entries) + return new; for (i = 0; i < old->nr; i++) - memcpy(cpu_replicas_entry(new, i), + memcpy(cpu_replicas_entry(&new, i), cpu_replicas_entry(old, i), old->entry_size); - memcpy(cpu_replicas_entry(new, old->nr), + memcpy(cpu_replicas_entry(&new, old->nr), new_entry, replicas_entry_bytes(new_entry)); - bch2_cpu_replicas_sort(new); + bch2_cpu_replicas_sort(&new); return new; } @@ -193,16 +189,14 @@ static bool replicas_has_entry(struct bch_fs *c, struct bch_replicas_entry *search, bool check_gc_replicas) { - struct bch_replicas_cpu *r, *gc_r; bool marked; - rcu_read_lock(); - r = rcu_dereference(c->replicas); - marked = __replicas_has_entry(r, search) && + percpu_down_read_preempt_disable(&c->mark_lock); + marked = __replicas_has_entry(&c->replicas, search) && (!check_gc_replicas || - likely(!(gc_r = rcu_dereference(c->replicas_gc))) || - __replicas_has_entry(gc_r, search)); - rcu_read_unlock(); + likely((!c->replicas_gc.entries)) || + __replicas_has_entry(&c->replicas_gc, search)); + percpu_up_read_preempt_enable(&c->mark_lock); return marked; } @@ -211,54 +205,55 @@ noinline static int bch2_mark_replicas_slowpath(struct bch_fs *c, struct bch_replicas_entry *new_entry) { - struct bch_replicas_cpu *old_gc, *new_gc = NULL, *old_r, *new_r = NULL; + struct bch_replicas_cpu new_r, new_gc; int ret = -ENOMEM; + memset(&new_r, 0, sizeof(new_r)); + memset(&new_gc, 0, sizeof(new_gc)); + mutex_lock(&c->sb_lock); - old_gc = rcu_dereference_protected(c->replicas_gc, - lockdep_is_held(&c->sb_lock)); - if (old_gc && !__replicas_has_entry(old_gc, new_entry)) { - new_gc = cpu_replicas_add_entry(old_gc, new_entry); - if (!new_gc) + if (c->replicas_gc.entries && + !__replicas_has_entry(&c->replicas_gc, new_entry)) { + new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry); + if (!new_gc.entries) goto err; } - old_r = rcu_dereference_protected(c->replicas, - lockdep_is_held(&c->sb_lock)); - if (!__replicas_has_entry(old_r, new_entry)) { - new_r = cpu_replicas_add_entry(old_r, new_entry); - if (!new_r) + if (!__replicas_has_entry(&c->replicas, 
new_entry)) { + new_r = cpu_replicas_add_entry(&c->replicas, new_entry); + if (!new_r.entries) goto err; - ret = bch2_cpu_replicas_to_sb_replicas(c, new_r); + ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); if (ret) goto err; } + if (!new_r.entries && + !new_gc.entries) + goto out; + /* allocations done, now commit: */ - if (new_r) + if (new_r.entries) bch2_write_super(c); /* don't update in memory replicas until changes are persistent */ - - if (new_gc) { - rcu_assign_pointer(c->replicas_gc, new_gc); - kfree_rcu(old_gc, rcu); - } - - if (new_r) { - rcu_assign_pointer(c->replicas, new_r); - kfree_rcu(old_r, rcu); - } - - mutex_unlock(&c->sb_lock); - return 0; + percpu_down_write(&c->mark_lock); + if (new_r.entries) + swap(new_r, c->replicas); + if (new_gc.entries) + swap(new_gc, c->replicas_gc); + percpu_up_write(&c->mark_lock); +out: + ret = 0; err: mutex_unlock(&c->sb_lock); - kfree(new_gc); - kfree(new_r); + + kfree(new_r.entries); + kfree(new_gc.entries); + return ret; } @@ -311,20 +306,14 @@ int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) int bch2_replicas_gc_end(struct bch_fs *c, int ret) { - struct bch_replicas_cpu *new_r, *old_r; - lockdep_assert_held(&c->replicas_gc_lock); mutex_lock(&c->sb_lock); - new_r = rcu_dereference_protected(c->replicas_gc, - lockdep_is_held(&c->sb_lock)); - rcu_assign_pointer(c->replicas_gc, NULL); - if (ret) goto err; - if (bch2_cpu_replicas_to_sb_replicas(c, new_r)) { + if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) { ret = -ENOSPC; goto err; } @@ -332,51 +321,54 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret) bch2_write_super(c); /* don't update in memory replicas until changes are persistent */ +err: + percpu_down_write(&c->mark_lock); + if (!ret) + swap(c->replicas, c->replicas_gc); - old_r = rcu_dereference_protected(c->replicas, - lockdep_is_held(&c->sb_lock)); + kfree(c->replicas_gc.entries); + c->replicas_gc.entries = NULL; + percpu_up_write(&c->mark_lock); - rcu_assign_pointer(c->replicas, new_r); - kfree_rcu(old_r, rcu); -out: mutex_unlock(&c->sb_lock); return ret; -err: - kfree_rcu(new_r, rcu); - goto out; } int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) { - struct bch_replicas_cpu *dst, *src; struct bch_replicas_entry *e; + unsigned i = 0; lockdep_assert_held(&c->replicas_gc_lock); mutex_lock(&c->sb_lock); - BUG_ON(c->replicas_gc); + BUG_ON(c->replicas_gc.entries); - src = rcu_dereference_protected(c->replicas, - lockdep_is_held(&c->sb_lock)); + c->replicas_gc.nr = 0; + c->replicas_gc.entry_size = 0; - dst = kzalloc(sizeof(struct bch_replicas_cpu) + - src->nr * src->entry_size, GFP_NOIO); - if (!dst) { + for_each_cpu_replicas_entry(&c->replicas, e) + if (!((1 << e->data_type) & typemask)) { + c->replicas_gc.nr++; + c->replicas_gc.entry_size = + max_t(unsigned, c->replicas_gc.entry_size, + replicas_entry_bytes(e)); + } + + c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, + c->replicas_gc.entry_size, + GFP_NOIO); + if (!c->replicas_gc.entries) { mutex_unlock(&c->sb_lock); return -ENOMEM; } - dst->nr = 0; - dst->entry_size = src->entry_size; - - for_each_cpu_replicas_entry(src, e) + for_each_cpu_replicas_entry(&c->replicas, e) if (!((1 << e->data_type) & typemask)) - memcpy(cpu_replicas_entry(dst, dst->nr++), - e, dst->entry_size); - - bch2_cpu_replicas_sort(dst); + memcpy(cpu_replicas_entry(&c->replicas_gc, i++), + e, c->replicas_gc.entry_size); - rcu_assign_pointer(c->replicas_gc, dst); + bch2_cpu_replicas_sort(&c->replicas_gc); mutex_unlock(&c->sb_lock); return 0; @@ -384,11 +376,11 @@ 
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) /* Replicas tracking - superblock: */ -static struct bch_replicas_cpu * -__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r) +static int +__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, + struct bch_replicas_cpu *cpu_r) { struct bch_replicas_entry *e, *dst; - struct bch_replicas_cpu *cpu_r; unsigned nr = 0, entry_size = 0, idx = 0; for_each_replicas_entry(sb_r, e) { @@ -397,10 +389,9 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r) nr++; } - cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) + - nr * entry_size, GFP_NOIO); - if (!cpu_r) - return NULL; + cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); + if (!cpu_r->entries) + return -ENOMEM; cpu_r->nr = nr; cpu_r->entry_size = entry_size; @@ -411,14 +402,14 @@ __bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r) replicas_entry_sort(dst); } - return cpu_r; + return 0; } -static struct bch_replicas_cpu * -__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r) +static int +__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, + struct bch_replicas_cpu *cpu_r) { struct bch_replicas_entry_v0 *e; - struct bch_replicas_cpu *cpu_r; unsigned nr = 0, entry_size = 0, idx = 0; for_each_replicas_entry(sb_r, e) { @@ -430,10 +421,9 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r) entry_size += sizeof(struct bch_replicas_entry) - sizeof(struct bch_replicas_entry_v0); - cpu_r = kzalloc(sizeof(struct bch_replicas_cpu) + - nr * entry_size, GFP_NOIO); - if (!cpu_r) - return NULL; + cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); + if (!cpu_r->entries) + return -ENOMEM; cpu_r->nr = nr; cpu_r->entry_size = entry_size; @@ -449,31 +439,31 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r) replicas_entry_sort(dst); } - return cpu_r; + return 0; } int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) { struct bch_sb_field_replicas *sb_v1; struct bch_sb_field_replicas_v0 *sb_v0; - struct bch_replicas_cpu *cpu_r, *old_r; + struct bch_replicas_cpu new_r = { 0, 0, NULL }; + int ret = 0; if ((sb_v1 = bch2_sb_get_replicas(c->disk_sb.sb))) - cpu_r = __bch2_sb_replicas_to_cpu_replicas(sb_v1); + ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r); else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb))) - cpu_r = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0); - else - cpu_r = kzalloc(sizeof(struct bch_replicas_cpu), GFP_NOIO); + ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r); - if (!cpu_r) + if (ret) return -ENOMEM; - bch2_cpu_replicas_sort(cpu_r); + bch2_cpu_replicas_sort(&new_r); + + percpu_down_write(&c->mark_lock); + swap(c->replicas, new_r); + percpu_up_write(&c->mark_lock); - old_r = rcu_dereference_check(c->replicas, lockdep_is_held(&c->sb_lock)); - rcu_assign_pointer(c->replicas, cpu_r); - if (old_r) - kfree_rcu(old_r, rcu); + kfree(new_r.entries); return 0; } @@ -588,7 +578,7 @@ static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_fi { struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); struct bch_sb_field_members *mi = bch2_sb_get_members(sb); - struct bch_replicas_cpu *cpu_r = NULL; + struct bch_replicas_cpu cpu_r = { .entries = NULL }; struct bch_replicas_entry *e; const char *err; unsigned i; @@ -615,13 +605,12 @@ static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_fi } err = "cannot allocate memory"; - cpu_r = 
__bch2_sb_replicas_to_cpu_replicas(sb_r); - if (!cpu_r) + if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r)) goto err; - err = check_dup_replicas_entries(cpu_r); + err = check_dup_replicas_entries(&cpu_r); err: - kfree(cpu_r); + kfree(cpu_r.entries); return err; } @@ -651,7 +640,7 @@ static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb { struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); struct bch_sb_field_members *mi = bch2_sb_get_members(sb); - struct bch_replicas_cpu *cpu_r = NULL; + struct bch_replicas_cpu cpu_r = { .entries = NULL }; struct bch_replicas_entry_v0 *e; const char *err; unsigned i; @@ -672,13 +661,12 @@ static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb } err = "cannot allocate memory"; - cpu_r = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r); - if (!cpu_r) + if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r)) goto err; - err = check_dup_replicas_entries(cpu_r); + err = check_dup_replicas_entries(&cpu_r); err: - kfree(cpu_r); + kfree(cpu_r.entries); return err; } @@ -733,7 +721,6 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c, { struct bch_sb_field_members *mi; struct bch_replicas_entry *e; - struct bch_replicas_cpu *r; unsigned i, nr_online, nr_offline; struct replicas_status ret; @@ -743,10 +730,10 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c, ret.replicas[i].redundancy = INT_MAX; mi = bch2_sb_get_members(c->disk_sb.sb); - rcu_read_lock(); - r = rcu_dereference(c->replicas); - for_each_cpu_replicas_entry(r, e) { + percpu_down_read_preempt_disable(&c->mark_lock); + + for_each_cpu_replicas_entry(&c->replicas, e) { if (e->data_type >= ARRAY_SIZE(ret.replicas)) panic("e %p data_type %u\n", e, e->data_type); @@ -771,7 +758,7 @@ struct replicas_status __bch2_replicas_status(struct bch_fs *c, nr_offline); } - rcu_read_unlock(); + percpu_up_read_preempt_enable(&c->mark_lock); for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) if (ret.replicas[i].redundancy == INT_MAX) @@ -820,18 +807,16 @@ int bch2_replicas_online(struct bch_fs *c, bool meta) unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) { struct bch_replicas_entry *e; - struct bch_replicas_cpu *r; unsigned i, ret = 0; - rcu_read_lock(); - r = rcu_dereference(c->replicas); + percpu_down_read_preempt_disable(&c->mark_lock); - for_each_cpu_replicas_entry(r, e) + for_each_cpu_replicas_entry(&c->replicas, e) for (i = 0; i < e->nr_devs; i++) if (e->devs[i] == ca->dev_idx) ret |= 1 << e->data_type; - rcu_read_unlock(); + percpu_up_read_preempt_enable(&c->mark_lock); return ret; } diff --git a/libbcachefs/replicas_types.h b/libbcachefs/replicas_types.h index 3061840..0535b1d 100644 --- a/libbcachefs/replicas_types.h +++ b/libbcachefs/replicas_types.h @@ -2,10 +2,9 @@ #define _BCACHEFS_REPLICAS_TYPES_H struct bch_replicas_cpu { - struct rcu_head rcu; unsigned nr; unsigned entry_size; - struct bch_replicas_entry entries[]; + struct bch_replicas_entry *entries; }; #endif /* _BCACHEFS_REPLICAS_TYPES_H */ diff --git a/libbcachefs/super.c b/libbcachefs/super.c index b33117d..0212832 100644 --- a/libbcachefs/super.c +++ b/libbcachefs/super.c @@ -405,8 +405,9 @@ static void bch2_fs_free(struct bch_fs *c) bch2_io_clock_exit(&c->io_clock[WRITE]); bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_compress_exit(c); - percpu_free_rwsem(&c->usage_lock); + percpu_free_rwsem(&c->mark_lock); free_percpu(c->usage[0]); + free_percpu(c->pcpu); mempool_exit(&c->btree_iters_pool); mempool_exit(&c->btree_bounce_pool); 
bioset_exit(&c->btree_bio); @@ -414,7 +415,8 @@ static void bch2_fs_free(struct bch_fs *c) mempool_exit(&c->btree_reserve_pool); mempool_exit(&c->fill_iter); percpu_ref_exit(&c->writes); - kfree(rcu_dereference_protected(c->replicas, 1)); + kfree(c->replicas.entries); + kfree(c->replicas_gc.entries); kfree(rcu_dereference_protected(c->disk_groups, 1)); if (c->copygc_wq) @@ -597,6 +599,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) bch2_fs_btree_cache_init_early(&c->btree_cache); + if (percpu_init_rwsem(&c->mark_lock)) + goto err; + mutex_lock(&c->sb_lock); if (bch2_sb_to_fs(c, sb)) { @@ -640,7 +645,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) offsetof(struct btree_write_bio, wbio.bio)), BIOSET_NEED_BVECS) || !(c->usage[0] = alloc_percpu(struct bch_fs_usage)) || - percpu_init_rwsem(&c->usage_lock) || + !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, btree_bytes(c)) || mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, diff --git a/libbcachefs/sysfs.c b/libbcachefs/sysfs.c index 42e09f5..8831651 100644 --- a/libbcachefs/sysfs.c +++ b/libbcachefs/sysfs.c @@ -258,7 +258,7 @@ static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) stats.buckets[type]); pr_buf(&out, "online reserved:\t%llu\n", - stats.online_reserved); + stats.s.online_reserved); return out.pos - buf; }
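
The __ptr_disk_sectors()/ptr_disk_sectors() helpers added to libbcachefs/buckets.h above charge a compressed extent pointer for its live portion scaled by the extent's compression ratio, rounded up and never less than one sector; uncompressed pointers are simply charged their live size. A minimal standalone sketch of that arithmetic (illustrative only -- the function name and example numbers below are not from the tree):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Mirrors __ptr_disk_sectors(): live sectors scaled by compressed/uncompressed size. */
static unsigned ptr_disk_sectors_sketch(unsigned live_size,
					unsigned compressed_size,
					unsigned uncompressed_size,
					unsigned compression_type)
{
	unsigned v;

	if (!live_size || !compression_type)
		return live_size;

	v = DIV_ROUND_UP(live_size * compressed_size, uncompressed_size);
	return v ? v : 1;	/* the max(1U, ...) in the real helper */
}

int main(void)
{
	/* 128 live sectors of a 256-sector extent compressed down to 64 sectors -> 32 */
	printf("%u\n", ptr_disk_sectors_sketch(128, 64, 256, 1));
	/* uncompressed data is charged at its live size -> 128 */
	printf("%u\n", ptr_disk_sectors_sketch(128, 128, 128, 0));
	return 0;
}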
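
The journal.h changes above give journal_res_get_fast() a flags argument and add JOURNAL_RES_GET_CHECK, which turns the lockless fast path into a dry run: it reports whether a reservation of the requested size would currently fit in the open journal entry, without advancing the entry offset or taking a reference. A self-contained sketch of that pattern (simplified, assumed types and names rather than the in-tree structures, with a plain C11 atomic standing in for the packed journal_res_state):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define RES_GET_CHECK	(1 << 1)	/* stands in for JOURNAL_RES_GET_CHECK */

struct sketch_journal {
	_Atomic unsigned	cur_entry_offset;	/* u64s already reserved */
	unsigned		cur_entry_u64s;		/* capacity of the open entry */
};

static bool res_get_fast(struct sketch_journal *j, unsigned u64s, unsigned flags)
{
	unsigned old = atomic_load(&j->cur_entry_offset);

	do {
		/* not enough room: caller falls back to the slow path */
		if (old + u64s > j->cur_entry_u64s)
			return false;

		/* dry run: report success without consuming space */
		if (flags & RES_GET_CHECK)
			return true;
	} while (!atomic_compare_exchange_weak(&j->cur_entry_offset,
					       &old, old + u64s));

	return true;
}

int main(void)
{
	struct sketch_journal j = { .cur_entry_u64s = 512 };

	printf("check: %d offset %u\n", res_get_fast(&j, 64, RES_GET_CHECK),
	       atomic_load(&j.cur_entry_offset));	/* 1, offset still 0 */
	printf("get:   %d offset %u\n", res_get_fast(&j, 64, 0),
	       atomic_load(&j.cur_entry_offset));	/* 1, offset now 64 */
	return 0;
}

As in the patch, bch2_journal_res_get() only takes the lockdep annotation and asserts res->ref when the CHECK flag is not set, since a dry run never acquires anything.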