X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libbcachefs%2Falloc_background.c;h=48971fcf2d5bb393c818732e3b0f1be50c179ebc;hb=ce906d661e63d4318b9f26ec145f2ff5fddf5162;hp=b2d1b8f9c9b8e86d477fe2b219ad77aa854d3ff8;hpb=ac0d08877aa87a9cdf493bc6f336c391fb4e04a0;p=bcachefs-tools-debian diff --git a/libbcachefs/alloc_background.c b/libbcachefs/alloc_background.c index b2d1b8f..48971fc 100644 --- a/libbcachefs/alloc_background.c +++ b/libbcachefs/alloc_background.c @@ -4,6 +4,7 @@ #include "alloc_foreground.h" #include "btree_cache.h" #include "btree_io.h" +#include "btree_key_cache.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_gc.h" @@ -13,6 +14,7 @@ #include "ec.h" #include "error.h" #include "recovery.h" +#include "varint.h" #include #include @@ -23,15 +25,12 @@ #include #include -static const char * const bch2_alloc_field_names[] = { -#define x(name, bytes) #name, - BCH_ALLOC_FIELDS() +static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { +#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, + BCH_ALLOC_FIELDS_V1() #undef x - NULL }; -static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); - /* Ratelimiting/PD controllers */ static void pd_controllers_update(struct work_struct *work) @@ -40,39 +39,36 @@ static void pd_controllers_update(struct work_struct *work) struct bch_fs, pd_controllers_update); struct bch_dev *ca; + s64 free = 0, fragmented = 0; unsigned i; for_each_member_device(ca, c, i) { - struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); + struct bch_dev_usage stats = bch2_dev_usage_read(ca); - u64 free = bucket_to_sector(ca, + free += bucket_to_sector(ca, __dev_buckets_free(ca, stats)) << 9; /* * Bytes of internal fragmentation, which can be * reclaimed by copy GC */ - s64 fragmented = (bucket_to_sector(ca, - stats.buckets[BCH_DATA_USER] + - stats.buckets[BCH_DATA_CACHED]) - - (stats.sectors[BCH_DATA_USER] + - stats.sectors[BCH_DATA_CACHED])) << 9; - - fragmented = max(0LL, fragmented); - - bch2_pd_controller_update(&ca->copygc_pd, - free, fragmented, -1); + fragmented += max_t(s64, 0, (bucket_to_sector(ca, + stats.d[BCH_DATA_user].buckets + + stats.d[BCH_DATA_cached].buckets) - + (stats.d[BCH_DATA_user].sectors + + stats.d[BCH_DATA_cached].sectors)) << 9); } + bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1); schedule_delayed_work(&c->pd_controllers_update, c->pd_controllers_update_seconds * HZ); } /* Persistent alloc info: */ -static inline u64 get_alloc_field(const struct bch_alloc *a, - const void **p, unsigned field) +static inline u64 alloc_field_v1_get(const struct bch_alloc *a, + const void **p, unsigned field) { - unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; + unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; u64 v; if (!(a->fields & (1 << field))) @@ -99,10 +95,10 @@ static inline u64 get_alloc_field(const struct bch_alloc *a, return v; } -static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, - unsigned field, u64 v) +static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p, + unsigned field, u64 v) { - unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; + unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; if (!v) return; @@ -129,55 +125,127 @@ static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, *p += bytes; } -struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) +static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out, + struct bkey_s_c k) { - struct bkey_alloc_unpacked ret = { .gen = 0 }; + const struct bch_alloc *in = bkey_s_c_to_alloc(k).v; + const void *d = in->data; + unsigned idx = 0; + + out->gen = in->gen; - if (k.k->type == KEY_TYPE_alloc) { - const struct bch_alloc *a = bkey_s_c_to_alloc(k).v; - const void *d = a->data; - unsigned idx = 0; +#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++); + BCH_ALLOC_FIELDS_V1() +#undef x +} - ret.gen = a->gen; +static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, + struct bkey_s_c k) +{ + struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k); + const u8 *in = a.v->data; + const u8 *end = bkey_val_end(a); + unsigned fieldnr = 0; + int ret; + u64 v; -#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++); - BCH_ALLOC_FIELDS() + out->gen = a.v->gen; + out->oldest_gen = a.v->oldest_gen; + out->data_type = a.v->data_type; + +#define x(_name, _bits) \ + if (fieldnr < a.v->nr_fields) { \ + ret = bch2_varint_decode(in, end, &v); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + } else { \ + v = 0; \ + } \ + out->_name = v; \ + if (v != out->_name) \ + return -1; \ + fieldnr++; + + BCH_ALLOC_FIELDS_V2() #undef x - } - return ret; + return 0; } -void bch2_alloc_pack(struct bkey_i_alloc *dst, - const struct bkey_alloc_unpacked src) +static void bch2_alloc_pack_v2(struct bkey_alloc_buf *dst, + const struct bkey_alloc_unpacked src) { - unsigned idx = 0; - void *d = dst->v.data; + struct bkey_i_alloc_v2 *a = bkey_alloc_v2_init(&dst->k); + unsigned nr_fields = 0, last_nonzero_fieldnr = 0; + u8 *out = a->v.data; + u8 *end = (void *) &dst[1]; + u8 *last_nonzero_field = out; unsigned bytes; - dst->v.fields = 0; - dst->v.gen = src.gen; + a->k.p = POS(src.dev, src.bucket); + a->v.gen = src.gen; + a->v.oldest_gen = src.oldest_gen; + a->v.data_type = src.data_type; + +#define x(_name, _bits) \ + nr_fields++; \ + \ + if (src._name) { \ + out += bch2_varint_encode(out, src._name); \ + \ + last_nonzero_field = out; \ + last_nonzero_fieldnr = nr_fields; \ + } else { \ + *out++ = 0; \ + } -#define x(_name, _bits) put_alloc_field(dst, &d, idx++, src._name); - BCH_ALLOC_FIELDS() + BCH_ALLOC_FIELDS_V2() #undef x + BUG_ON(out > end); + + out = last_nonzero_field; + a->v.nr_fields = last_nonzero_fieldnr; - bytes = (void *) d - (void *) &dst->v; - set_bkey_val_bytes(&dst->k, bytes); - memset_u64s_tail(&dst->v, 0, bytes); + bytes = (u8 *) out - (u8 *) &a->v; + set_bkey_val_bytes(&a->k, bytes); + memset_u64s_tail(&a->v, 0, bytes); +} + +struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) +{ + struct bkey_alloc_unpacked ret = { + .dev = k.k->p.inode, + .bucket = k.k->p.offset, + .gen = 0, + }; + + if (k.k->type == KEY_TYPE_alloc_v2) + bch2_alloc_unpack_v2(&ret, k); + else if (k.k->type == KEY_TYPE_alloc) + bch2_alloc_unpack_v1(&ret, k); + + return ret; +} + +void bch2_alloc_pack(struct bch_fs *c, + struct bkey_alloc_buf *dst, + const struct bkey_alloc_unpacked src) +{ + bch2_alloc_pack_v2(dst, src); } static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) { unsigned i, bytes = offsetof(struct bch_alloc, data); - for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++) + for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++) if (a->fields & (1 << i)) - bytes += BCH_ALLOC_FIELD_BYTES[i]; + bytes += BCH_ALLOC_V1_FIELD_BYTES[i]; return DIV_ROUND_UP(bytes, sizeof(u64)); } -const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) +const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); @@ -192,79 +260,77 @@ const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) return NULL; } -void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) +const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k) { - struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); - const void *d = a.v->data; - unsigned i; + struct bkey_alloc_unpacked u; + + if (k.k->p.inode >= c->sb.nr_devices || + !c->devs[k.k->p.inode]) + return "invalid device"; - pr_buf(out, "gen %u", a.v->gen); + if (bch2_alloc_unpack_v2(&u, k)) + return "unpack error"; - for (i = 0; i < BCH_ALLOC_FIELD_NR; i++) - if (a.v->fields & (1 << i)) - pr_buf(out, " %s %llu", - bch2_alloc_field_names[i], - get_alloc_field(a.v, &d, i)); + return NULL; } -int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) +void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); + + pr_buf(out, "gen %u oldest_gen %u data_type %u", + u.gen, u.oldest_gen, u.data_type); +#define x(_name, ...) pr_buf(out, #_name " %llu ", (u64) u._name); + BCH_ALLOC_FIELDS_V2() +#undef x +} + +static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_s_c k) { - struct btree_trans trans; - struct btree_and_journal_iter iter; - struct bkey_s_c k; struct bch_dev *ca; - unsigned i; - int ret = 0; + struct bucket *g; + struct bkey_alloc_unpacked u; - bch2_trans_init(&trans, c, 0, 0); + if (level || + (k.k->type != KEY_TYPE_alloc && + k.k->type != KEY_TYPE_alloc_v2)) + return 0; - bch2_btree_and_journal_iter_init(&iter, &trans, journal_keys, - BTREE_ID_ALLOC, POS_MIN); + ca = bch_dev_bkey_exists(c, k.k->p.inode); + g = bucket(ca, k.k->p.offset); + u = bch2_alloc_unpack(k); - while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { - bch2_mark_key(c, k, 0, 0, NULL, 0, - BTREE_TRIGGER_ALLOC_READ| - BTREE_TRIGGER_NOATOMIC); + g->_mark.gen = u.gen; + g->_mark.data_type = u.data_type; + g->_mark.dirty_sectors = u.dirty_sectors; + g->_mark.cached_sectors = u.cached_sectors; + g->io_time[READ] = u.read_time; + g->io_time[WRITE] = u.write_time; + g->oldest_gen = u.oldest_gen; + g->gen_valid = 1; - bch2_btree_and_journal_iter_advance(&iter); - } + return 0; +} + +int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) +{ + int ret; + + down_read(&c->gc_lock); + ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_alloc, + NULL, bch2_alloc_read_fn); + up_read(&c->gc_lock); - ret = bch2_trans_exit(&trans) ?: ret; if (ret) { bch_err(c, "error reading alloc info: %i", ret); return ret; } - percpu_down_write(&c->mark_lock); - bch2_dev_usage_from_buckets(c); - percpu_up_write(&c->mark_lock); - - mutex_lock(&c->bucket_clock[READ].lock); - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - bch2_recalc_oldest_io(c, ca, READ); - up_read(&ca->bucket_lock); - } - mutex_unlock(&c->bucket_clock[READ].lock); - - mutex_lock(&c->bucket_clock[WRITE].lock); - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - bch2_recalc_oldest_io(c, ca, WRITE); - up_read(&ca->bucket_lock); - } - mutex_unlock(&c->bucket_clock[WRITE].lock); - return 0; } -enum alloc_write_ret { - ALLOC_WROTE, - ALLOC_NOWROTE, - ALLOC_END, -}; - static int bch2_alloc_write_key(struct btree_trans *trans, struct btree_iter *iter, unsigned flags) @@ -272,14 +338,19 @@ static int bch2_alloc_write_key(struct btree_trans *trans, struct bch_fs *c = trans->c; struct bkey_s_c k; struct bch_dev *ca; - struct bucket_array *ba; struct bucket *g; struct bucket_mark m; struct bkey_alloc_unpacked old_u, new_u; - __BKEY_PADDED(k, 8) alloc_key; /* hack: */ - struct bkey_i_alloc *a; + struct bkey_alloc_buf a; int ret; retry: + bch2_trans_begin(trans); + + ret = bch2_btree_key_cache_flush(trans, + BTREE_ID_alloc, iter->pos); + if (ret) + goto err; + k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) @@ -287,32 +358,18 @@ retry: old_u = bch2_alloc_unpack(k); - if (iter->pos.inode >= c->sb.nr_devices || - !c->devs[iter->pos.inode]) - return ALLOC_END; - percpu_down_read(&c->mark_lock); ca = bch_dev_bkey_exists(c, iter->pos.inode); - ba = bucket_array(ca); - - if (iter->pos.offset >= ba->nbuckets) { - percpu_up_read(&c->mark_lock); - return ALLOC_END; - } - - g = &ba->b[iter->pos.offset]; + g = bucket(ca, iter->pos.offset); m = READ_ONCE(g->mark); - new_u = alloc_mem_to_key(g, m); + new_u = alloc_mem_to_key(iter, g, m); percpu_up_read(&c->mark_lock); if (!bkey_alloc_unpacked_cmp(old_u, new_u)) - return ALLOC_NOWROTE; - - a = bkey_alloc_init(&alloc_key.k); - a->k.p = iter->pos; - bch2_alloc_pack(a, new_u); + return 0; - bch2_trans_update(trans, iter, &a->k_i, + bch2_alloc_pack(c, &a, new_u); + bch2_trans_update(trans, iter, &a.k, BTREE_TRIGGER_NORUN); ret = bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL|flags); @@ -322,7 +379,7 @@ err: return ret; } -int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) +int bch2_alloc_write(struct bch_fs *c, unsigned flags) { struct btree_trans trans; struct btree_iter *iter; @@ -330,169 +387,76 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) unsigned i; int ret = 0; - BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); - - bch2_trans_init(&trans, c, 0, 0); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, POS_MIN, BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - for_each_rw_member(ca, c, i) { - unsigned first_bucket; - - percpu_down_read(&c->mark_lock); - first_bucket = bucket_array(ca)->first_bucket; - percpu_up_read(&c->mark_lock); + for_each_member_device(ca, c, i) { + bch2_btree_iter_set_pos(iter, + POS(ca->dev_idx, ca->mi.first_bucket)); - bch2_btree_iter_set_pos(iter, POS(i, first_bucket)); + while (iter->pos.offset < ca->mi.nbuckets) { + bch2_trans_cond_resched(&trans); - while (1) { ret = bch2_alloc_write_key(&trans, iter, flags); - if (ret < 0 || ret == ALLOC_END) - break; - if (ret == ALLOC_WROTE) - *wrote = true; + if (ret) { + percpu_ref_put(&ca->io_ref); + goto err; + } bch2_btree_iter_next_slot(iter); } - - if (ret < 0) { - percpu_ref_put(&ca->io_ref); - break; - } } - - bch2_trans_exit(&trans); - - return ret < 0 ? ret : 0; -} - -int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) -{ - struct btree_trans trans; - struct btree_iter *iter; - int ret; - - bch2_trans_init(&trans, c, 0, 0); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, k->k.p, - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - - ret = bch2_alloc_write_key(&trans, iter, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW| - BTREE_INSERT_JOURNAL_REPLAY); +err: + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); - return ret < 0 ? ret : 0; + return ret; } /* Bucket IO clocks: */ -static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw) -{ - struct bucket_clock *clock = &c->bucket_clock[rw]; - struct bucket_array *buckets = bucket_array(ca); - struct bucket *g; - u16 max_last_io = 0; - unsigned i; - - lockdep_assert_held(&c->bucket_clock[rw].lock); - - /* Recalculate max_last_io for this device: */ - for_each_bucket(g, buckets) - max_last_io = max(max_last_io, bucket_last_io(c, g, rw)); - - ca->max_last_bucket_io[rw] = max_last_io; - - /* Recalculate global max_last_io: */ - max_last_io = 0; - - for_each_member_device(ca, c, i) - max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]); - - clock->max_last_io = max_last_io; -} - -static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw) +int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + size_t bucket_nr, int rw) { - struct bucket_clock *clock = &c->bucket_clock[rw]; - struct bucket_array *buckets; - struct bch_dev *ca; + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, dev); + struct btree_iter *iter; struct bucket *g; - unsigned i; - - trace_rescale_prios(c); - - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for_each_bucket(g, buckets) - g->io_time[rw] = clock->hand - - bucket_last_io(c, g, rw) / 2; - - bch2_recalc_oldest_io(c, ca, rw); - - up_read(&ca->bucket_lock); - } -} - -static inline u64 bucket_clock_freq(u64 capacity) -{ - return max(capacity >> 10, 2028ULL); -} - -static void bch2_inc_clock_hand(struct io_timer *timer) -{ - struct bucket_clock *clock = container_of(timer, - struct bucket_clock, rescale); - struct bch_fs *c = container_of(clock, - struct bch_fs, bucket_clock[clock->rw]); - struct bch_dev *ca; - u64 capacity; - unsigned i; - - mutex_lock(&clock->lock); - - /* if clock cannot be advanced more, rescale prio */ - if (clock->max_last_io >= U16_MAX - 2) - bch2_rescale_bucket_io_times(c, clock->rw); - - BUG_ON(clock->max_last_io >= U16_MAX - 2); - - for_each_member_device(ca, c, i) - ca->max_last_bucket_io[clock->rw]++; - clock->max_last_io++; - clock->hand++; - - mutex_unlock(&clock->lock); + struct bkey_alloc_buf *a; + struct bkey_alloc_unpacked u; + u64 *time, now; + int ret = 0; - capacity = READ_ONCE(c->capacity); + iter = bch2_trans_get_iter(trans, BTREE_ID_alloc, POS(dev, bucket_nr), + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(iter); + if (ret) + goto out; - if (!capacity) - return; + a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf)); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto out; - /* - * we only increment when 0.1% of the filesystem capacity has been read - * or written too, this determines if it's time - * - * XXX: we shouldn't really be going off of the capacity of devices in - * RW mode (that will be 0 when we're RO, yet we can still service - * reads) - */ - timer->expire += bucket_clock_freq(capacity); + percpu_down_read(&c->mark_lock); + g = bucket(ca, bucket_nr); + u = alloc_mem_to_key(iter, g, READ_ONCE(g->mark)); + percpu_up_read(&c->mark_lock); - bch2_io_timer_add(&c->io_clock[clock->rw], timer); -} + time = rw == READ ? &u.read_time : &u.write_time; + now = atomic64_read(&c->io_clock[rw].now); + if (*time == now) + goto out; -static void bch2_bucket_clock_init(struct bch_fs *c, int rw) -{ - struct bucket_clock *clock = &c->bucket_clock[rw]; + *time = now; - clock->hand = 1; - clock->rw = rw; - clock->rescale.fn = bch2_inc_clock_hand; - clock->rescale.expire = bucket_clock_freq(c->capacity); - mutex_init(&clock->lock); + bch2_alloc_pack(c, a, u); + ret = bch2_trans_update(trans, iter, &a->k, 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0); +out: + bch2_trans_iter_put(trans, iter); + return ret; } /* Background allocator thread: */ @@ -503,8 +467,6 @@ static void bch2_bucket_clock_init(struct bch_fs *c, int rw) * commands to the newly free buckets, then puts them on the various freelists. */ -#define BUCKET_GC_GEN_MAX 96U - /** * wait_buckets_available - wait on reclaimable buckets * @@ -514,6 +476,8 @@ static void bch2_bucket_clock_init(struct bch_fs *c, int rw) static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) { unsigned long gc_count = c->gc_count; + s64 available; + unsigned i; int ret = 0; ca->allocator_state = ALLOCATOR_BLOCKED; @@ -529,9 +493,19 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) if (gc_count != c->gc_count) ca->inc_gen_really_needs_gc = 0; - if ((ssize_t) (dev_buckets_available(c, ca) - - ca->inc_gen_really_needs_gc) >= - (ssize_t) fifo_free(&ca->free_inc)) + available = dev_buckets_available(ca); + available -= ca->inc_gen_really_needs_gc; + + spin_lock(&c->freelist_lock); + for (i = 0; i < RESERVE_NR; i++) + available -= fifo_used(&ca->free[i]); + spin_unlock(&c->freelist_lock); + + available = max(available, 0LL); + + if (available > fifo_free(&ca->free_inc) || + (available && + !fifo_full(&ca->free[RESERVE_MOVINGGC]))) break; up_read(&c->gc_lock); @@ -547,20 +521,22 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) return ret; } -static bool bch2_can_invalidate_bucket(struct bch_dev *ca, - size_t bucket, - struct bucket_mark mark) +static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, + struct bucket_mark m) { u8 gc_gen; - if (!is_available_bucket(mark)) + if (!is_available_bucket(m)) + return false; + + if (m.owned_by_allocator) return false; if (ca->buckets_nouse && - test_bit(bucket, ca->buckets_nouse)) + test_bit(b, ca->buckets_nouse)) return false; - gc_gen = bucket_gc_gen(ca, bucket); + gc_gen = bucket_gc_gen(bucket(ca, b)); if (gc_gen >= BUCKET_GC_GEN_MAX / 2) ca->inc_gen_needs_gc++; @@ -574,43 +550,33 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, /* * Determines what order we're going to reuse buckets, smallest bucket_key() * first. - * - * - * - We take into account the read prio of the bucket, which gives us an - * indication of how hot the data is -- we scale the prio so that the prio - * farthest from the clock is worth 1/8th of the closest. - * - * - The number of sectors of cached data in the bucket, which gives us an - * indication of the cost in cache misses this eviction will cause. - * - * - If hotness * sectors used compares equal, we pick the bucket with the - * smallest bucket_gc_gen() - since incrementing the same bucket's generation - * number repeatedly forces us to run mark and sweep gc to avoid generation - * number wraparound. */ -static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct bucket_mark m) +static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m, + u64 now, u64 last_seq_ondisk) { - unsigned last_io = bucket_last_io(c, bucket(ca, b), READ); - unsigned max_last_io = ca->max_last_bucket_io[READ]; - - /* - * Time since last read, scaled to [0, 8) where larger value indicates - * more recently read data: - */ - unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io; - - /* How much we want to keep the data in this bucket: */ - unsigned long data_wantness = - (hotness + 1) * bucket_sectors_used(m); + unsigned used = bucket_sectors_used(m); - unsigned long needs_journal_commit = - bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); + if (used) { + /* + * Prefer to keep buckets that have been read more recently, and + * buckets that have more data in them: + */ + u64 last_read = max_t(s64, 0, now - g->io_time[READ]); + u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used)); - return (data_wantness << 9) | - (needs_journal_commit << 8) | - (bucket_gc_gen(ca, b) / 16); + return -last_read_scaled; + } else { + /* + * Prefer to use buckets with smaller gc_gen so that we don't + * have to walk the btree and recalculate oldest_gen - but shift + * off the low bits so that buckets will still have equal sort + * keys when there's only a small difference, so that we can + * keep sequential buckets together: + */ + return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)| + (bucket_gc_gen(g) >> 4); + } } static inline int bucket_alloc_cmp(alloc_heap *h, @@ -633,16 +599,15 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) { struct bucket_array *buckets; struct alloc_heap_entry e = { 0 }; + u64 now, last_seq_ondisk; size_t b, i, nr = 0; - ca->alloc_heap.used = 0; - - mutex_lock(&c->bucket_clock[READ].lock); down_read(&ca->bucket_lock); buckets = bucket_array(ca); - - bch2_recalc_oldest_io(c, ca, READ); + ca->alloc_heap.used = 0; + now = atomic64_read(&c->io_clock[READ].now); + last_seq_ondisk = c->journal.last_seq_ondisk; /* * Find buckets with lowest read priority, by building a maxheap sorted @@ -650,8 +615,9 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) * all buckets have been visited. */ for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { - struct bucket_mark m = READ_ONCE(buckets->b[b].mark); - unsigned long key = bucket_sort_key(c, ca, b, m); + struct bucket *g = &buckets->b[b]; + struct bucket_mark m = READ_ONCE(g->mark); + unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk); if (!bch2_can_invalidate_bucket(ca, b, m)) continue; @@ -686,7 +652,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) } up_read(&ca->bucket_lock); - mutex_unlock(&c->bucket_clock[READ].lock); } static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) @@ -766,13 +731,13 @@ static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) ca->inc_gen_needs_gc = 0; switch (ca->mi.replacement) { - case CACHE_REPLACEMENT_LRU: + case BCH_CACHE_REPLACEMENT_lru: find_reclaimable_buckets_lru(c, ca); break; - case CACHE_REPLACEMENT_FIFO: + case BCH_CACHE_REPLACEMENT_fifo: find_reclaimable_buckets_fifo(c, ca); break; - case CACHE_REPLACEMENT_RANDOM: + case BCH_CACHE_REPLACEMENT_random: find_reclaimable_buckets_random(c, ca); break; } @@ -831,21 +796,14 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, struct btree_iter *iter, u64 *journal_seq, unsigned flags) { -#if 0 - __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key; -#else - /* hack: */ - __BKEY_PADDED(k, 8) alloc_key; -#endif struct bch_fs *c = trans->c; - struct bkey_i_alloc *a; + struct bkey_alloc_buf a; struct bkey_alloc_unpacked u; struct bucket *g; struct bucket_mark m; - struct bkey_s_c k; bool invalidating_cached_data; size_t b; - int ret; + int ret = 0; BUG_ON(!ca->alloc_heap.used || !ca->alloc_heap.data[0].nr); @@ -853,51 +811,66 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, /* first, put on free_inc and mark as owned by allocator: */ percpu_down_read(&c->mark_lock); - spin_lock(&c->freelist_lock); + g = bucket(ca, b); + m = READ_ONCE(g->mark); - verify_not_on_freelist(c, ca, b); - - BUG_ON(!fifo_push(&ca->free_inc, b)); + BUG_ON(m.dirty_sectors); bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); + spin_lock(&c->freelist_lock); + verify_not_on_freelist(c, ca, b); + BUG_ON(!fifo_push(&ca->free_inc, b)); spin_unlock(&c->freelist_lock); + + /* + * If we're not invalidating cached data, we only increment the bucket + * gen in memory here, the incremented gen will be updated in the btree + * by bch2_trans_mark_pointer(): + */ + if (!m.cached_sectors && + !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) { + BUG_ON(m.data_type); + bucket_cmpxchg(g, m, m.gen++); + percpu_up_read(&c->mark_lock); + goto out; + } + percpu_up_read(&c->mark_lock); - BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); + /* + * If the read-only path is trying to shut down, we can't be generating + * new btree updates: + */ + if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) { + ret = 1; + goto out; + } bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); retry: - k = bch2_btree_iter_peek_slot(iter); - ret = bkey_err(k); + ret = bch2_btree_iter_traverse(iter); if (ret) return ret; - /* - * The allocator has to start before journal replay is finished - thus, - * we have to trust the in memory bucket @m, not the version in the - * btree: - */ percpu_down_read(&c->mark_lock); - g = bucket(ca, b); + g = bucket(ca, iter->pos.offset); m = READ_ONCE(g->mark); - u = alloc_mem_to_key(g, m); + u = alloc_mem_to_key(iter, g, m); + percpu_up_read(&c->mark_lock); - invalidating_cached_data = m.cached_sectors != 0; + invalidating_cached_data = u.cached_sectors != 0; u.gen++; u.data_type = 0; u.dirty_sectors = 0; u.cached_sectors = 0; - u.read_time = c->bucket_clock[READ].hand; - u.write_time = c->bucket_clock[WRITE].hand; - - a = bkey_alloc_init(&alloc_key.k); - a->k.p = iter->pos; - bch2_alloc_pack(a, u); + u.read_time = atomic64_read(&c->io_clock[READ].now); + u.write_time = atomic64_read(&c->io_clock[WRITE].now); - bch2_trans_update(trans, iter, &a->k_i, + bch2_alloc_pack(c, &a, u); + bch2_trans_update(trans, iter, &a.k, BTREE_TRIGGER_BUCKET_INVALIDATE); /* @@ -912,12 +885,11 @@ retry: BTREE_INSERT_NOUNLOCK| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_USE_ALLOC_RESERVE| + BTREE_INSERT_JOURNAL_RESERVED| flags); if (ret == -EINTR) goto retry; - +out: if (!ret) { /* remove from alloc_heap: */ struct alloc_heap_entry e, *top = ca->alloc_heap.data; @@ -951,32 +923,7 @@ retry: percpu_up_read(&c->mark_lock); } - return ret; -} - -static bool bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, - size_t bucket, u64 *flush_seq) -{ - struct bucket_mark m; - - percpu_down_read(&c->mark_lock); - spin_lock(&c->freelist_lock); - - bch2_invalidate_bucket(c, ca, bucket, &m); - - verify_not_on_freelist(c, ca, bucket); - BUG_ON(!fifo_push(&ca->free_inc, bucket)); - - spin_unlock(&c->freelist_lock); - - bucket_io_clock_reset(c, ca, bucket, READ); - bucket_io_clock_reset(c, ca, bucket, WRITE); - - percpu_up_read(&c->mark_lock); - - *flush_seq = max(*flush_seq, bucket_journal_seq(c, m)); - - return m.cached_sectors != 0; + return ret < 0 ? ret : 0; } /* @@ -990,10 +937,11 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) int ret = 0; bch2_trans_init(&trans, c, 0, 0); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, + iter = bch2_trans_get_iter(&trans, BTREE_ID_alloc, POS(ca->dev_idx, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL| + BTREE_ITER_INTENT); /* Only use nowait if we've already invalidated at least one bucket: */ while (!ret && @@ -1004,6 +952,7 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) (!fifo_empty(&ca->free_inc) ? BTREE_INSERT_NOWAIT : 0)); + bch2_trans_iter_put(&trans, iter); bch2_trans_exit(&trans); /* If we used NOWAIT, don't return the error: */ @@ -1033,7 +982,16 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t set_current_state(TASK_INTERRUPTIBLE); spin_lock(&c->freelist_lock); - for (i = 0; i < RESERVE_NR; i++) + for (i = 0; i < RESERVE_NR; i++) { + + /* + * Don't strand buckets on the copygc freelist until + * after recovery is finished: + */ + if (!test_bit(BCH_FS_STARTED, &c->flags) && + i == RESERVE_MOVINGGC) + continue; + if (fifo_push(&ca->free[i], bucket)) { fifo_pop(&ca->free_inc, bucket); @@ -1043,6 +1001,7 @@ static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t spin_unlock(&c->freelist_lock); goto out; } + } if (ca->allocator_state != ALLOCATOR_BLOCKED_FULL) { ca->allocator_state = ALLOCATOR_BLOCKED_FULL; @@ -1087,6 +1046,12 @@ static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) return 0; } +static inline bool allocator_thread_running(struct bch_dev *ca) +{ + return ca->mi.state == BCH_MEMBER_STATE_rw && + test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags); +} + /** * bch_allocator_thread - move buckets from free_inc to reserves * @@ -1103,10 +1068,19 @@ static int bch2_allocator_thread(void *arg) int ret; set_freezable(); - ca->allocator_state = ALLOCATOR_RUNNING; while (1) { + if (!allocator_thread_running(ca)) { + ca->allocator_state = ALLOCATOR_STOPPED; + if (kthread_wait_freezable(allocator_thread_running(ca))) + break; + } + + ca->allocator_state = ALLOCATOR_RUNNING; + cond_resched(); + if (kthread_should_stop()) + break; pr_debug("discarding %zu invalidated buckets", fifo_used(&ca->free_inc)); @@ -1192,7 +1166,7 @@ stop: void bch2_recalc_capacity(struct bch_fs *c) { struct bch_dev *ca; - u64 capacity = 0, reserved_sectors = 0, gc_reserve; + u64 capacity = 0, reserved_sectors = 0, gc_reserve, copygc_threshold = 0; unsigned bucket_size_max = 0; unsigned long ra_pages = 0; unsigned i, j; @@ -1235,7 +1209,7 @@ void bch2_recalc_capacity(struct bch_fs *c) dev_reserve *= ca->mi.bucket_size; - ca->copygc_threshold = dev_reserve; + copygc_threshold += dev_reserve; capacity += bucket_to_sector(ca, ca->mi.nbuckets - ca->mi.first_bucket); @@ -1254,22 +1228,11 @@ void bch2_recalc_capacity(struct bch_fs *c) reserved_sectors = min(reserved_sectors, capacity); + c->copygc_threshold = copygc_threshold; c->capacity = capacity - reserved_sectors; c->bucket_size_max = bucket_size_max; - if (c->capacity) { - bch2_io_timer_add(&c->io_clock[READ], - &c->bucket_clock[READ].rescale); - bch2_io_timer_add(&c->io_clock[WRITE], - &c->bucket_clock[WRITE].rescale); - } else { - bch2_io_timer_del(&c->io_clock[READ], - &c->bucket_clock[READ].rescale); - bch2_io_timer_del(&c->io_clock[WRITE], - &c->bucket_clock[WRITE].rescale); - } - /* Wake up case someone was waiting for buckets */ closure_wake_up(&c->freelist_wait); } @@ -1313,7 +1276,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) for (i = 0; i < ARRAY_SIZE(c->write_points); i++) bch2_writepoint_stop(c, ca, &c->write_points[i]); - bch2_writepoint_stop(c, ca, &ca->copygc_write_point); + bch2_writepoint_stop(c, ca, &c->copygc_write_point); bch2_writepoint_stop(c, ca, &c->rebalance_write_point); bch2_writepoint_stop(c, ca, &c->btree_write_point); @@ -1414,9 +1377,12 @@ int bch2_dev_allocator_start(struct bch_dev *ca) return 0; p = kthread_create(bch2_allocator_thread, ca, - "bch_alloc[%s]", ca->name); - if (IS_ERR(p)) + "bch-alloc/%s", ca->name); + if (IS_ERR(p)) { + bch_err(ca->fs, "error creating allocator thread: %li", + PTR_ERR(p)); return PTR_ERR(p); + } get_task_struct(p); rcu_assign_pointer(ca->alloc_thread, p); @@ -1424,226 +1390,9 @@ int bch2_dev_allocator_start(struct bch_dev *ca) return 0; } -static bool flush_held_btree_writes(struct bch_fs *c) -{ - struct bucket_table *tbl; - struct rhash_head *pos; - struct btree *b; - bool nodes_unwritten; - size_t i; -again: - cond_resched(); - nodes_unwritten = false; - - if (bch2_journal_error(&c->journal)) - return true; - - rcu_read_lock(); - for_each_cached_btree(b, c, tbl, i, pos) - if (btree_node_need_write(b)) { - if (btree_node_may_write(b)) { - rcu_read_unlock(); - btree_node_lock_type(c, b, SIX_LOCK_read); - bch2_btree_node_write(c, b, SIX_LOCK_read); - six_unlock_read(&b->lock); - goto again; - } else { - nodes_unwritten = true; - } - } - rcu_read_unlock(); - - if (c->btree_roots_dirty) { - bch2_journal_meta(&c->journal); - goto again; - } - - return !nodes_unwritten && - !bch2_btree_interior_updates_nr_pending(c); -} - -static void allocator_start_issue_discards(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned dev_iter; - size_t bu; - - for_each_rw_member(ca, c, dev_iter) - while (fifo_pop(&ca->free_inc, bu)) - blkdev_issue_discard(ca->disk_sb.bdev, - bucket_to_sector(ca, bu), - ca->mi.bucket_size, GFP_NOIO, 0); -} - -static int resize_free_inc(struct bch_dev *ca) -{ - alloc_fifo free_inc; - - if (!fifo_full(&ca->free_inc)) - return 0; - - if (!init_fifo(&free_inc, - ca->free_inc.size * 2, - GFP_KERNEL)) - return -ENOMEM; - - fifo_move(&free_inc, &ca->free_inc); - swap(free_inc, ca->free_inc); - free_fifo(&free_inc); - return 0; -} - -static bool bch2_fs_allocator_start_fast(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned dev_iter; - bool ret = true; - - if (test_alloc_startup(c)) - return false; - - down_read(&c->gc_lock); - - /* Scan for buckets that are already invalidated: */ - for_each_rw_member(ca, c, dev_iter) { - struct bucket_array *buckets; - struct bucket_mark m; - long bu; - - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for (bu = buckets->first_bucket; - bu < buckets->nbuckets; bu++) { - m = READ_ONCE(buckets->b[bu].mark); - - if (!buckets->b[bu].gen_valid || - !is_available_bucket(m) || - m.cached_sectors || - (ca->buckets_nouse && - test_bit(bu, ca->buckets_nouse))) - continue; - - percpu_down_read(&c->mark_lock); - bch2_mark_alloc_bucket(c, ca, bu, true, - gc_pos_alloc(c, NULL), 0); - percpu_up_read(&c->mark_lock); - - fifo_push(&ca->free_inc, bu); - - discard_invalidated_buckets(c, ca); - - if (fifo_full(&ca->free[RESERVE_BTREE])) - break; - } - up_read(&ca->bucket_lock); - } - - up_read(&c->gc_lock); - - /* did we find enough buckets? */ - for_each_rw_member(ca, c, dev_iter) - if (!fifo_full(&ca->free[RESERVE_BTREE])) - ret = false; - - return ret; -} - -int bch2_fs_allocator_start(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned dev_iter; - u64 journal_seq = 0; - bool wrote; - long bu; - int ret = 0; - - if (!test_alloc_startup(c) && - bch2_fs_allocator_start_fast(c)) - return 0; - - pr_debug("not enough empty buckets; scanning for reclaimable buckets"); - - /* - * We're moving buckets to freelists _before_ they've been marked as - * invalidated on disk - we have to so that we can allocate new btree - * nodes to mark them as invalidated on disk. - * - * However, we can't _write_ to any of these buckets yet - they might - * have cached data in them, which is live until they're marked as - * invalidated on disk: - */ - set_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); - - down_read(&c->gc_lock); - do { - wrote = false; - - for_each_rw_member(ca, c, dev_iter) { - find_reclaimable_buckets(c, ca); - - while (!fifo_full(&ca->free[RESERVE_BTREE]) && - (bu = next_alloc_bucket(ca)) >= 0) { - ret = resize_free_inc(ca); - if (ret) { - percpu_ref_put(&ca->io_ref); - up_read(&c->gc_lock); - goto err; - } - - bch2_invalidate_one_bucket(c, ca, bu, - &journal_seq); - - fifo_push(&ca->free[RESERVE_BTREE], bu); - } - } - - pr_debug("done scanning for reclaimable buckets"); - - /* - * XXX: it's possible for this to deadlock waiting on journal reclaim, - * since we're holding btree writes. What then? - */ - ret = bch2_alloc_write(c, - BTREE_INSERT_NOCHECK_RW| - BTREE_INSERT_USE_ALLOC_RESERVE| - BTREE_INSERT_NOWAIT, &wrote); - - /* - * If bch2_alloc_write() did anything, it may have used some - * buckets, and we need the RESERVE_BTREE freelist full - so we - * need to loop and scan again. - * And if it errored, it may have been because there weren't - * enough buckets, so just scan and loop again as long as it - * made some progress: - */ - } while (wrote); - up_read(&c->gc_lock); - - if (ret) - goto err; - - pr_debug("flushing journal"); - - ret = bch2_journal_flush(&c->journal); - if (ret) - goto err; - - pr_debug("issuing discards"); - allocator_start_issue_discards(c); -err: - clear_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags); - closure_wait_event(&c->btree_interior_update_wait, - flush_held_btree_writes(c)); - - return ret; -} - void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); - bch2_bucket_clock_init(c, READ); - bch2_bucket_clock_init(c, WRITE); c->pd_controllers_update_seconds = 5; INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update);