// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "alloc_background.h"
#include "alloc_foreground.h"
#include "btree_cache.h"
#include "btree_io.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
#include "debug.h"
#include "ec.h"
#include "error.h"
#include "recovery.h"
#include "varint.h"

#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/random.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/sched/task.h>
#include <linux/sort.h>
#include <trace/events/bcachefs.h>
const char * const bch2_allocator_states[] = {
#define x(n)	#n,
	ALLOC_THREAD_STATES()
#undef x
	NULL
};

static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8,
	BCH_ALLOC_FIELDS_V1()
#undef x
};

struct bkey_alloc_buf {
	struct bkey_i		k;
	struct bch_alloc_v3	v;

#define x(_name, _bits)		+ _bits / 8
	u8		_pad[0 + BCH_ALLOC_FIELDS_V2()];
#undef  x
} __attribute__((packed, aligned(8)));
/* Persistent alloc info: */
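/*
 * Note on the v1 format (helpers below): a field from BCH_ALLOC_FIELDS_V1() is
 * stored only if its bit is set in a->fields, as a little-endian integer of
 * BCH_ALLOC_V1_FIELD_BYTES[field] bytes; *p is advanced past each field as it
 * is read or written.
 */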
static inline u64 alloc_field_v1_get(const struct bch_alloc *a,
				     const void **p, unsigned field)
{
	unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];
	u64 v;

	if (!(a->fields & (1 << field)))
		return 0;

	v = *((const u8 *) *p);

static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p,
				      unsigned field, u64 v)
{
	unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field];

	a->v.fields |= 1 << field;

	*((__le16 *) *p) = cpu_to_le16(v);
	*((__le32 *) *p) = cpu_to_le32(v);
	*((__le64 *) *p) = cpu_to_le64(v);
static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out,
				 struct bkey_s_c k)
{
	const struct bch_alloc *in = bkey_s_c_to_alloc(k).v;
	const void *d = in->data;
	unsigned idx = 0;

#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++);
	BCH_ALLOC_FIELDS_V1()
#undef  x
}
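/*
 * v2/v3 keys drop the per-field bitmap: the first a.v->nr_fields fields from
 * BCH_ALLOC_FIELDS_V2() are stored back to back as varints
 * (bch2_varint_decode_fast()/bch2_varint_encode_fast()); fields past
 * nr_fields are implicitly zero.
 */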
static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out,
				struct bkey_s_c k)
{
	struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k);
	const u8 *in = a.v->data;
	const u8 *end = bkey_val_end(a);
	unsigned fieldnr = 0;
	int ret;
	u64 v;

	out->oldest_gen	= a.v->oldest_gen;
	out->data_type	= a.v->data_type;

#define x(_name, _bits)							\
	if (fieldnr < a.v->nr_fields) {					\
		ret = bch2_varint_decode_fast(in, end, &v);		\
		if (v != out->_name)					\

	BCH_ALLOC_FIELDS_V2()
#undef  x
	return 0;
}
static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out,
				struct bkey_s_c k)
{
	struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k);
	const u8 *in = a.v->data;
	const u8 *end = bkey_val_end(a);
	unsigned fieldnr = 0;
	int ret;
	u64 v;

	out->oldest_gen	 = a.v->oldest_gen;
	out->data_type	 = a.v->data_type;
	out->journal_seq = le64_to_cpu(a.v->journal_seq);

#define x(_name, _bits)							\
	if (fieldnr < a.v->nr_fields) {					\
		ret = bch2_varint_decode_fast(in, end, &v);		\
		if (v != out->_name)					\

	BCH_ALLOC_FIELDS_V2()
#undef  x
	return 0;
}
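/*
 * Packing is the reverse: every field is varint-encoded, but trailing zero
 * fields are trimmed by remembering the position of the last nonzero field
 * (last_nonzero_field/last_nonzero_fieldnr below) and truncating the value
 * there.
 */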
static void bch2_alloc_pack_v3(struct bkey_alloc_buf *dst,
			       const struct bkey_alloc_unpacked src)
{
	struct bkey_i_alloc_v3 *a = bkey_alloc_v3_init(&dst->k);
	unsigned nr_fields = 0, last_nonzero_fieldnr = 0;
	u8 *out = a->v.data;
	u8 *end = (void *) &dst[1];
	u8 *last_nonzero_field = out;
	unsigned bytes;

	a->k.p		 = POS(src.dev, src.bucket);
	a->v.oldest_gen	 = src.oldest_gen;
	a->v.data_type	 = src.data_type;
	a->v.journal_seq = cpu_to_le64(src.journal_seq);

#define x(_name, _bits)							\
	out += bch2_varint_encode_fast(out, src._name);			\
	last_nonzero_field = out;					\
	last_nonzero_fieldnr = nr_fields;				\

	BCH_ALLOC_FIELDS_V2()
#undef  x

	out = last_nonzero_field;
	a->v.nr_fields = last_nonzero_fieldnr;

	bytes = (u8 *) out - (u8 *) &a->v;
	set_bkey_val_bytes(&a->k, bytes);
	memset_u64s_tail(&a->v, 0, bytes);
}
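/*
 * bch2_alloc_unpack() accepts any of the three on-disk alloc key versions and
 * normalizes it to struct bkey_alloc_unpacked; the device and bucket come
 * from the key's position (inode = device index, offset = bucket number).
 */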
struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k)
{
	struct bkey_alloc_unpacked ret = {
		.dev	= k.k->p.inode,
		.bucket	= k.k->p.offset,
	};

	switch (k.k->type) {
	case KEY_TYPE_alloc:
		bch2_alloc_unpack_v1(&ret, k);
		break;
	case KEY_TYPE_alloc_v2:
		bch2_alloc_unpack_v2(&ret, k);
		break;
	case KEY_TYPE_alloc_v3:
		bch2_alloc_unpack_v3(&ret, k);
		break;
	}

	return ret;
}

static void bch2_alloc_pack(struct bch_fs *c,
			    struct bkey_alloc_buf *dst,
			    const struct bkey_alloc_unpacked src)
{
	bch2_alloc_pack_v3(dst, src);
}
int bch2_alloc_write(struct btree_trans *trans, struct btree_iter *iter,
		     struct bkey_alloc_unpacked *u, unsigned trigger_flags)
{
	struct bkey_alloc_buf *a;

	a = bch2_trans_kmalloc(trans, sizeof(struct bkey_alloc_buf));
	if (IS_ERR(a))
		return PTR_ERR(a);

	bch2_alloc_pack(trans->c, a, *u);
	return bch2_trans_update(trans, iter, &a->k, trigger_flags);
}
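/*
 * Typical usage, roughly as in bch2_bucket_io_time_reset() below (a sketch;
 * assumes the caller already holds a btree_trans and an iterator positioned
 * on the bucket's alloc key):
 *
 *	struct bkey_alloc_unpacked u = alloc_mem_to_key(c, &iter);
 *
 *	u.read_time = atomic64_read(&c->io_clock[READ].now);
 *	ret = bch2_alloc_write(trans, &iter, &u, 0) ?:
 *	      bch2_trans_commit(trans, NULL, NULL, 0);
 */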
static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a)
{
	unsigned i, bytes = offsetof(struct bch_alloc, data);

	for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++)
		if (a->fields & (1 << i))
			bytes += BCH_ALLOC_V1_FIELD_BYTES[i];

	return DIV_ROUND_UP(bytes, sizeof(u64));
}

const char *bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k);

	if (k.k->p.inode >= c->sb.nr_devices ||
	    !c->devs[k.k->p.inode])
		return "invalid device";

	/* allow for unknown fields */
	if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v))
		return "incorrect value size";

	return NULL;
}
const char *bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_alloc_unpacked u;

	if (k.k->p.inode >= c->sb.nr_devices ||
	    !c->devs[k.k->p.inode])
		return "invalid device";

	if (bch2_alloc_unpack_v2(&u, k))
		return "unpack error";

	return NULL;
}

const char *bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k)
{
	struct bkey_alloc_unpacked u;

	if (k.k->p.inode >= c->sb.nr_devices ||
	    !c->devs[k.k->p.inode])
		return "invalid device";

	if (bch2_alloc_unpack_v3(&u, k))
		return "unpack error";

	return NULL;
}
void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
			struct bkey_s_c k)
{
	struct bkey_alloc_unpacked u = bch2_alloc_unpack(k);

	pr_buf(out, "gen %u oldest_gen %u data_type %s journal_seq %llu",
	       u.gen, u.oldest_gen, bch2_data_types[u.data_type],
	       u.journal_seq);
#define x(_name, ...)	pr_buf(out, " " #_name " %llu", (u64) u._name);
	BCH_ALLOC_FIELDS_V2()
#undef  x
}
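/*
 * bch2_alloc_read_fn() seeds the in-memory bucket array from an alloc key at
 * startup: each field of the unpacked key is copied into the corresponding
 * bucket mark / io_time / stripe fields.
 */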
static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k)
{
	struct bch_fs *c = trans->c;
	struct bch_dev *ca;
	struct bucket *g;
	struct bkey_alloc_unpacked u;

	if (!bkey_is_alloc(k.k))
		return 0;

	ca = bch_dev_bkey_exists(c, k.k->p.inode);
	g = bucket(ca, k.k->p.offset);
	u = bch2_alloc_unpack(k);

	g->_mark.gen		= u.gen;
	g->_mark.data_type	= u.data_type;
	g->_mark.dirty_sectors	= u.dirty_sectors;
	g->_mark.cached_sectors	= u.cached_sectors;
	g->_mark.stripe		= u.stripe != 0;
	g->stripe		= u.stripe;
	g->stripe_redundancy	= u.stripe_redundancy;
	g->io_time[READ]	= u.read_time;
	g->io_time[WRITE]	= u.write_time;
	g->oldest_gen		= u.oldest_gen;

	return 0;
}
int bch2_alloc_read(struct bch_fs *c)
{
	struct btree_trans trans;
	int ret;

	bch2_trans_init(&trans, c, 0, 0);
	down_read(&c->gc_lock);
	ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_alloc, bch2_alloc_read_fn);
	up_read(&c->gc_lock);
	bch2_trans_exit(&trans);
	if (ret)
		bch_err(c, "error reading alloc info: %i", ret);

	return ret;
}
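/*
 * bch2_alloc_write_key() flushes one bucket's in-memory state back to the
 * alloc btree: it flushes any key cache entry for that position, compares the
 * on-disk key against alloc_mem_to_key(), and only writes when they differ.
 * bch2_alloc_write_all() pushes every bucket of every member device through
 * it.
 */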
static int bch2_alloc_write_key(struct btree_trans *trans,
				struct btree_iter *iter,
				unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct bkey_s_c k;
	struct bkey_alloc_unpacked old_u, new_u;
	int ret;

	bch2_trans_begin(trans);

	ret = bch2_btree_key_cache_flush(trans,
					 BTREE_ID_alloc, iter->pos);

	k = bch2_btree_iter_peek_slot(iter);

	old_u = bch2_alloc_unpack(k);
	new_u = alloc_mem_to_key(c, iter);

	if (!bkey_alloc_unpacked_cmp(old_u, new_u))
		return 0;

	ret = bch2_alloc_write(trans, iter, &new_u,
			       BTREE_TRIGGER_NORUN) ?:
		bch2_trans_commit(trans, NULL, NULL,
				  BTREE_INSERT_NOFAIL|flags);
	return ret;
}
int bch2_alloc_write_all(struct bch_fs *c, unsigned flags)
{
	struct btree_trans trans;
	struct btree_iter iter;
	struct bch_dev *ca;
	unsigned i;
	int ret = 0;

	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
	bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN,
			     BTREE_ITER_SLOTS|BTREE_ITER_INTENT);

	for_each_member_device(ca, c, i) {
		bch2_btree_iter_set_pos(&iter,
			POS(ca->dev_idx, ca->mi.first_bucket));

		while (iter.pos.offset < ca->mi.nbuckets) {
			ret = bch2_alloc_write_key(&trans, &iter, flags);
			if (ret) {
				percpu_ref_put(&ca->ref);
				goto err;
			}
			bch2_btree_iter_advance(&iter);
		}
	}
err:
	bch2_trans_iter_exit(&trans, &iter);
	bch2_trans_exit(&trans);
	return ret;
}
/* Bucket IO clocks: */
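/*
 * bch2_bucket_io_time_reset() stamps a bucket's read or write time with the
 * current value of the corresponding io_clock, which the LRU reclaim
 * heuristics below use to decide which buckets to keep.
 */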
int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
			      size_t bucket_nr, int rw)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter;
	struct bkey_alloc_unpacked u;
	u64 *time, now;
	int ret = 0;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, POS(dev, bucket_nr),
			     BTREE_ITER_CACHED|
			     BTREE_ITER_CACHED_NOFILL|
			     BTREE_ITER_INTENT);
	ret = bch2_btree_iter_traverse(&iter);
	if (ret)
		goto out;

	u = alloc_mem_to_key(c, &iter);

	time = rw == READ ? &u.read_time : &u.write_time;
	now = atomic64_read(&c->io_clock[rw].now);
	if (now == *time)
		goto out;

	*time = now;

	ret = bch2_alloc_write(trans, &iter, &u, 0) ?:
	      bch2_trans_commit(trans, NULL, NULL, 0);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
/* Background allocator thread: */

/*
 * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens
 * (marking them as invalidated on disk), then optionally issues discard
 * commands to the newly free buckets, then puts them on the various freelists.
 */
static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b,
				       struct bucket_mark m)
{
	u8 gc_gen;

	if (!is_available_bucket(m))
		return false;

	if (m.owned_by_allocator)
		return false;

	if (ca->buckets_nouse &&
	    test_bit(b, ca->buckets_nouse))
		return false;

	gc_gen = bucket_gc_gen(bucket(ca, b));

	ca->inc_gen_needs_gc		+= gc_gen >= BUCKET_GC_GEN_MAX / 2;
	ca->inc_gen_really_needs_gc	+= gc_gen >= BUCKET_GC_GEN_MAX;

	return gc_gen < BUCKET_GC_GEN_MAX;
}
/*
 * Determines what order we're going to reuse buckets, smallest bucket_key()
 * first.
 */

static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m,
				u64 now, u64 last_seq_ondisk)
{
	unsigned used = bucket_sectors_used(m);

	if (used) {
		/*
		 * Prefer to keep buckets that have been read more recently, and
		 * buckets that have more data in them:
		 */
		u64 last_read = max_t(s64, 0, now - g->io_time[READ]);
		u32 last_read_scaled = min_t(u64, U32_MAX, div_u64(last_read, used));
		return -last_read_scaled;
	} else {
		/*
		 * Prefer to use buckets with smaller gc_gen so that we don't
		 * have to walk the btree and recalculate oldest_gen - but shift
		 * off the low bits so that buckets will still have equal sort
		 * keys when there's only a small difference, so that we can
		 * keep sequential buckets together:
		 */
		return  (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)|
			(bucket_gc_gen(g) >> 4);
	}
}
static inline int bucket_alloc_cmp(alloc_heap *h,
				   struct alloc_heap_entry l,
				   struct alloc_heap_entry r)
{
	return  cmp_int(l.key, r.key) ?:
		cmp_int(r.nr, l.nr) ?:
		cmp_int(l.bucket, r.bucket);
}

static inline int bucket_idx_cmp(const void *_l, const void *_r)
{
	const struct alloc_heap_entry *l = _l, *r = _r;

	return cmp_int(l->bucket, r->bucket);
}
static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca)
{
	struct bucket_array *buckets;
	struct alloc_heap_entry e = { 0 };
	u64 now, last_seq_ondisk;
	size_t b, i, nr = 0;

	down_read(&ca->bucket_lock);

	buckets = bucket_array(ca);
	ca->alloc_heap.used = 0;
	now = atomic64_read(&c->io_clock[READ].now);
	last_seq_ondisk = c->journal.last_seq_ondisk;

	/*
	 * Find buckets with lowest read priority, by building a maxheap sorted
	 * by read priority and repeatedly replacing the maximum element until
	 * all buckets have been visited.
	 */
	for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) {
		struct bucket *g = &buckets->b[b];
		struct bucket_mark m = READ_ONCE(g->mark);
		unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk);

		if (!bch2_can_invalidate_bucket(ca, b, m))
			continue;

		if (e.nr && e.bucket + e.nr == b && e.key == key) {
			e.nr++;
		} else {
			if (e.nr)
				heap_add_or_replace(&ca->alloc_heap, e,
						    -bucket_alloc_cmp, NULL);

			e = (struct alloc_heap_entry) {
				.bucket	= b,
				.nr	= 1,
				.key	= key,
			};
		}
	}

	if (e.nr)
		heap_add_or_replace(&ca->alloc_heap, e,
				    -bucket_alloc_cmp, NULL);

	for (i = 0; i < ca->alloc_heap.used; i++)
		nr += ca->alloc_heap.data[i].nr;

	while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) {
		nr -= ca->alloc_heap.data[0].nr;
		heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL);
	}

	up_read(&ca->bucket_lock);
}
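/*
 * FIFO replacement: walk the device's buckets round-robin starting from
 * fifo_last_bucket, queueing every invalidatable bucket until the heap fills
 * or we wrap back around to where we started.
 */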
static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca)
{
	struct bucket_array *buckets = bucket_array(ca);
	struct bucket_mark m;
	size_t b, start;

	if (ca->fifo_last_bucket <  ca->mi.first_bucket ||
	    ca->fifo_last_bucket >= ca->mi.nbuckets)
		ca->fifo_last_bucket = ca->mi.first_bucket;

	start = ca->fifo_last_bucket;

	do {
		ca->fifo_last_bucket++;
		if (ca->fifo_last_bucket == ca->mi.nbuckets)
			ca->fifo_last_bucket = ca->mi.first_bucket;

		b = ca->fifo_last_bucket;
		m = READ_ONCE(buckets->b[b].mark);

		if (bch2_can_invalidate_bucket(ca, b, m)) {
			struct alloc_heap_entry e = { .bucket = b, .nr = 1, };

			heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
			if (heap_full(&ca->alloc_heap))
				break;
		}
	} while (ca->fifo_last_bucket != start);
}
static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca)
{
	struct bucket_array *buckets = bucket_array(ca);
	struct bucket_mark m;
	size_t checked, i;

	for (checked = 0;
	     checked < ca->mi.nbuckets / 2;
	     checked++) {
		size_t b = bch2_rand_range(ca->mi.nbuckets -
					   ca->mi.first_bucket) +
			ca->mi.first_bucket;

		m = READ_ONCE(buckets->b[b].mark);

		if (bch2_can_invalidate_bucket(ca, b, m)) {
			struct alloc_heap_entry e = { .bucket = b, .nr = 1, };

			heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);
			if (heap_full(&ca->alloc_heap))
				break;
		}
	}

	sort(ca->alloc_heap.data,
	     ca->alloc_heap.used,
	     sizeof(ca->alloc_heap.data[0]),
	     bucket_idx_cmp, NULL);

	/* remove duplicates: */
	for (i = 0; i + 1 < ca->alloc_heap.used; i++)
		if (ca->alloc_heap.data[i].bucket ==
		    ca->alloc_heap.data[i + 1].bucket)
			ca->alloc_heap.data[i].nr = 0;
}
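/*
 * find_reclaimable_buckets() dispatches on the device's configured cache
 * replacement policy (lru/fifo/random), then re-sorts the heap with
 * bucket_alloc_cmp() and returns the total number of buckets queued.
 */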
static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca)
{
	size_t i, nr = 0;

	ca->inc_gen_needs_gc		= 0;
	ca->inc_gen_really_needs_gc	= 0;

	switch (ca->mi.replacement) {
	case BCH_CACHE_REPLACEMENT_lru:
		find_reclaimable_buckets_lru(c, ca);
		break;
	case BCH_CACHE_REPLACEMENT_fifo:
		find_reclaimable_buckets_fifo(c, ca);
		break;
	case BCH_CACHE_REPLACEMENT_random:
		find_reclaimable_buckets_random(c, ca);
		break;
	}

	heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL);

	for (i = 0; i < ca->alloc_heap.used; i++)
		nr += ca->alloc_heap.data[i].nr;

	return nr;
}
/*
 * returns sequence number of most recent journal entry that updated this
 * bucket:
 */
static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m)
{
	if (m.journal_seq_valid) {
		u64 journal_seq = atomic64_read(&c->journal.seq);
		u64 bucket_seq	= journal_seq;

		bucket_seq &= ~((u64) U16_MAX);
		bucket_seq |= m.journal_seq;

		if (bucket_seq > journal_seq)
			bucket_seq -= 1 << 16;

		return bucket_seq;
	} else {
		return 0;
	}
}
static int bucket_invalidate_btree(struct btree_trans *trans,
				   struct bch_dev *ca, u64 b)
{
	struct bch_fs *c = trans->c;
	struct bkey_alloc_unpacked u;
	struct btree_iter iter;
	int ret;

	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc,
			     POS(ca->dev_idx, b),
			     BTREE_ITER_CACHED|
			     BTREE_ITER_CACHED_NOFILL|
			     BTREE_ITER_INTENT);
	ret = bch2_btree_iter_traverse(&iter);
	if (ret)
		goto err;

	u = alloc_mem_to_key(c, &iter);

	u.gen++;
	u.data_type	 = 0;
	u.dirty_sectors	 = 0;
	u.cached_sectors = 0;
	u.read_time	 = atomic64_read(&c->io_clock[READ].now);
	u.write_time	 = atomic64_read(&c->io_clock[WRITE].now);

	ret = bch2_alloc_write(trans, &iter, &u,
			       BTREE_TRIGGER_BUCKET_INVALIDATE);
err:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}
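/*
 * bch2_invalidate_one_bucket() takes the bucket at the top of alloc_heap,
 * pushes it onto free_inc and marks it owned by the allocator, then (unless
 * only the in-memory gen needs bumping) rewrites its alloc key via
 * bucket_invalidate_btree(); on failure the bucket is unmarked and popped
 * back off free_inc.
 */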
static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca,
				      u64 *journal_seq, unsigned flags)
{
	struct bucket *g;
	struct bucket_mark m;
	size_t b;
	int ret = 0;

	BUG_ON(!ca->alloc_heap.used ||
	       !ca->alloc_heap.data[0].nr);
	b = ca->alloc_heap.data[0].bucket;

	/* first, put on free_inc and mark as owned by allocator: */
	percpu_down_read(&c->mark_lock);
	g = bucket(ca, b);
	m = READ_ONCE(g->mark);

	BUG_ON(m.dirty_sectors);

	bch2_mark_alloc_bucket(c, ca, b, true);

	spin_lock(&c->freelist_lock);
	verify_not_on_freelist(c, ca, b);
	BUG_ON(!fifo_push(&ca->free_inc, b));
	spin_unlock(&c->freelist_lock);

	/*
	 * If we're not invalidating cached data, we only increment the bucket
	 * gen in memory here, the incremented gen will be updated in the btree
	 * by bch2_trans_mark_pointer():
	 */
	if (!m.cached_sectors &&
	    !bucket_needs_journal_commit(m, c->journal.last_seq_ondisk)) {
		bucket_cmpxchg(g, m, m.gen++);
		percpu_up_read(&c->mark_lock);
	}

	percpu_up_read(&c->mark_lock);

	/*
	 * If the read-only path is trying to shut down, we can't be generating
	 * new btree updates:
	 */
	if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) {

	ret = bch2_trans_do(c, NULL, journal_seq,
			    BTREE_INSERT_NOCHECK_RW|
			    BTREE_INSERT_JOURNAL_RESERVED|
			    bucket_invalidate_btree(&trans, ca, b));

	/* remove from alloc_heap: */
	struct alloc_heap_entry e, *top = ca->alloc_heap.data;

	heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL);

	/*
	 * Make sure we flush the last journal entry that updated this
	 * bucket (i.e. deleting the last reference) before writing to
	 * this bucket again:
	 */
	*journal_seq = max(*journal_seq, bucket_journal_seq(c, m));

	/* remove from free_inc: */
	percpu_down_read(&c->mark_lock);
	spin_lock(&c->freelist_lock);

	bch2_mark_alloc_bucket(c, ca, b, false);

	BUG_ON(!fifo_pop_back(&ca->free_inc, b2));

	spin_unlock(&c->freelist_lock);
	percpu_up_read(&c->mark_lock);

	return ret < 0 ? ret : 0;
}
/*
 * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc:
 */
static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
{
	u64 journal_seq = 0;
	int ret = 0;

	/* Only use nowait if we've already invalidated at least one bucket: */
	while (!ret &&
	       !fifo_full(&ca->free_inc) &&
	       ca->alloc_heap.used) {
		if (kthread_should_stop()) {
			ret = 1;
			break;
		}

		ret = bch2_invalidate_one_bucket(c, ca, &journal_seq,
				(!fifo_empty(&ca->free_inc)
				 ? BTREE_INSERT_NOWAIT : 0));
		/*
		 * We only want to batch up invalidates when they're going to
		 * require flushing the journal:
		 */
	}

	/* If we used NOWAIT, don't return the error: */
	if (!fifo_empty(&ca->free_inc))
		ret = 0;
	if (ret < 0)
		bch_err(ca, "error invalidating buckets: %i", ret);

	if (journal_seq)
		ret = bch2_journal_flush_seq(&c->journal, journal_seq);
	if (ret)
		bch_err(ca, "journal error: %i", ret);

	return ret;
}
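/*
 * The allocator thread advertises its state (running/blocked/blocked_full/
 * stopped) through ca->allocator_state; alloc_thread_set_state() wakes
 * freelist waiters whenever the state changes so they can re-check.
 */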
static void alloc_thread_set_state(struct bch_dev *ca, unsigned new_state)
{
	if (ca->allocator_state != new_state) {
		ca->allocator_state = new_state;
		closure_wake_up(&ca->fs->freelist_wait);
	}
}
static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
{
	unsigned i;
	int ret = 0;

	spin_lock(&c->freelist_lock);
	for (i = 0; i < RESERVE_NR; i++) {
		/*
		 * Don't strand buckets on the copygc freelist until
		 * after recovery is finished:
		 */
		if (i == RESERVE_MOVINGGC &&
		    !test_bit(BCH_FS_STARTED, &c->flags))
			continue;

		if (fifo_push(&ca->free[i], b)) {
			fifo_pop(&ca->free_inc, b);
			ret = 1;
			break;
		}
	}
	spin_unlock(&c->freelist_lock);

	ca->allocator_state = ret
		? ALLOCATOR_running
		: ALLOCATOR_blocked_full;
	closure_wake_up(&c->freelist_wait);
	return ret;
}
static void discard_one_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b)
{
	if (ca->mi.discard &&
	    blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev)))
		blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, b),
				     ca->mi.bucket_size, GFP_NOFS, 0);
}
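/*
 * The two predicates below are used with kthread_wait_freezable(): the thread
 * only runs while the device is rw and BCH_FS_ALLOCATOR_RUNNING is set, and
 * it blocks when no buckets are reclaimable (discounting buckets that first
 * need a gc pass when gc hasn't run since the last scan).
 */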
static bool allocator_thread_running(struct bch_dev *ca)
{
	unsigned state = ca->mi.state == BCH_MEMBER_STATE_rw &&
		test_bit(BCH_FS_ALLOCATOR_RUNNING, &ca->fs->flags)
		? ALLOCATOR_running
		: ALLOCATOR_stopped;
	alloc_thread_set_state(ca, state);
	return state == ALLOCATOR_running;
}

static int buckets_available(struct bch_dev *ca, unsigned long gc_count)
{
	s64 available = dev_buckets_reclaimable(ca) -
		(gc_count == ca->fs->gc_count ? ca->inc_gen_really_needs_gc : 0);
	bool ret = available > 0;

	alloc_thread_set_state(ca, ret
			       ? ALLOCATOR_running
			       : ALLOCATOR_blocked);
	return ret;
}
/**
 * bch2_allocator_thread - move buckets from free_inc to reserves
 *
 * The free_inc FIFO is populated by find_reclaimable_buckets(), and
 * the reserves are depleted by bucket allocation. When we run out
 * of free_inc, try to invalidate some buckets and write out
 * prios and gens.
 */
static int bch2_allocator_thread(void *arg)
{
	struct bch_dev *ca = arg;
	struct bch_fs *c = ca->fs;
	unsigned long gc_count = c->gc_count;
	size_t nr;
	int ret;

	while (1) {
		ret = kthread_wait_freezable(allocator_thread_running(ca));
		if (ret)
			goto stop;

		while (!ca->alloc_heap.used) {
			ret = kthread_wait_freezable(buckets_available(ca, gc_count));
			if (ret)
				goto stop;

			gc_count = c->gc_count;
			nr = find_reclaimable_buckets(c, ca);

			trace_alloc_scan(ca, nr, ca->inc_gen_needs_gc,
					 ca->inc_gen_really_needs_gc);

			if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) ||
			     ca->inc_gen_really_needs_gc) &&
			    c->gc_thread) {
				atomic_inc(&c->kick_gc);
				wake_up_process(c->gc_thread);
			}
		}

		ret = bch2_invalidate_buckets(c, ca);
		if (ret)
			goto stop;

		while (!fifo_empty(&ca->free_inc)) {
			u64 b = fifo_peek(&ca->free_inc);

			discard_one_bucket(c, ca, b);

			ret = kthread_wait_freezable(push_invalidated_bucket(c, ca, b));
			if (ret)
				goto stop;
		}
	}
stop:
	alloc_thread_set_state(ca, ALLOCATOR_stopped);
	return 0;
}
/* Startup/shutdown (ro/rw): */
void bch2_recalc_capacity(struct bch_fs *c)
{
	struct bch_dev *ca;
	u64 capacity = 0, reserved_sectors = 0, gc_reserve;
	unsigned bucket_size_max = 0;
	unsigned long ra_pages = 0;
	unsigned i, j;

	lockdep_assert_held(&c->state_lock);

	for_each_online_member(ca, c, i) {
		struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;

		ra_pages += bdi->ra_pages;
	}

	bch2_set_ra_pages(c, ra_pages);

	for_each_rw_member(ca, c, i) {
		u64 dev_reserve = 0;

		/*
		 * We need to reserve buckets (from the number
		 * of currently available buckets) against
		 * foreground writes so that mainly copygc can
		 * make forward progress.
		 *
		 * We need enough to refill the various reserves
		 * from scratch - copygc will use its entire
		 * reserve all at once, then run again once its
		 * reserve is refilled (from the formerly
		 * available buckets).
		 *
		 * This reserve is just used when considering if
		 * allocations for foreground writes must wait -
		 * not -ENOSPC calculations.
		 */
		for (j = 0; j < RESERVE_NONE; j++)
			dev_reserve += ca->free[j].size;

		dev_reserve += 1;	/* btree write point */
		dev_reserve += 1;	/* copygc write point */
		dev_reserve += 1;	/* rebalance write point */

		dev_reserve *= ca->mi.bucket_size;

		capacity += bucket_to_sector(ca, ca->mi.nbuckets -
					     ca->mi.first_bucket);

		reserved_sectors += dev_reserve * 2;

		bucket_size_max = max_t(unsigned, bucket_size_max,
					ca->mi.bucket_size);
	}

	gc_reserve = c->opts.gc_reserve_bytes
		? c->opts.gc_reserve_bytes >> 9
		: div64_u64(capacity * c->opts.gc_reserve_percent, 100);

	reserved_sectors = max(gc_reserve, reserved_sectors);

	reserved_sectors = min(reserved_sectors, capacity);

	c->capacity = capacity - reserved_sectors;

	c->bucket_size_max = bucket_size_max;

	/* Wake up in case someone was waiting for buckets */
	closure_wake_up(&c->freelist_wait);
}
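/*
 * Check whether any open_bucket still points at this device; used below to
 * wait for in-flight writes to drain when a device goes read-only.
 */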
static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca)
{
	struct open_bucket *ob;
	bool ret = false;

	for (ob = c->open_buckets;
	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
	     ob++) {
		spin_lock(&ob->lock);
		if (ob->valid && !ob->on_partial_list &&
		    ob->ptr.dev == ca->dev_idx)
			ret = true;
		spin_unlock(&ob->lock);
	}

	return ret;
}
/* device goes ro: */
void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned i;

	BUG_ON(ca->alloc_thread);

	/* First, remove device from allocation groups: */

	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
		clear_bit(ca->dev_idx, c->rw_devs[i].d);

	/*
	 * Capacity is calculated based off of devices in allocation groups:
	 */
	bch2_recalc_capacity(c);

	/* Next, close write points that point to this device... */
	for (i = 0; i < ARRAY_SIZE(c->write_points); i++)
		bch2_writepoint_stop(c, ca, &c->write_points[i]);

	bch2_writepoint_stop(c, ca, &c->copygc_write_point);
	bch2_writepoint_stop(c, ca, &c->rebalance_write_point);
	bch2_writepoint_stop(c, ca, &c->btree_write_point);

	mutex_lock(&c->btree_reserve_cache_lock);
	while (c->btree_reserve_cache_nr) {
		struct btree_alloc *a =
			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];

		bch2_open_buckets_put(c, &a->ob);
	}
	mutex_unlock(&c->btree_reserve_cache_lock);

	while (1) {
		struct open_bucket *ob;

		spin_lock(&c->freelist_lock);
		if (!ca->open_buckets_partial_nr) {
			spin_unlock(&c->freelist_lock);
			break;
		}
		ob = c->open_buckets +
			ca->open_buckets_partial[--ca->open_buckets_partial_nr];
		ob->on_partial_list = false;
		spin_unlock(&c->freelist_lock);

		bch2_open_bucket_put(c, ob);
	}

	bch2_ec_stop_dev(c, ca);

	/*
	 * Wake up threads that were blocked on allocation, so they can notice
	 * the device can no longer be removed and the capacity has changed:
	 */
	closure_wake_up(&c->freelist_wait);

	/*
	 * journal_res_get() can block waiting for free space in the journal -
	 * it needs to notice there may not be devices to allocate from anymore:
	 */
	wake_up(&c->journal.wait);

	/* Now wait for any in flight writes: */

	closure_wait_event(&c->open_buckets_wait,
			   !bch2_dev_has_open_write_point(c, ca));
}
/* device goes rw: */
void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
{
	unsigned i;

	for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++)
		if (ca->mi.data_allowed & (1 << i))
			set_bit(ca->dev_idx, c->rw_devs[i].d);
}

void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca)
{
	if (ca->alloc_thread)
		closure_wait_event(&c->freelist_wait,
				   ca->allocator_state != ALLOCATOR_running);
}
/* stop allocator thread: */
void bch2_dev_allocator_stop(struct bch_dev *ca)
{
	struct task_struct *p;

	p = rcu_dereference_protected(ca->alloc_thread, 1);
	ca->alloc_thread = NULL;

	/*
	 * We need an rcu barrier between setting ca->alloc_thread = NULL and
	 * the thread shutting down to avoid bch2_wake_allocator() racing:
	 *
	 * XXX: it would be better to have the rcu barrier be asynchronous
	 * instead of blocking us here
	 */
	synchronize_rcu();

	if (p) {
		kthread_stop(p);
		put_task_struct(p);
	}
}
/* start allocator thread: */
int bch2_dev_allocator_start(struct bch_dev *ca)
{
	struct task_struct *p;

	/*
	 * allocator thread already started?
	 */
	if (ca->alloc_thread)
		return 0;

	p = kthread_create(bch2_allocator_thread, ca,
			   "bch-alloc/%s", ca->name);
	if (IS_ERR(p)) {
		bch_err(ca->fs, "error creating allocator thread: %li",
			PTR_ERR(p));
		return PTR_ERR(p);
	}

	get_task_struct(p);
	rcu_assign_pointer(ca->alloc_thread, p);
	return 0;
}

void bch2_fs_allocator_background_init(struct bch_fs *c)
{
	spin_lock_init(&c->freelist_lock);
}
void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct open_bucket *ob;

	for (ob = c->open_buckets;
	     ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
	     ob++) {
		spin_lock(&ob->lock);
		if (ob->valid && !ob->on_partial_list) {
			pr_buf(out, "%zu ref %u type %s\n",
			       ob - c->open_buckets,
			       atomic_read(&ob->pin),
			       bch2_data_types[ob->type]);
		}
		spin_unlock(&ob->lock);
	}
}