/*
 * Code for manipulating bucket marks for garbage collection.
 *
 * Copyright 2014 Datera, Inc.
 *
 * Bucket states:
 * - free bucket: mark == 0
 *   The bucket contains no data and will not be read
 *
 * - allocator bucket: owned_by_allocator == 1
 *   The bucket is on a free list, or it is an open bucket
 *
 * - cached bucket: owned_by_allocator == 0 &&
 *   dirty_sectors == 0 &&
 *   cached_sectors > 0
 *   The bucket contains data but may be safely discarded as there are
 *   enough replicas of the data on other cache devices, or it has been
 *   written back to the backing device
 *
 * - dirty bucket: owned_by_allocator == 0 &&
 *   dirty_sectors > 0
 *   The bucket contains data that we must not discard (either the only copy,
 *   or one of the 'main copies' for data requiring multiple replicas)
 *
 * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1
 *   This is a btree node, journal or gen/prio bucket
 *
 * Lifecycle:
 *
 * bucket invalidated => bucket on freelist => open bucket =>
 *     [dirty bucket =>] cached bucket => bucket invalidated => ...
 *
 * Note that cache promotion can skip the dirty bucket step, as data
 * is copied from a deeper tier to a shallower tier, onto a cached
 * bucket.
 * Note also that a cached bucket can spontaneously become dirty --
 * see below.
 *
 * Only a traversal of the key space can determine whether a bucket is
 * truly dirty or cached.
 *
 * Transitions:
 *
 * - free => allocator: bucket was invalidated
 * - cached => allocator: bucket was invalidated
 *
 * - allocator => dirty: open bucket was filled up
 * - allocator => cached: open bucket was filled up
 * - allocator => metadata: metadata was allocated
 *
 * - dirty => cached: dirty sectors were copied to a deeper tier
 * - dirty => free: dirty sectors were overwritten or moved (copy gc)
 * - cached => free: cached sectors were overwritten
 *
 * - metadata => free: metadata was freed
 *
 * pathological:
 * - cached => dirty: a device was removed so formerly replicated data
 *		      is no longer sufficiently replicated
 * - free => cached: cannot happen
 * - free => dirty: cannot happen
 * - free => metadata: cannot happen
 */
#include "bcachefs.h"
#include "alloc_background.h"
#include "bset.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "buckets.h"
#include "ec.h"
#include "error.h"
#include "movinggc.h"
#include "replicas.h"

#include <linux/preempt.h>
#include <trace/events/bcachefs.h>
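/*
 * Illustrative sketch only -- not used by the rest of this file: one way the
 * bucket states documented above can be read back out of a bucket_mark.
 * bucket_state_name() is hypothetical; the helpers this file actually uses
 * are bucket_type() and is_available_bucket():
 */
static inline const char *bucket_state_name(struct bucket_mark m)
{
	if (m.owned_by_allocator)
		return "allocator";	/* on a freelist, or an open bucket */
	if (m.data_type == BCH_DATA_SB ||
	    m.data_type == BCH_DATA_JOURNAL ||
	    m.data_type == BCH_DATA_BTREE)
		return "metadata";
	if (m.dirty_sectors)
		return "dirty";		/* must not be discarded */
	if (m.cached_sectors)
		return "cached";	/* discardable copy */
	return "free";
}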
/*
 * Clear journal_seq_valid for buckets for which it's not needed, to prevent
 * wraparound:
 */
void bch2_bucket_seq_cleanup(struct bch_fs *c)
{
	u64 journal_seq = atomic64_read(&c->journal.seq);
	u16 last_seq_ondisk = c->journal.last_seq_ondisk;
	struct bch_dev *ca;
	struct bucket_array *buckets;
	struct bucket *g;
	struct bucket_mark m;
	unsigned i;

	if (journal_seq - c->last_bucket_seq_cleanup <
	    (1U << (BUCKET_JOURNAL_SEQ_BITS - 2)))
		return;

	c->last_bucket_seq_cleanup = journal_seq;

	for_each_member_device(ca, c, i) {
		down_read(&ca->bucket_lock);
		buckets = bucket_array(ca);

		for_each_bucket(g, buckets) {
			bucket_cmpxchg(g, m, ({
				if (!m.journal_seq_valid ||
				    bucket_needs_journal_commit(m, last_seq_ondisk))
					break;

				m.journal_seq_valid = 0;
			}));
		}
		up_read(&ca->bucket_lock);
	}
}
void bch2_fs_usage_initialize(struct bch_fs *c)
{
	struct bch_fs_usage *usage;
	unsigned i;

	percpu_down_write(&c->mark_lock);
	usage = (void *) bch2_acc_percpu_u64s((void *) c->usage[0],
					      fs_usage_u64s(c));

	for (i = 0; i < BCH_REPLICAS_MAX; i++)
		usage->reserved += usage->persistent_reserved[i];

	for (i = 0; i < c->replicas.nr; i++) {
		struct bch_replicas_entry *e =
			cpu_replicas_entry(&c->replicas, i);

		switch (e->data_type) {
		case BCH_DATA_BTREE:
		case BCH_DATA_USER:
			usage->data += usage->replicas[i];
			break;
		case BCH_DATA_CACHED:
			usage->cached += usage->replicas[i];
			break;
		}
	}

	percpu_up_write(&c->mark_lock);
}
void bch2_fs_usage_scratch_put(struct bch_fs *c, struct bch_fs_usage *fs_usage)
{
	if (fs_usage == c->usage_scratch)
		mutex_unlock(&c->usage_scratch_lock);
	else
		kfree(fs_usage);
}
struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *c)
{
	struct bch_fs_usage *ret;
	unsigned bytes = fs_usage_u64s(c) * sizeof(u64);

	ret = kzalloc(bytes, GFP_NOWAIT);
	if (ret)
		return ret;

	if (mutex_trylock(&c->usage_scratch_lock))
		goto out_pool;

	ret = kzalloc(bytes, GFP_NOFS);
	if (ret)
		return ret;

	mutex_lock(&c->usage_scratch_lock);
out_pool:
	ret = c->usage_scratch;
	memset(ret, 0, bytes);
	return ret;
}
struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
{
	struct bch_dev_usage ret;

	memset(&ret, 0, sizeof(ret));
	acc_u64s_percpu((u64 *) &ret,
			(u64 __percpu *) ca->usage[0],
			sizeof(ret) / sizeof(u64));

	return ret;
}
struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c)
{
	struct bch_fs_usage *ret;
	unsigned v, u64s = fs_usage_u64s(c);
retry:
	ret = kzalloc(u64s * sizeof(u64), GFP_NOFS);
	if (unlikely(!ret))
		return NULL;

	percpu_down_read_preempt_disable(&c->mark_lock);

	v = fs_usage_u64s(c);
	if (unlikely(u64s != v)) {
		u64s = v;
		percpu_up_read_preempt_enable(&c->mark_lock);
		kfree(ret);
		goto retry;
	}

	acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s);

	percpu_up_read_preempt_enable(&c->mark_lock);

	return ret;
}
#define RESERVE_FACTOR	6

static u64 reserve_factor(u64 r)
{
	return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR);
}

static u64 avail_factor(u64 r)
{
	return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
}
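/*
 * Worked example, with RESERVE_FACTOR == 6 (i.e. a ~1/64 reserve):
 * reserve_factor(1000) = 1000 + (round_up(1000, 64) >> 6) = 1000 + 16 = 1016,
 * and avail_factor() is approximately its inverse:
 * avail_factor(1016) = (1016 << 6) / 65 = 65024 / 65 = 1000.
 */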
u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage)
{
	return min(fs_usage->hidden +
		   fs_usage->data +
		   reserve_factor(fs_usage->reserved +
				  fs_usage->online_reserved),
		   c->capacity);
}
static struct bch_fs_usage_short
__bch2_fs_usage_read_short(struct bch_fs *c)
{
	struct bch_fs_usage_short ret;
	u64 data, reserved;

	ret.capacity = c->capacity -
		percpu_u64_get(&c->usage[0]->hidden);

	data		= percpu_u64_get(&c->usage[0]->data);
	reserved	= percpu_u64_get(&c->usage[0]->reserved) +
		percpu_u64_get(&c->usage[0]->online_reserved);

	ret.used	= min(ret.capacity, data + reserve_factor(reserved));
	ret.free	= ret.capacity - ret.used;

	ret.nr_inodes	= percpu_u64_get(&c->usage[0]->nr_inodes);

	return ret;
}
struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *c)
{
	struct bch_fs_usage_short ret;

	percpu_down_read_preempt_disable(&c->mark_lock);
	ret = __bch2_fs_usage_read_short(c);
	percpu_up_read_preempt_enable(&c->mark_lock);

	return ret;
}
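/*
 * Example (a sketch, not called anywhere in this file): the short-form
 * counters are what a statfs-style summary would want. example_report_usage()
 * is hypothetical:
 */
static void __maybe_unused example_report_usage(struct bch_fs *c)
{
	struct bch_fs_usage_short u = bch2_fs_usage_read_short(c);

	pr_info("capacity %llu used %llu free %llu inodes %llu",
		u.capacity, u.used, u.free, u.nr_inodes);
}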
static inline int is_unavailable_bucket(struct bucket_mark m)
{
	return !is_available_bucket(m);
}

static inline int is_fragmented_bucket(struct bucket_mark m,
				       struct bch_dev *ca)
{
	if (!m.owned_by_allocator &&
	    m.data_type == BCH_DATA_USER &&
	    bucket_sectors_used(m))
		return max_t(int, 0, (int) ca->mi.bucket_size -
			     bucket_sectors_used(m));
	return 0;
}

static inline enum bch_data_type bucket_type(struct bucket_mark m)
{
	return m.cached_sectors && !m.dirty_sectors
		? BCH_DATA_CACHED
		: m.data_type;
}

static bool bucket_became_unavailable(struct bucket_mark old,
				      struct bucket_mark new)
{
	return is_available_bucket(old) &&
	       !is_available_bucket(new);
}
int bch2_fs_usage_apply(struct bch_fs *c,
			struct bch_fs_usage *fs_usage,
			struct disk_reservation *disk_res)
{
	s64 added = fs_usage->data + fs_usage->reserved;
	s64 should_not_have_added;
	int ret = 0;

	percpu_rwsem_assert_held(&c->mark_lock);

	/*
	 * Not allowed to reduce sectors_available except by getting a
	 * reservation:
	 */
	should_not_have_added = added - (s64) (disk_res ? disk_res->sectors : 0);
	if (WARN_ONCE(should_not_have_added > 0,
		      "disk usage increased without a reservation")) {
		atomic64_sub(should_not_have_added, &c->sectors_available);
		added -= should_not_have_added;
		ret = -1;
	}

	if (added > 0) {
		disk_res->sectors		-= added;
		fs_usage->online_reserved	-= added;
	}

	acc_u64s((u64 *) this_cpu_ptr(c->usage[0]),
		 (u64 *) fs_usage,
		 fs_usage_u64s(c));

	return ret;
}
static inline void account_bucket(struct bch_fs_usage *fs_usage,
				  struct bch_dev_usage *dev_usage,
				  enum bch_data_type type,
				  int nr, s64 size)
{
	if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL)
		fs_usage->hidden	+= size;

	dev_usage->buckets[type]	+= nr;
}
static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
				  struct bch_fs_usage *fs_usage,
				  struct bucket_mark old, struct bucket_mark new,
				  bool gc)
{
	struct bch_dev_usage *dev_usage;

	percpu_rwsem_assert_held(&c->mark_lock);

	bch2_fs_inconsistent_on(old.data_type && new.data_type &&
				old.data_type != new.data_type, c,
		"different types of data in same bucket: %s, %s",
		bch2_data_types[old.data_type],
		bch2_data_types[new.data_type]);

	dev_usage = this_cpu_ptr(ca->usage[gc]);

	if (bucket_type(old))
		account_bucket(fs_usage, dev_usage, bucket_type(old),
			       -1, -ca->mi.bucket_size);

	if (bucket_type(new))
		account_bucket(fs_usage, dev_usage, bucket_type(new),
			       1, ca->mi.bucket_size);

	dev_usage->buckets_alloc +=
		(int) new.owned_by_allocator - (int) old.owned_by_allocator;
	dev_usage->buckets_ec +=
		(int) new.stripe - (int) old.stripe;
	dev_usage->buckets_unavailable +=
		is_unavailable_bucket(new) - is_unavailable_bucket(old);

	dev_usage->sectors[old.data_type] -= old.dirty_sectors;
	dev_usage->sectors[new.data_type] += new.dirty_sectors;
	dev_usage->sectors[BCH_DATA_CACHED] +=
		(int) new.cached_sectors - (int) old.cached_sectors;
	dev_usage->sectors_fragmented +=
		is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca);

	if (!is_available_bucket(old) && is_available_bucket(new))
		bch2_wake_allocator(ca);
}
void bch2_dev_usage_from_buckets(struct bch_fs *c, struct bch_dev *ca)
{
	struct bucket_mark old = { .v.counter = 0 };
	struct bch_fs_usage *fs_usage;
	struct bucket_array *buckets;
	struct bucket *g;

	percpu_down_read_preempt_disable(&c->mark_lock);
	fs_usage = this_cpu_ptr(c->usage[0]);
	buckets = bucket_array(ca);

	for_each_bucket(g, buckets)
		if (g->mark.data_type)
			bch2_dev_usage_update(c, ca, fs_usage, old, g->mark, false);
	percpu_up_read_preempt_enable(&c->mark_lock);
}
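/*
 * bucket_data_cmpxchg() wraps bucket_cmpxchg(): @expr is applied to @new
 * inside the cmpxchg loop, the resulting change is fed to
 * bch2_dev_usage_update(), and the pre-update mark is returned:
 */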
#define bucket_data_cmpxchg(c, ca, fs_usage, g, new, expr)	\
({								\
	struct bucket_mark _old = bucket_cmpxchg(g, new, expr);\
								\
	bch2_dev_usage_update(c, ca, fs_usage, _old, new, gc);	\
	_old;							\
})
static inline void update_replicas(struct bch_fs *c,
				   struct bch_fs_usage *fs_usage,
				   struct bch_replicas_entry *r,
				   s64 sectors)
{
	int idx = bch2_replicas_entry_idx(c, r);

	BUG_ON(idx < 0);

	if (r->data_type == BCH_DATA_CACHED)
		fs_usage->cached	+= sectors;
	else
		fs_usage->data		+= sectors;
	fs_usage->replicas[idx]		+= sectors;
}
static inline void update_cached_sectors(struct bch_fs *c,
					 struct bch_fs_usage *fs_usage,
					 unsigned dev, s64 sectors)
{
	struct bch_replicas_padded r;

	bch2_replicas_entry_cached(&r.e, dev);

	update_replicas(c, fs_usage, &r.e, sectors);
}
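/*
 * Marks are kept in two copies: the live counters in c->usage[0]/ca->usage[0]
 * and gc's shadow copy in [1]. do_mark_fn() runs @fn against whichever copy
 * @flags selects, and additionally against the gc copy when gc is in progress
 * and has already passed @pos, so gc's counts stay in sync with updates that
 * race with it:
 */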
#define do_mark_fn(fn, c, pos, flags, ...)				\
({									\
	int gc, ret = 0;						\
									\
	percpu_rwsem_assert_held(&c->mark_lock);			\
									\
	for (gc = 0; gc < 2 && !ret; gc++)				\
		if (!gc == !(flags & BCH_BUCKET_MARK_GC) ||		\
		    (gc && gc_visited(c, pos)))				\
			ret = fn(c, __VA_ARGS__, gc);			\
									\
	ret;								\
})
static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
				    size_t b, struct bucket_mark *ret,
				    bool gc)
{
	struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
	struct bucket *g = __bucket(ca, b, gc);
	struct bucket_mark old, new;

	old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
		BUG_ON(!is_available_bucket(new));

		new.owned_by_allocator	= true;
		new.data_type		= 0;
		new.cached_sectors	= 0;
		new.dirty_sectors	= 0;
		new.gen++;
	}));

	if (old.cached_sectors)
		update_cached_sectors(c, fs_usage, ca->dev_idx,
				      -((s64) old.cached_sectors));

	*ret = old;
	return 0;
}
void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
			    size_t b, struct bucket_mark *old)
{
	do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0,
		   ca, b, old);

	if (!old->owned_by_allocator && old->cached_sectors)
		trace_invalidate(ca, bucket_to_sector(ca, b),
				 old->cached_sectors);
}
static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
				    size_t b, bool owned_by_allocator,
				    bool gc)
{
	struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage[gc]);
	struct bucket *g = __bucket(ca, b, gc);
	struct bucket_mark old, new;

	old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
		new.owned_by_allocator	= owned_by_allocator;
	}));

	BUG_ON(!gc &&
	       !owned_by_allocator && !old.owned_by_allocator);

	return 0;
}
void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca,
			    size_t b, bool owned_by_allocator,
			    struct gc_pos pos, unsigned flags)
{
	do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags,
		   ca, b, owned_by_allocator);
}
static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k,
			   bool inserting,
			   struct bch_fs_usage *fs_usage,
			   unsigned journal_seq, unsigned flags,
			   bool gc)
{
	struct bkey_alloc_unpacked u;
	struct bch_dev *ca;
	struct bucket *g;
	struct bucket_mark old, m;

	if (!inserting)
		return 0;

	/*
	 * alloc btree is read in by bch2_alloc_read, not gc:
	 */
	if (flags & BCH_BUCKET_MARK_GC)
		return 0;

	u = bch2_alloc_unpack(bkey_s_c_to_alloc(k).v);
	ca = bch_dev_bkey_exists(c, k.k->p.inode);
	g = __bucket(ca, k.k->p.offset, gc);

	/*
	 * this should currently only be getting called from the bucket
	 * invalidate path:
	 */
	BUG_ON(u.dirty_sectors);
	BUG_ON(u.cached_sectors);
	BUG_ON(!g->mark.owned_by_allocator);

	old = bucket_data_cmpxchg(c, ca, fs_usage, g, m, ({
		m.gen			= u.gen;
		m.data_type		= u.data_type;
		m.dirty_sectors		= u.dirty_sectors;
		m.cached_sectors	= u.cached_sectors;
	}));

	g->io_time[READ]	= u.read_time;
	g->io_time[WRITE]	= u.write_time;
	g->oldest_gen		= u.oldest_gen;

	if (old.cached_sectors) {
		update_cached_sectors(c, fs_usage, ca->dev_idx,
				      -old.cached_sectors);
		trace_invalidate(ca, bucket_to_sector(ca, k.k->p.offset),
				 old.cached_sectors);
	}

	return 0;
}
#define checked_add(a, b)					\
({								\
	unsigned _res = (unsigned) (a) + (b);			\
	bool overflow = _res > U16_MAX;				\
	if (overflow)						\
		_res = U16_MAX;					\
	(a) = _res;						\
	overflow;						\
})
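/*
 * checked_add() saturates rather than wrapping: e.g. with a == 65000 and
 * b == 1000, _res == 66000 > U16_MAX, so @a is clamped to 65535 and the
 * macro evaluates to true so the caller can flag the inconsistency.
 */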
static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
				       size_t b, enum bch_data_type type,
				       unsigned sectors, bool gc)
{
	struct bucket *g = __bucket(ca, b, gc);
	struct bucket_mark old, new;
	bool overflow;

	BUG_ON(type != BCH_DATA_SB &&
	       type != BCH_DATA_JOURNAL);

	old = bucket_cmpxchg(g, new, ({
		new.data_type	= type;
		overflow = checked_add(new.dirty_sectors, sectors);
	}));

	bch2_fs_inconsistent_on(overflow, c,
		"bucket sector count overflow: %u + %u > U16_MAX",
		old.dirty_sectors, sectors);

	if (c)
		bch2_dev_usage_update(c, ca, this_cpu_ptr(c->usage[gc]),
				      old, new, gc);

	return 0;
}
void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
			       size_t b, enum bch_data_type type,
			       unsigned sectors, struct gc_pos pos,
			       unsigned flags)
{
	BUG_ON(type != BCH_DATA_SB &&
	       type != BCH_DATA_JOURNAL);

	if (likely(c)) {
		do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags,
			   ca, b, type, sectors);
	} else {
		__bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0);
	}
}
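/*
 * Change in on-disk sectors for one pointer when an extent grows or shrinks
 * by @delta: for growth we size the (not yet created) key directly from
 * @delta; for shrinking we take the difference against the pointer's current
 * live size:
 */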
static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p,
				  s64 delta)
{
	if (delta > 0) {
		/*
		 * marking a new extent, which _will have size_ @delta
		 *
		 * in the bch2_mark_update -> BCH_EXTENT_OVERLAP_MIDDLE
		 * case, we haven't actually created the key we'll be inserting
		 * yet (for the split) - so we don't want to be using
		 * k->size/crc.live_size here:
		 */
		return __ptr_disk_sectors(p, delta);
	} else {
		BUG_ON(-delta > p.crc.live_size);

		return (s64) __ptr_disk_sectors(p, p.crc.live_size + delta) -
			(s64) ptr_disk_sectors(p);
	}
}
/*
 * Checking against gc's position has to be done here, inside the cmpxchg()
 * loop, to avoid racing with the start of gc clearing all the marks - GC does
 * that with the gc pos seqlock held.
 */
static bool bch2_mark_pointer(struct bch_fs *c,
			      struct extent_ptr_decoded p,
			      s64 sectors, enum bch_data_type data_type,
			      struct bch_fs_usage *fs_usage,
			      unsigned journal_seq, unsigned flags,
			      bool gc)
{
	struct bucket_mark old, new;
	struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
	size_t b = PTR_BUCKET_NR(ca, &p.ptr);
	struct bucket *g = __bucket(ca, b, gc);
	bool overflow;
	u64 v;

	v = atomic64_read(&g->_mark.v);
	do {
		new.v.counter = old.v.counter = v;

		/*
		 * Check this after reading bucket mark to guard against
		 * the allocator invalidating a bucket after we've already
		 * checked the gen
		 */
		if (gen_after(new.gen, p.ptr.gen)) {
			BUG_ON(!test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags));
			EBUG_ON(!p.ptr.cached &&
				test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags));
			return true;
		}

		if (!p.ptr.cached)
			overflow = checked_add(new.dirty_sectors, sectors);
		else
			overflow = checked_add(new.cached_sectors, sectors);

		if (!new.dirty_sectors &&
		    !new.cached_sectors) {
			new.data_type	= 0;

			if (journal_seq) {
				new.journal_seq_valid = 1;
				new.journal_seq = journal_seq;
			}
		} else {
			new.data_type = data_type;
		}

		if (flags & BCH_BUCKET_MARK_NOATOMIC) {
			g->_mark = new;
			break;
		}
	} while ((v = atomic64_cmpxchg(&g->_mark.v,
			      old.v.counter,
			      new.v.counter)) != old.v.counter);

	bch2_fs_inconsistent_on(overflow, c,
		"bucket sector count overflow: %u + %lli > U16_MAX",
		!p.ptr.cached
		? old.dirty_sectors
		: old.cached_sectors, sectors);

	bch2_dev_usage_update(c, ca, fs_usage, old, new, gc);

	BUG_ON(!gc && bucket_became_unavailable(old, new));

	return false;
}
static int bch2_mark_stripe_ptr(struct bch_fs *c,
				struct bch_extent_stripe_ptr p,
				enum bch_data_type data_type,
				struct bch_fs_usage *fs_usage,
				s64 sectors, unsigned flags,
				bool gc)
{
	struct stripe *m;
	unsigned old, new, nr_data;
	int blocks_nonempty_delta;
	s64 parity_sectors;

	BUG_ON(!sectors);

	m = genradix_ptr(&c->stripes[gc], p.idx);

	spin_lock(&c->ec_stripes_heap_lock);

	if (!m || !m->alive) {
		spin_unlock(&c->ec_stripes_heap_lock);
		bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
				    (u64) p.idx);
		return -EIO;
	}

	BUG_ON(m->r.e.data_type != data_type);

	nr_data = m->nr_blocks - m->nr_redundant;

	parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data);

	if (sectors < 0)
		parity_sectors = -parity_sectors;
	sectors += parity_sectors;

	old = m->block_sectors[p.block];
	m->block_sectors[p.block] += sectors;
	new = m->block_sectors[p.block];

	blocks_nonempty_delta = (int) !!new - (int) !!old;
	if (blocks_nonempty_delta) {
		m->blocks_nonempty += blocks_nonempty_delta;

		if (!gc)
			bch2_stripes_heap_update(c, m, p.idx);
	}

	spin_unlock(&c->ec_stripes_heap_lock);

	update_replicas(c, fs_usage, &m->r.e, sectors);

	return 0;
}
static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
			    s64 sectors, enum bch_data_type data_type,
			    struct bch_fs_usage *fs_usage,
			    unsigned journal_seq, unsigned flags,
			    bool gc)
{
	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
	const union bch_extent_entry *entry;
	struct extent_ptr_decoded p;
	struct bch_replicas_padded r;
	s64 dirty_sectors = 0;
	unsigned i;
	int ret;

	r.e.data_type	= data_type;
	r.e.nr_devs	= 0;
	r.e.nr_required	= 1;

	BUG_ON(!sectors);

	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
		s64 disk_sectors = data_type == BCH_DATA_BTREE
			? sectors
			: ptr_disk_sectors_delta(p, sectors);
		bool stale = bch2_mark_pointer(c, p, disk_sectors, data_type,
					       fs_usage, journal_seq, flags, gc);

		if (p.ptr.cached) {
			if (disk_sectors && !stale)
				update_cached_sectors(c, fs_usage, p.ptr.dev,
						      disk_sectors);
		} else if (!p.ec_nr) {
			dirty_sectors	       += disk_sectors;
			r.e.devs[r.e.nr_devs++]	= p.ptr.dev;
		} else {
			for (i = 0; i < p.ec_nr; i++) {
				ret = bch2_mark_stripe_ptr(c, p.ec[i],
						data_type, fs_usage,
						disk_sectors, flags, gc);
				if (ret)
					return ret;
			}

			r.e.nr_required = 0;
		}
	}

	if (dirty_sectors)
		update_replicas(c, fs_usage, &r.e, dirty_sectors);

	return 0;
}
static void bucket_set_stripe(struct bch_fs *c,
			      const struct bch_stripe *v,
			      bool enabled,
			      struct bch_fs_usage *fs_usage,
			      u64 journal_seq,
			      bool gc)
{
	unsigned i;

	for (i = 0; i < v->nr_blocks; i++) {
		const struct bch_extent_ptr *ptr = v->ptrs + i;
		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
		size_t b = PTR_BUCKET_NR(ca, ptr);
		struct bucket *g = __bucket(ca, b, gc);
		struct bucket_mark new, old;

		BUG_ON(ptr_stale(ca, ptr));

		old = bucket_data_cmpxchg(c, ca, fs_usage, g, new, ({
			new.stripe			= enabled;
			if (journal_seq) {
				new.journal_seq_valid	= 1;
				new.journal_seq		= journal_seq;
			}
		}));
	}
}
static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
			    bool inserting,
			    struct bch_fs_usage *fs_usage,
			    u64 journal_seq, unsigned flags,
			    bool gc)
{
	struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
	size_t idx = s.k->p.offset;
	struct stripe *m = genradix_ptr(&c->stripes[gc], idx);
	unsigned i;

	spin_lock(&c->ec_stripes_heap_lock);

	if (!m || (!inserting && !m->alive)) {
		spin_unlock(&c->ec_stripes_heap_lock);
		bch_err_ratelimited(c, "error marking nonexistent stripe %zu",
				    idx);
		return -1;
	}

	if (m->alive)
		bch2_stripes_heap_del(c, m, idx);

	memset(m, 0, sizeof(*m));

	if (inserting) {
		m->sectors	= le16_to_cpu(s.v->sectors);
		m->algorithm	= s.v->algorithm;
		m->nr_blocks	= s.v->nr_blocks;
		m->nr_redundant	= s.v->nr_redundant;

		memset(&m->r, 0, sizeof(m->r));

		m->r.e.data_type	= BCH_DATA_USER;
		m->r.e.nr_devs		= s.v->nr_blocks;
		m->r.e.nr_required	= s.v->nr_blocks - s.v->nr_redundant;

		for (i = 0; i < s.v->nr_blocks; i++)
			m->r.e.devs[i] = s.v->ptrs[i].dev;

	/*
	 * XXX: account for stripes somehow here
	 */
#if 0
	update_replicas(c, fs_usage, &m->r.e, stripe_sectors);
#endif

		/* gc recalculates these fields: */
		if (!(flags & BCH_BUCKET_MARK_GC)) {
			for (i = 0; i < s.v->nr_blocks; i++) {
				m->block_sectors[i] =
					stripe_blockcount_get(s.v, i);
				m->blocks_nonempty += !!m->block_sectors[i];
			}
		}

		if (!gc)
			bch2_stripes_heap_insert(c, m, idx);
		else
			m->alive = true;
	}

	spin_unlock(&c->ec_stripes_heap_lock);

	bucket_set_stripe(c, s.v, inserting, fs_usage, 0, gc);
	return 0;
}
static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
			   bool inserting, s64 sectors,
			   struct bch_fs_usage *fs_usage,
			   unsigned journal_seq, unsigned flags,
			   bool gc)
{
	int ret = 0;

	if (!fs_usage || gc)
		fs_usage = this_cpu_ptr(c->usage[gc]);

	switch (k.k->type) {
	case KEY_TYPE_alloc:
		ret = bch2_mark_alloc(c, k, inserting,
				      fs_usage, journal_seq, flags, gc);
		break;
	case KEY_TYPE_btree_ptr:
		ret = bch2_mark_extent(c, k, inserting
				       ?  c->opts.btree_node_size
				       : -c->opts.btree_node_size,
				       BCH_DATA_BTREE,
				       fs_usage, journal_seq, flags, gc);
		break;
	case KEY_TYPE_extent:
		ret = bch2_mark_extent(c, k, sectors, BCH_DATA_USER,
				       fs_usage, journal_seq, flags, gc);
		break;
	case KEY_TYPE_stripe:
		ret = bch2_mark_stripe(c, k, inserting,
				       fs_usage, journal_seq, flags, gc);
		break;
	case KEY_TYPE_inode:
		if (inserting)
			fs_usage->nr_inodes++;
		else
			fs_usage->nr_inodes--;
		break;
	case KEY_TYPE_reservation: {
		unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;

		sectors *= replicas;
		replicas = clamp_t(unsigned, replicas, 1,
				   ARRAY_SIZE(fs_usage->persistent_reserved));

		fs_usage->reserved				+= sectors;
		fs_usage->persistent_reserved[replicas - 1]	+= sectors;
		break;
	}
	default:
		break;
	}

	return ret;
}
int bch2_mark_key_locked(struct bch_fs *c,
			 struct bkey_s_c k,
			 bool inserting, s64 sectors,
			 struct gc_pos pos,
			 struct bch_fs_usage *fs_usage,
			 u64 journal_seq, unsigned flags)
{
	return do_mark_fn(__bch2_mark_key, c, pos, flags,
			  k, inserting, sectors, fs_usage,
			  journal_seq, flags);
}
int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
		  bool inserting, s64 sectors,
		  struct gc_pos pos,
		  struct bch_fs_usage *fs_usage,
		  u64 journal_seq, unsigned flags)
{
	int ret;

	percpu_down_read_preempt_disable(&c->mark_lock);
	ret = bch2_mark_key_locked(c, k, inserting, sectors,
				   pos, fs_usage, journal_seq, flags);
	percpu_up_read_preempt_enable(&c->mark_lock);

	return ret;
}
void bch2_mark_update(struct btree_trans *trans,
		      struct btree_insert_entry *insert,
		      struct bch_fs_usage *fs_usage)
{
	struct bch_fs		*c = trans->c;
	struct btree_iter	*iter = insert->iter;
	struct btree		*b = iter->l[0].b;
	struct btree_node_iter	node_iter = iter->l[0].iter;
	struct gc_pos		pos = gc_pos_btree_node(b);
	struct bkey_packed	*_k;

	if (!btree_node_type_needs_gc(iter->btree_id))
		return;

	if (!(trans->flags & BTREE_INSERT_NOMARK))
		bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
			bpos_min(insert->k->k.p, b->key.k.p).offset -
			bkey_start_offset(&insert->k->k),
			pos, fs_usage, trans->journal_res.seq, 0);

	while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
						      KEY_TYPE_discard))) {
		struct bkey		unpacked;
		struct bkey_s_c		k;
		s64			sectors = 0;

		k = bkey_disassemble(b, _k, &unpacked);

		if (btree_node_is_extents(b)
		    ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
		    : bkey_cmp(insert->k->k.p, k.k->p))
			break;

		if (btree_node_is_extents(b)) {
			switch (bch2_extent_overlap(&insert->k->k, k.k)) {
			case BCH_EXTENT_OVERLAP_ALL:
				sectors = -((s64) k.k->size);
				break;
			case BCH_EXTENT_OVERLAP_BACK:
				sectors = bkey_start_offset(&insert->k->k) -
					k.k->p.offset;
				break;
			case BCH_EXTENT_OVERLAP_FRONT:
				sectors = bkey_start_offset(k.k) -
					insert->k->k.p.offset;
				break;
			case BCH_EXTENT_OVERLAP_MIDDLE:
				sectors = k.k->p.offset - insert->k->k.p.offset;
				BUG_ON(sectors <= 0);

				bch2_mark_key_locked(c, k, true, sectors,
					pos, fs_usage, trans->journal_res.seq, 0);

				sectors = bkey_start_offset(&insert->k->k) -
					k.k->p.offset;
				break;
			}

			BUG_ON(sectors >= 0);
		}

		bch2_mark_key_locked(c, k, false, sectors,
			pos, fs_usage, trans->journal_res.seq, 0);

		bch2_btree_node_iter_advance(&node_iter, b);
	}
}
void bch2_trans_fs_usage_apply(struct btree_trans *trans,
			       struct bch_fs_usage *fs_usage)
{
	struct bch_fs *c = trans->c;
	struct btree_insert_entry *i;
	static int warned_disk_usage = 0;
	u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
	char buf[200];

	if (!bch2_fs_usage_apply(c, fs_usage, trans->disk_res) ||
	    warned_disk_usage ||
	    xchg(&warned_disk_usage, 1))
		return;

	pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors);

	trans_for_each_update_iter(trans, i) {
		struct btree_iter	*iter = i->iter;
		struct btree		*b = iter->l[0].b;
		struct btree_node_iter	node_iter = iter->l[0].iter;
		struct bkey_packed	*_k;

		pr_err("while inserting");
		bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k));
		pr_err("%s", buf);
		pr_err("overlapping with");

		node_iter = iter->l[0].iter;
		while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
							      KEY_TYPE_discard))) {
			struct bkey		unpacked;
			struct bkey_s_c		k;

			k = bkey_disassemble(b, _k, &unpacked);

			if (btree_node_is_extents(b)
			    ? bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0
			    : bkey_cmp(i->k->k.p, k.k->p))
				break;

			bch2_bkey_val_to_text(&PBUF(buf), c, k);
			pr_err("%s", buf);

			bch2_btree_node_iter_advance(&node_iter, b);
		}
	}
}
/* Disk reservations: */

static u64 bch2_recalc_sectors_available(struct bch_fs *c)
{
	percpu_u64_set(&c->pcpu->sectors_available, 0);

	return avail_factor(__bch2_fs_usage_read_short(c).free);
}

void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res)
{
	percpu_down_read_preempt_disable(&c->mark_lock);
	this_cpu_sub(c->usage[0]->online_reserved,
		     res->sectors);
	percpu_up_read_preempt_enable(&c->mark_lock);

	res->sectors = 0;
}
#define SECTORS_CACHE	1024

int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
			      unsigned sectors, int flags)
{
	struct bch_fs_pcpu *pcpu;
	u64 old, v, get;
	s64 sectors_available;
	int ret;

	percpu_down_read_preempt_disable(&c->mark_lock);
	pcpu = this_cpu_ptr(c->pcpu);

	if (sectors <= pcpu->sectors_available)
		goto out;

	v = atomic64_read(&c->sectors_available);
	do {
		old = v;
		get = min((u64) sectors + SECTORS_CACHE, old);

		if (get < sectors) {
			percpu_up_read_preempt_enable(&c->mark_lock);
			goto recalculate;
		}
	} while ((v = atomic64_cmpxchg(&c->sectors_available,
				       old, old - get)) != old);

	pcpu->sectors_available		+= get;

out:
	pcpu->sectors_available		-= sectors;
	this_cpu_add(c->usage[0]->online_reserved, sectors);
	res->sectors			+= sectors;

	percpu_up_read_preempt_enable(&c->mark_lock);
	return 0;

recalculate:
	percpu_down_write(&c->mark_lock);

	sectors_available = bch2_recalc_sectors_available(c);

	if (sectors <= sectors_available ||
	    (flags & BCH_DISK_RESERVATION_NOFAIL)) {
		atomic64_set(&c->sectors_available,
			     max_t(s64, 0, sectors_available - sectors));
		this_cpu_add(c->usage[0]->online_reserved, sectors);
		res->sectors			+= sectors;
		ret = 0;
	} else {
		atomic64_set(&c->sectors_available, sectors_available);
		ret = -ENOSPC;
	}

	percpu_up_write(&c->mark_lock);

	return ret;
}
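/*
 * Example (a sketch, not used in-tree): reserving space ahead of a write and
 * releasing it afterwards. example_reserve_sectors() is hypothetical; passing
 * BCH_DISK_RESERVATION_NOFAIL instead of 0 would force the reservation
 * through even when short on space:
 */
static int __maybe_unused example_reserve_sectors(struct bch_fs *c,
						  unsigned sectors)
{
	struct disk_reservation res = { 0 };
	int ret;

	ret = bch2_disk_reservation_add(c, &res, sectors, 0);
	if (ret)
		return ret;	/* -ENOSPC */

	/* ... perform the write covered by @res, then: */

	__bch2_disk_reservation_put(c, &res);
	return 0;
}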
/* Startup/shutdown: */

static void buckets_free_rcu(struct rcu_head *rcu)
{
	struct bucket_array *buckets =
		container_of(rcu, struct bucket_array, rcu);

	kvpfree(buckets,
		sizeof(struct bucket_array) +
		buckets->nbuckets * sizeof(struct bucket));
}
int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
{
	struct bucket_array *buckets = NULL, *old_buckets = NULL;
	unsigned long *buckets_nouse = NULL;
	unsigned long *buckets_written = NULL;
	alloc_fifo	free[RESERVE_NR];
	alloc_fifo	free_inc;
	alloc_heap	alloc_heap;
	copygc_heap	copygc_heap;
	size_t btree_reserve	= DIV_ROUND_UP(BTREE_NODE_RESERVE,
			     ca->mi.bucket_size / c->opts.btree_node_size);
	/* XXX: these should be tunable */
	size_t reserve_none	= max_t(size_t, 1, nbuckets >> 9);
	size_t copygc_reserve	= max_t(size_t, 2, nbuckets >> 7);
	size_t free_inc_nr	= max(max_t(size_t, 1, nbuckets >> 12),
				      btree_reserve);
	bool resize = ca->buckets[0] != NULL,
	     start_copygc = ca->copygc_thread != NULL;
	int ret = -ENOMEM;
	unsigned i;

	memset(&free,		0, sizeof(free));
	memset(&free_inc,	0, sizeof(free_inc));
	memset(&alloc_heap,	0, sizeof(alloc_heap));
	memset(&copygc_heap,	0, sizeof(copygc_heap));

	if (!(buckets		= kvpmalloc(sizeof(struct bucket_array) +
					    nbuckets * sizeof(struct bucket),
					    GFP_KERNEL|__GFP_ZERO)) ||
	    !(buckets_nouse	= kvpmalloc(BITS_TO_LONGS(nbuckets) *
					    sizeof(unsigned long),
					    GFP_KERNEL|__GFP_ZERO)) ||
	    !(buckets_written	= kvpmalloc(BITS_TO_LONGS(nbuckets) *
					    sizeof(unsigned long),
					    GFP_KERNEL|__GFP_ZERO)) ||
	    !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) ||
	    !init_fifo(&free[RESERVE_MOVINGGC],
		       copygc_reserve, GFP_KERNEL) ||
	    !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) ||
	    !init_fifo(&free_inc,	free_inc_nr, GFP_KERNEL) ||
	    !init_heap(&alloc_heap,	ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL) ||
	    !init_heap(&copygc_heap,	copygc_reserve, GFP_KERNEL))
		goto err;

	buckets->first_bucket	= ca->mi.first_bucket;
	buckets->nbuckets	= nbuckets;

	bch2_copygc_stop(ca);

	if (resize) {
		down_write(&c->gc_lock);
		down_write(&ca->bucket_lock);
		percpu_down_write(&c->mark_lock);
	}

	old_buckets = bucket_array(ca);

	if (resize) {
		size_t n = min(buckets->nbuckets, old_buckets->nbuckets);

		memcpy(buckets->b,
		       old_buckets->b,
		       n * sizeof(struct bucket));
		memcpy(buckets_nouse,
		       ca->buckets_nouse,
		       BITS_TO_LONGS(n) * sizeof(unsigned long));
		memcpy(buckets_written,
		       ca->buckets_written,
		       BITS_TO_LONGS(n) * sizeof(unsigned long));
	}

	rcu_assign_pointer(ca->buckets[0], buckets);
	buckets = old_buckets;

	swap(ca->buckets_nouse, buckets_nouse);
	swap(ca->buckets_written, buckets_written);

	if (resize)
		percpu_up_write(&c->mark_lock);

	spin_lock(&c->freelist_lock);
	for (i = 0; i < RESERVE_NR; i++) {
		fifo_move(&free[i], &ca->free[i]);
		swap(ca->free[i], free[i]);
	}
	fifo_move(&free_inc, &ca->free_inc);
	swap(ca->free_inc, free_inc);
	spin_unlock(&c->freelist_lock);

	/* with gc lock held, alloc_heap can't be in use: */
	swap(ca->alloc_heap, alloc_heap);

	/* and we shut down copygc: */
	swap(ca->copygc_heap, copygc_heap);

	nbuckets = ca->mi.nbuckets;

	if (resize) {
		up_write(&ca->bucket_lock);
		up_write(&c->gc_lock);
	}

	if (start_copygc &&
	    bch2_copygc_start(c, ca))
		bch_err(ca, "error restarting copygc thread");

	ret = 0;
err:
	free_heap(&copygc_heap);
	free_heap(&alloc_heap);
	free_fifo(&free_inc);
	for (i = 0; i < RESERVE_NR; i++)
		free_fifo(&free[i]);
	kvpfree(buckets_nouse,
		BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
	kvpfree(buckets_written,
		BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
	if (buckets)
		call_rcu(&buckets->rcu, buckets_free_rcu);

	return ret;
}
void bch2_dev_buckets_free(struct bch_dev *ca)
{
	unsigned i;

	free_heap(&ca->copygc_heap);
	free_heap(&ca->alloc_heap);
	free_fifo(&ca->free_inc);
	for (i = 0; i < RESERVE_NR; i++)
		free_fifo(&ca->free[i]);
	kvpfree(ca->buckets_written,
		BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
	kvpfree(ca->buckets_nouse,
		BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
	kvpfree(rcu_dereference_protected(ca->buckets[0], 1),
		sizeof(struct bucket_array) +
		ca->mi.nbuckets * sizeof(struct bucket));

	free_percpu(ca->usage[0]);
}
int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
{
	if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage)))
		return -ENOMEM;

	return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);
}